From d85ff4953c29faed7b278268fe73e6b57787a8f4 Mon Sep 17 00:00:00 2001 From: Anirban Rahut Date: Wed, 10 May 2017 14:54:35 -0700 Subject: [PATCH] Blob storage pr Summary: The final pull request for Blob Storage. Closes https://github.com/facebook/rocksdb/pull/2269 Differential Revision: D5033189 Pulled By: yiwu-arbug fbshipit-source-id: 6356b683ccd58cbf38a1dc55e2ea400feecd5d06 --- CMakeLists.txt | 5 + Makefile | 5 +- TARGETS | 6 + db/compaction_iterator.cc | 4 +- db/db_impl.cc | 22 + db/db_impl.h | 5 + env/env_posix.cc | 21 +- include/rocksdb/env.h | 2 - src.mk | 7 + tools/db_bench_tool.cc | 32 +- util/file_reader_writer.h | 2 + util/mpsc.h | 158 ++ util/timer_queue.h | 217 ++ util/timer_queue_test.cc | 72 + utilities/blob_db/blob_db.cc | 307 +-- utilities/blob_db/blob_db.h | 183 +- utilities/blob_db/blob_db_impl.cc | 2210 +++++++++++++++++ utilities/blob_db/blob_db_impl.h | 657 +++++ utilities/blob_db/blob_db_options_impl.cc | 66 + utilities/blob_db/blob_db_options_impl.h | 73 + utilities/blob_db/blob_db_test.cc | 547 +++- utilities/blob_db/blob_file.cc | 225 ++ utilities/blob_db/blob_log_format.cc | 313 +++ utilities/blob_db/blob_log_format.h | 226 ++ utilities/blob_db/blob_log_reader.cc | 163 ++ utilities/blob_db/blob_log_reader.h | 93 + utilities/blob_db/blob_log_writer.cc | 172 ++ utilities/blob_db/blob_log_writer.h | 98 + .../optimistic_transaction_db_impl.h | 11 +- 29 files changed, 5687 insertions(+), 215 deletions(-) create mode 100644 util/mpsc.h create mode 100644 util/timer_queue.h create mode 100644 util/timer_queue_test.cc create mode 100644 utilities/blob_db/blob_db_impl.cc create mode 100644 utilities/blob_db/blob_db_impl.h create mode 100644 utilities/blob_db/blob_db_options_impl.cc create mode 100644 utilities/blob_db/blob_db_options_impl.h create mode 100644 utilities/blob_db/blob_file.cc create mode 100644 utilities/blob_db/blob_log_format.cc create mode 100644 utilities/blob_db/blob_log_format.h create mode 100644 utilities/blob_db/blob_log_reader.cc create mode 100644 utilities/blob_db/blob_log_reader.h create mode 100644 utilities/blob_db/blob_log_writer.cc create mode 100644 utilities/blob_db/blob_log_writer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c2cbd19ae..057f6ed7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -446,6 +446,10 @@ set(SOURCES util/xxhash.cc utilities/backupable/backupable_db.cc utilities/blob_db/blob_db.cc + utilities/blob_db/blob_db_impl.cc + utilities/blob_db/blob_log_reader.cc + utilities/blob_db/blob_log_writer.cc + utilities/blob_db/blob_log_format.cc utilities/checkpoint/checkpoint_impl.cc utilities/col_buf_decoder.cc utilities/col_buf_encoder.cc @@ -658,6 +662,7 @@ set(TESTS util/heap_test.cc util/rate_limiter_test.cc util/slice_transform_test.cc + util/timer_queue_test.cc util/thread_list_test.cc util/thread_local_test.cc utilities/backupable/backupable_db_test.cc diff --git a/Makefile b/Makefile index 8d60f90e3..c68ac141f 100644 --- a/Makefile +++ b/Makefile @@ -403,7 +403,6 @@ TESTS = \ ttl_test \ date_tiered_test \ backupable_db_test \ - blob_db_test \ document_db_test \ json_document_test \ sim_cache_test \ @@ -424,6 +423,7 @@ TESTS = \ options_settable_test \ options_util_test \ event_logger_test \ + timer_queue_test \ cuckoo_table_builder_test \ cuckoo_table_reader_test \ cuckoo_table_db_test \ @@ -1307,6 +1307,9 @@ db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +timer_queue_test: 
util/timer_queue_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + sst_dump_test: tools/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index ce532d7d8..8ddeec34f 100644 --- a/TARGETS +++ b/TARGETS @@ -196,6 +196,12 @@ cpp_library( "util/xxhash.cc", "utilities/backupable/backupable_db.cc", "utilities/blob_db/blob_db.cc", + "utilities/blob_db/blob_db_impl.cc", + "utilities/blob_db/blob_db_options_impl.cc", + "utilities/blob_db/blob_file.cc", + "utilities/blob_db/blob_log_reader.cc", + "utilities/blob_db/blob_log_writer.cc", + "utilities/blob_db/blob_log_format.cc", "utilities/checkpoint/checkpoint_impl.cc", "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc", "utilities/convenience/info_log_finder.cc", diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc index e742813cf..4332acb5a 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction_iterator.cc @@ -93,7 +93,9 @@ CompactionIterator::CompactionIterator( latest_snapshot_ = snapshots_->back(); } if (compaction_filter_ != nullptr) { - if (compaction_filter_->IgnoreSnapshots()) ignore_snapshots_ = true; + if (compaction_filter_->IgnoreSnapshots()) { + ignore_snapshots_ = true; + } } else { ignore_snapshots_ = false; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 3ceb3bbcd..348b3b40d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1577,6 +1577,14 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { delete casted_s; } +bool DBImpl::HasActiveSnapshotLaterThanSN(SequenceNumber sn) { + InstrumentedMutexLock l(&mutex_); + if (snapshots_.empty()) { + return false; + } + return (snapshots_.newest()->GetSequenceNumber() > sn); +} + #ifndef ROCKSDB_LITE Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) { @@ -1821,6 +1829,20 @@ ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { return cf_memtables->GetColumnFamilyHandle(); } +// REQUIRED: mutex is NOT held. +ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked( + uint32_t column_family_id) { + ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); + + InstrumentedMutexLock l(&mutex_); + + if (!cf_memtables->Seek(column_family_id)) { + return nullptr; + } + + return cf_memtables->GetColumnFamilyHandle(); +} + void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, const Range& range, uint64_t* const count, diff --git a/db/db_impl.h b/db/db_impl.h index 20c3c0ae6..689ca575f 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -203,6 +203,8 @@ class DBImpl : public DB { virtual SequenceNumber GetLatestSequenceNumber() const override; + bool HasActiveSnapshotLaterThanSN(SequenceNumber sn); + #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; @@ -465,6 +467,9 @@ class DBImpl : public DB { // mutex is released. ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id); + // Same as above, should called without mutex held and not on write thread. + ColumnFamilyHandle* GetColumnFamilyHandleUnlocked(uint32_t column_family_id); + // Returns the number of currently running flushes. // REQUIREMENT: mutex_ must be held when calling this function. 
int num_running_flushes() { diff --git a/env/env_posix.cc b/env/env_posix.cc index b0befba8c..7d726176a 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -253,13 +253,14 @@ class PosixEnv : public Env { return s; } - virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + virtual Status OpenWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options, + bool reopen = false) { result->reset(); Status s; int fd = -1; - int flags = O_CREAT | O_TRUNC; + int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC); // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX) if (options.use_direct_writes && !options.use_mmap_writes) { // Note: we should avoid O_APPEND here due to ta the following bug: @@ -333,6 +334,18 @@ class PosixEnv : public Env { return s; } + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) override { + return OpenWritableFile(fname, result, options, false); + } + + virtual Status ReopenWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) override { + return OpenWritableFile(fname, result, options, true); + } + virtual Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, unique_ptr* result, diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 9af0261fa..ad59dd1a0 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -468,8 +468,6 @@ class SequentialFile { // aligned buffer for Direct I/O virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } - virtual void Rewind() {} - // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. diff --git a/src.mk b/src.mk index 4a570d1bd..677b18b18 100644 --- a/src.mk +++ b/src.mk @@ -150,6 +150,12 @@ LIB_SOURCES = \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_db.cc \ + utilities/blob_db/blob_db_impl.cc \ + utilities/blob_db/blob_db_options_impl.cc \ + utilities/blob_db/blob_file.cc \ + utilities/blob_db/blob_log_reader.cc \ + utilities/blob_db/blob_log_writer.cc \ + utilities/blob_db/blob_log_format.cc \ utilities/checkpoint/checkpoint_impl.cc \ utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc \ utilities/convenience/info_log_finder.cc \ @@ -308,6 +314,7 @@ MAIN_SOURCES = \ util/log_write_bench.cc \ util/rate_limiter_test.cc \ util/slice_transform_test.cc \ + util/timer_queue_test.cc \ util/thread_list_test.cc \ util/thread_local_test.cc \ utilities/backupable/backupable_db_test.cc \ diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 029747271..195c1c660 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -583,6 +583,10 @@ DEFINE_bool(optimistic_transaction_db, false, "Open a OptimisticTransactionDB instance. " "Required for randomtransaction benchmark."); +DEFINE_bool(use_blob_db, false, + "Open a BlobDB instance. " + "Required for largevalue benchmark."); + DEFINE_bool(transaction_db, false, "Open a TransactionDB instance. " "Required for randomtransaction benchmark."); @@ -630,8 +634,6 @@ DEFINE_bool(report_bg_io_stats, false, DEFINE_bool(use_stderr_info_logger, false, "Write info logs to stderr instead of to LOG file. "); -DEFINE_bool(use_blob_db, false, "Whether to use BlobDB. 
"); - static enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -1128,6 +1130,15 @@ class RandomGenerator { pos_ += len; return Slice(data_.data() + pos_ - len, len); } + + Slice GenerateWithTTL(unsigned int len) { + assert(len <= data_.size()); + if (pos_ + len > data_.size()) { + pos_ = 0; + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } }; static void AppendWithSpace(std::string* str, Slice msg) { @@ -3227,9 +3238,14 @@ void VerifyDBFromDB(std::string& truth_db_name) { if (s.ok()) { db->db = ptr; } -#endif // ROCKSDB_LITE } else if (FLAGS_use_blob_db) { - s = NewBlobDB(options, db_name, &db->db); + blob_db::BlobDBOptions blob_db_options; + blob_db::BlobDB* ptr; + s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr); + if (s.ok()) { + db->db = ptr; + } +#endif // ROCKSDB_LITE } else { s = DB::Open(options, db_name, &db->db); } @@ -3406,8 +3422,12 @@ void VerifyDBFromDB(std::string& truth_db_name) { int64_t rand_num = key_gens[id]->Next(); GenerateKeyFromInt(rand_num, FLAGS_num, &key); if (FLAGS_use_blob_db) { - s = db_with_cfh->db->Put(write_options_, key, - gen.Generate(value_size_)); + Slice val = gen.Generate(value_size_); + int ttl = rand() % 86400; + blob_db::BlobDB* blobdb = + static_cast(db_with_cfh->db); + s = blobdb->PutWithTTL(write_options_, key, val, ttl); + } else if (FLAGS_num_column_families <= 1) { batch.Put(key, gen.Generate(value_size_)); } else { diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 8204b4303..8de41a203 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -48,6 +48,8 @@ class SequentialFileReader { Status Skip(uint64_t n); + void Rewind(); + SequentialFile* file() { return file_.get(); } bool use_direct_io() const { return file_->use_direct_io(); } diff --git a/util/mpsc.h b/util/mpsc.h new file mode 100644 index 000000000..b81492738 --- /dev/null +++ b/util/mpsc.h @@ -0,0 +1,158 @@ +// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Large parts of this file is borrowed from the public domain code below. +// from https://github.com/mstump/queues + +// C++ implementation of Dmitry Vyukov's non-intrusive +// lock free unbound MPSC queue +// http://www.1024cores.net/home/ +// lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue + +// License from mstump/queues +// This is free and unencumbered software released into the public domain. +// +// Anyone is free to copy, modify, publish, use, compile, sell, or +// distribute this software, either in source code form or as a compiled +// binary, for any purpose, commercial or non-commercial, and by any +// means. +// +// In jurisdictions that recognize copyright laws, the author or authors +// of this software dedicate any and all copyright interest in the +// software to the public domain. We make this dedication for the benefit +// of the public at large and to the detriment of our heirs and +// successors. We intend this dedication to be an overt act of +// relinquishment in perpetuity of all present and future rights to this +// software under copyright law. 
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// For more information, please refer to <http://unlicense.org>
+
+// License from http://www.1024cores.net/home/
+// lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue
+// Copyright (c) 2010-2011 Dmitry Vyukov. All rights reserved.
+// Redistribution and use in source and binary forms, with or
+// without modification, are permitted provided that the following
+// conditions are met:
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY DMITRY VYUKOV "AS IS" AND ANY EXPRESS OR
+// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL DMITRY VYUKOV OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// The views and conclusions contained in the software and documentation
+// are those of the authors and should not be interpreted as representing
+// official policies, either expressed or implied, of Dmitry Vyukov.
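+//
+// Usage sketch (illustrative; `q` and `v` are placeholder names): any
+// number of threads may call enqueue() concurrently, but only one thread
+// may call dequeue() or pop_all().
+//
+//   mpsc_queue_t<int> q;
+//   q.enqueue(42);           // producers: any thread
+//   int v;
+//   while (q.dequeue(&v)) {  // consumer: one thread only
+//   }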
+//
+
+#ifndef UTIL_MPSC_H_
+#define UTIL_MPSC_H_
+
+#include <atomic>
+#include <cassert>
+#include <type_traits>
+
+/**
+ * Multiple Producer Single Consumer Lockless Q
+ */
+template <typename T>
+class mpsc_queue_t {
+ public:
+  struct buffer_node_t {
+    T data;
+    std::atomic<buffer_node_t*> next;
+  };
+
+  mpsc_queue_t() {
+    buffer_node_aligned_t* al_st = new buffer_node_aligned_t;
+    buffer_node_t* node = new (al_st) buffer_node_t();
+    _head.store(node);
+    _tail.store(node);
+
+    node->next.store(nullptr, std::memory_order_relaxed);
+  }
+
+  ~mpsc_queue_t() {
+    T output;
+    while (this->dequeue(&output)) {
+    }
+    buffer_node_t* front = _head.load(std::memory_order_relaxed);
+    front->~buffer_node_t();
+
+    ::operator delete(front);
+  }
+
+  void enqueue(const T& input) {
+    buffer_node_aligned_t* al_st = new buffer_node_aligned_t;
+    buffer_node_t* node = new (al_st) buffer_node_t();
+
+    node->data = input;
+    node->next.store(nullptr, std::memory_order_relaxed);
+
+    buffer_node_t* prev_head = _head.exchange(node, std::memory_order_acq_rel);
+    prev_head->next.store(node, std::memory_order_release);
+  }
+
+  bool dequeue(T* output) {
+    buffer_node_t* tail = _tail.load(std::memory_order_relaxed);
+    buffer_node_t* next = tail->next.load(std::memory_order_acquire);
+
+    if (next == nullptr) {
+      return false;
+    }
+
+    *output = next->data;
+    _tail.store(next, std::memory_order_release);
+
+    tail->~buffer_node_t();
+
+    ::operator delete(tail);
+    return true;
+  }
+
+  // you can only use pop_all if the queue is SPSC
+  buffer_node_t* pop_all() {
+    // nobody else can move the tail pointer.
+    buffer_node_t* tptr = _tail.load(std::memory_order_relaxed);
+    buffer_node_t* next =
+        tptr->next.exchange(nullptr, std::memory_order_acquire);
+    _head.exchange(tptr, std::memory_order_acquire);
+
+    // there is a race condition here
+    return next;
+  }
+
+ private:
+  typedef typename std::aligned_storage<
+      sizeof(buffer_node_t), std::alignment_of<buffer_node_t>::value>::type
+      buffer_node_aligned_t;
+
+  std::atomic<buffer_node_t*> _head;
+  std::atomic<buffer_node_t*> _tail;
+
+  mpsc_queue_t(const mpsc_queue_t&) = delete;
+  mpsc_queue_t& operator=(const mpsc_queue_t&) = delete;
+};
+
+#endif  // UTIL_MPSC_H_
diff --git a/util/timer_queue.h b/util/timer_queue.h
new file mode 100644
index 000000000..72b44dc2d
--- /dev/null
+++ b/util/timer_queue.h
@@ -0,0 +1,217 @@
+// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+// Timer Queue
+//
+// License
+//
+// The source code in this article is licensed under the CC0 license, so feel
+// free to copy, modify, share, do whatever you want with it.
+// No attribution is required, but I'll be happy if you do.
+// CC0 license
+
+// The person who associated a work with this deed has dedicated the work to
+// the public domain by waiving all of his or her rights to the work worldwide
+// under copyright law, including all related and neighboring rights, to the
+// extent allowed by law. You can copy, modify, distribute and perform the
+// work, even for commercial purposes, all without asking permission.
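+//
+// Usage sketch (illustrative; `tq` is a placeholder name). A handler
+// returns {reschedule, next_period_ms}; returning -1 as the period keeps
+// the previous one.
+//
+//   TimerQueue tq;
+//   tq.add(1000, [](bool aborted) {
+//     // runs once per second until cancelled or the queue is destroyed
+//     return std::make_pair(!aborted, int64_t(1000));
+//   });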
+
+#pragma once
+#include <assert.h>
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <utility>
+
+// Allows execution of handlers at a specified time in the future
+// Guarantees:
+//  - All handlers are executed ONCE, even if cancelled (aborted parameter
+//    will be set to true)
+//  - If TimerQueue is destroyed, it will cancel all handlers.
+//  - Handlers are ALWAYS executed in the Timer Queue worker thread.
+//  - Handlers execution order is NOT guaranteed
+//
+////////////////////////////////////////////////////////////////////////////////
+// borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+class TimerQueue {
+ public:
+  TimerQueue() : m_th(&TimerQueue::run, this) {}
+
+  ~TimerQueue() {
+    cancelAll();
+    // Abusing the timer queue to trigger the shutdown.
+    add(0, [this](bool) {
+      m_finish = true;
+      return std::make_pair(false, 0);
+    });
+    m_th.join();
+  }
+
+  // Adds a new timer
+  // \return
+  //  Returns the ID of the new timer. You can use this ID to cancel the
+  //  timer
+  uint64_t add(int64_t milliseconds,
+               std::function<std::pair<bool, int64_t>(bool)> handler) {
+    WorkItem item;
+    Clock::time_point tp = Clock::now();
+    item.end = tp + std::chrono::milliseconds(milliseconds);
+    item.period = milliseconds;
+    item.handler = std::move(handler);
+
+    std::unique_lock<std::mutex> lk(m_mtx);
+    uint64_t id = ++m_idcounter;
+    item.id = id;
+    m_items.push(std::move(item));
+
+    // Something changed, so wake up timer thread
+    m_checkWork.notify_one();
+    return id;
+  }
+
+  // Cancels the specified timer
+  // \return
+  //  1 if the timer was cancelled.
+  //  0 if you were too late to cancel (or the timer ID was never valid to
+  //  start with)
+  size_t cancel(uint64_t id) {
+    // Instead of removing the item from the container (thus breaking the
+    // heap integrity), we set the item as having no handler, and put
+    // that handler on a new item at the top for immediate execution
+    // The timer thread will then ignore the original item, since it has no
+    // handler.
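+    //
+    // In other words, a cancelled timer goes from
+    //   {end: T, id: N, handler: h}    (left in the heap, now inert)
+    // to a new top-of-heap item
+    //   {end: 0, id: 0, handler: h}
+    // so h still runs exactly once, with aborted == true (since id == 0).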
+    std::unique_lock<std::mutex> lk(m_mtx);
+    for (auto&& item : m_items.getContainer()) {
+      if (item.id == id && item.handler) {
+        WorkItem newItem;
+        // Zero time, so it stays at the top for immediate execution
+        newItem.end = Clock::time_point();
+        newItem.id = 0;  // Means it is a canceled item
+        // Move the handler from item to newitem (thus clearing item)
+        newItem.handler = std::move(item.handler);
+        m_items.push(std::move(newItem));
+
+        // Something changed, so wake up timer thread
+        m_checkWork.notify_one();
+        return 1;
+      }
+    }
+    return 0;
+  }
+
+  // Cancels all timers
+  // \return
+  //  The number of timers cancelled
+  size_t cancelAll() {
+    // Setting all "end" to 0 (for immediate execution) is ok,
+    // since it maintains the heap integrity
+    std::unique_lock<std::mutex> lk(m_mtx);
+    m_cancel = true;
+    for (auto&& item : m_items.getContainer()) {
+      if (item.id && item.handler) {
+        item.end = Clock::time_point();
+        item.id = 0;
+      }
+    }
+    auto ret = m_items.size();
+
+    m_checkWork.notify_one();
+    return ret;
+  }
+
+ private:
+  using Clock = std::chrono::steady_clock;
+  TimerQueue(const TimerQueue&) = delete;
+  TimerQueue& operator=(const TimerQueue&) = delete;
+
+  void run() {
+    std::unique_lock<std::mutex> lk(m_mtx);
+    while (!m_finish) {
+      auto end = calcWaitTime_lock();
+      if (end.first) {
+        // Timers found, so wait until it expires (or something else
+        // changes)
+        m_checkWork.wait_until(lk, end.second);
+      } else {
+        // No timers exist, so wait forever until something changes
+        m_checkWork.wait(lk);
+      }
+
+      // Check and execute as much work as possible, such as, all expired
+      // timers
+      checkWork(&lk);
+    }
+
+    // If we are shutting down, we should not have any items left,
+    // since the shutdown cancels all items
+    assert(m_items.size() == 0);
+  }
+
+  std::pair<bool, Clock::time_point> calcWaitTime_lock() {
+    while (m_items.size()) {
+      if (m_items.top().handler) {
+        // Item present, so return the new wait time
+        return std::make_pair(true, m_items.top().end);
+      } else {
+        // Discard empty handlers (they were cancelled)
+        m_items.pop();
+      }
+    }
+
+    // No items found, so return no wait time (causes the thread to wait
+    // indefinitely)
+    return std::make_pair(false, Clock::time_point());
+  }
+
+  void checkWork(std::unique_lock<std::mutex>* lk) {
+    while (m_items.size() && m_items.top().end <= Clock::now()) {
+      WorkItem item(m_items.top());
+      m_items.pop();
+
+      if (item.handler) {
+        (*lk).unlock();
+        auto reschedule_pair = item.handler(item.id == 0);
+        (*lk).lock();
+        if (!m_cancel && reschedule_pair.first) {
+          int64_t new_period = (reschedule_pair.second == -1)
+                                   ? item.period
+                                   : reschedule_pair.second;
+
+          item.period = new_period;
+          item.end = Clock::now() + std::chrono::milliseconds(new_period);
+          m_items.push(std::move(item));
+        }
+      }
+    }
+  }
+
+  bool m_finish = false;
+  bool m_cancel = false;
+  uint64_t m_idcounter = 0;
+  std::condition_variable m_checkWork;
+
+  struct WorkItem {
+    Clock::time_point end;
+    int64_t period;
+    uint64_t id;  // id==0 means it was cancelled
+    std::function<std::pair<bool, int64_t>(bool)> handler;
+    bool operator>(const WorkItem& other) const { return end > other.end; }
+  };
+
+  std::mutex m_mtx;
+  // Inheriting from priority_queue, so we can access the internal container
+  class Queue : public std::priority_queue<WorkItem, std::vector<WorkItem>,
+                                           std::greater<WorkItem>> {
+   public:
+    std::vector<WorkItem>& getContainer() { return this->c; }
+  } m_items;
+  std::thread m_th;
+};
diff --git a/util/timer_queue_test.cc b/util/timer_queue_test.cc
new file mode 100644
index 000000000..e0c545d0d
--- /dev/null
+++ b/util/timer_queue_test.cc
@@ -0,0 +1,72 @@
+// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+// borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+// Timer Queue
+//
+// License
+//
+// The source code in this article is licensed under the CC0 license, so feel
+// free to copy, modify, share, do whatever you want with it.
+// No attribution is required, but I'll be happy if you do.
+// CC0 license
+
+// The person who associated a work with this deed has dedicated the work to
+// the public domain by waiving all of his or her rights to the work worldwide
+// under copyright law, including all related and neighboring rights, to the
+// extent allowed by law. You can copy, modify, distribute and perform the
+// work, even for commercial purposes, all without asking permission. See
+// Other Information below.
+//
+
+#include "util/timer_queue.h"
+#include <cstdio>
+
+namespace Timing {
+
+using Clock = std::chrono::high_resolution_clock;
+double now() {
+  static auto start = Clock::now();
+  return std::chrono::duration<double, std::milli>(Clock::now() - start)
+      .count();
+}
+
+}  // namespace Timing
+
+int main() {
+  TimerQueue q;
+
+  double tnow = Timing::now();
+
+  q.add(10000, [tnow](bool aborted) mutable {
+    printf("T 1: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(false, 0);
+  });
+  q.add(10001, [tnow](bool aborted) mutable {
+    printf("T 2: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(false, 0);
+  });
+
+  q.add(1000, [tnow](bool aborted) mutable {
+    printf("T 3: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(!aborted, 1000);
+  });
+
+  auto id = q.add(2000, [tnow](bool aborted) mutable {
+    printf("T 4: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(!aborted, 2000);
+  });
+
+  (void)id;
+  // auto ret = q.cancel(id);
+  // assert(ret == 1);
+  // q.cancelAll();
+
+  return 0;
+}
+//////////////////////////////////////////
diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc
index dcc9b5e3f..b3ef96bf9 100644
--- a/utilities/blob_db/blob_db.cc
+++ b/utilities/blob_db/blob_db.cc
@@ -1,12 +1,15 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
See the AUTHORS file for names of contributors. -#include "utilities/blob_db/blob_db.h" - +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// #ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_db.h" #include "db/write_batch_internal.h" #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" +#include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -17,194 +20,152 @@ #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/filename.h" +#include "utilities/blob_db/blob_db_impl.h" namespace rocksdb { -namespace { -int kBlockBasedTableVersionFormat = 2; -} // namespace - -class BlobDB : public StackableDB { - public: - using rocksdb::StackableDB::Put; - Status Put(const WriteOptions& options, const Slice& key, - const Slice& value) override; - - using rocksdb::StackableDB::Get; - Status Get(const ReadOptions& options, const Slice& key, - std::string* value) override; - - Status Open(); - - explicit BlobDB(DB* db); - - private: - std::string dbname_; - ImmutableCFOptions ioptions_; - InstrumentedMutex mutex_; - std::unique_ptr file_reader_; - std::unique_ptr file_writer_; - size_t writer_offset_; - size_t next_sync_offset_; - - static const std::string kFileName; - static const size_t kBlockHeaderSize; - static const size_t kBytesPerSync; -}; - -Status NewBlobDB(Options options, std::string dbname, DB** blob_db) { - DB* db; - Status s = DB::Open(options, dbname, &db); - if (!s.ok()) { - return s; - } - BlobDB* bdb = new BlobDB(db); - s = bdb->Open(); - if (!s.ok()) { - delete bdb; +namespace blob_db { +port::Mutex listener_mutex; +typedef std::shared_ptr FlushBeginListener_t; +typedef std::shared_ptr ReconcileWalFilter_t; +typedef std::shared_ptr + CompactionListener_t; + +// to ensure the lifetime of the listeners +std::vector> all_blobdb_listeners; +std::vector all_wal_filters; + +Status BlobDB::OpenAndLoad(const Options& options, + const BlobDBOptions& bdb_options, + const std::string& dbname, BlobDB** blob_db, + Options* changed_options) { + *changed_options = options; + *blob_db = nullptr; + + FlushBeginListener_t fblistener = + std::make_shared(); + ReconcileWalFilter_t rw_filter = std::make_shared(); + CompactionListener_t ce_listener = + std::make_shared(); + + { + MutexLock l(&listener_mutex); + all_blobdb_listeners.push_back(fblistener); + all_blobdb_listeners.push_back(ce_listener); + all_wal_filters.push_back(rw_filter); } + + changed_options->listeners.emplace_back(fblistener); + changed_options->listeners.emplace_back(ce_listener); + changed_options->wal_filter = rw_filter.get(); + + DBOptions db_options(*changed_options); + + // we need to open blob db first so that recovery can happen + BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options); + + fblistener->SetImplPtr(bdb); + ce_listener->SetImplPtr(bdb); + rw_filter->SetImplPtr(bdb); + + Status s = bdb->OpenPhase1(); + if (!s.ok()) return s; + *blob_db = bdb; return s; } -const std::string BlobDB::kFileName = "blob_log"; -const size_t BlobDB::kBlockHeaderSize = 8; -const size_t BlobDB::kBytesPerSync = 1024 * 1024 * 128; - -BlobDB::BlobDB(DB* db) - : StackableDB(db), - ioptions_(db->GetOptions()), - writer_offset_(0), - next_sync_offset_(kBytesPerSync) {} 
- -Status BlobDB::Open() { - unique_ptr wfile; - EnvOptions env_options(db_->GetOptions()); - Status s = ioptions_.env->NewWritableFile(db_->GetName() + "/" + kFileName, - &wfile, env_options); - if (!s.ok()) { - return s; +Status BlobDB::Open(const Options& options, const BlobDBOptions& bdb_options, + const std::string& dbname, BlobDB** blob_db) { + *blob_db = nullptr; + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = BlobDB::Open(db_options, bdb_options, dbname, column_families, + &handles, blob_db); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a reference to + // default column family + delete handles[0]; } - file_writer_.reset(new WritableFileWriter(std::move(wfile), env_options)); - - // Write version - std::string version; - PutFixed64(&version, 0); - s = file_writer_->Append(Slice(version)); - if (!s.ok()) { - return s; - } - writer_offset_ += version.size(); - - std::unique_ptr rfile; - s = ioptions_.env->NewRandomAccessFile(db_->GetName() + "/" + kFileName, - &rfile, env_options); - if (!s.ok()) { - return s; - } - file_reader_.reset(new RandomAccessFileReader(std::move(rfile))); return s; } -Status BlobDB::Put(const WriteOptions& options, const Slice& key, - const Slice& value) { - BlockBuilder block_builder(1, false); - block_builder.Add(key, value); - - CompressionType compression = CompressionType::kLZ4Compression; - CompressionOptions compression_opts; - - Slice block_contents; - std::string compression_output; +Status BlobDB::Open(const DBOptions& db_options, + const BlobDBOptions& bdb_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, BlobDB** blob_db, + bool no_base_db) { + *blob_db = nullptr; - block_contents = CompressBlock(block_builder.Finish(), compression_opts, - &compression, kBlockBasedTableVersionFormat, - Slice() /* dictionary */, &compression_output); + DBOptions my_db_options(db_options); + FlushBeginListener_t fblistener = + std::make_shared(); + CompactionListener_t ce_listener = + std::make_shared(); + ReconcileWalFilter_t rw_filter = std::make_shared(); - char header[kBlockHeaderSize]; - char trailer[kBlockTrailerSize]; - trailer[0] = compression; - auto crc = crc32c::Value(block_contents.data(), block_contents.size()); - crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type - EncodeFixed32(trailer + 1, crc32c::Mask(crc)); + my_db_options.listeners.emplace_back(fblistener); + my_db_options.listeners.emplace_back(ce_listener); + my_db_options.wal_filter = rw_filter.get(); - BlockHandle handle; - std::string index_entry; - Status s; { - InstrumentedMutexLock l(&mutex_); - auto raw_block_size = block_contents.size(); - EncodeFixed64(header, raw_block_size); - s = file_writer_->Append(Slice(header, kBlockHeaderSize)); - writer_offset_ += kBlockHeaderSize; - if (s.ok()) { - handle.set_offset(writer_offset_); - handle.set_size(raw_block_size); - s = file_writer_->Append(block_contents); - } - if (s.ok()) { - s = file_writer_->Append(Slice(trailer, kBlockTrailerSize)); - } - if (s.ok()) { - s = file_writer_->Flush(); - } - if (s.ok() && writer_offset_ > next_sync_offset_) { - // Sync every kBytesPerSync. This is a hacky way to limit unsynced data. 
- next_sync_offset_ += kBytesPerSync; - s = file_writer_->Sync(db_->GetOptions().use_fsync); - } - if (s.ok()) { - writer_offset_ += block_contents.size() + kBlockTrailerSize; - // Put file number - PutVarint64(&index_entry, 0); - handle.EncodeTo(&index_entry); - s = db_->Put(options, key, index_entry); - } + MutexLock l(&listener_mutex); + all_blobdb_listeners.push_back(fblistener); + all_blobdb_listeners.push_back(ce_listener); + all_wal_filters.push_back(rw_filter); } - return s; -} -Status BlobDB::Get(const ReadOptions& options, const Slice& key, - std::string* value) { - Status s; - std::string index_entry; - s = db_->Get(options, key, &index_entry); - if (!s.ok()) { - return s; - } - BlockHandle handle; - Slice index_entry_slice(index_entry); - uint64_t file_number; - if (!GetVarint64(&index_entry_slice, &file_number)) { - return Status::Corruption(); - } - assert(file_number == 0); - s = handle.DecodeFrom(&index_entry_slice); - if (!s.ok()) { - return s; - } - Footer footer(0, kBlockBasedTableVersionFormat); - BlockContents contents; - s = ReadBlockContents(file_reader_.get(), footer, options, handle, &contents, - ioptions_); + // we need to open blob db first so that recovery can happen + BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, my_db_options); + fblistener->SetImplPtr(bdb); + ce_listener->SetImplPtr(bdb); + rw_filter->SetImplPtr(bdb); + + Status s = bdb->OpenPhase1(); + if (!s.ok()) return s; + + if (no_base_db) return s; + + DB* db = nullptr; + s = DB::Open(my_db_options, dbname, column_families, handles, &db); + if (!s.ok()) return s; + + // set the implementation pointer + s = bdb->LinkToBaseDB(db); if (!s.ok()) { - return s; - } - Block block(std::move(contents), kDisableGlobalSequenceNumber); - BlockIter bit; - InternalIterator* it = block.NewIterator(nullptr, &bit); - it->SeekToFirst(); - if (!it->status().ok()) { - return it->status(); + delete bdb; + bdb = nullptr; } - *value = it->value().ToString(); + *blob_db = bdb; return s; } + +BlobDB::BlobDB(DB* db) : StackableDB(db) {} + +//////////////////////////////////////////////////////////////////////////////// +// +// +// std::function fnCaller = +// std::bind(&A::fn, &anInstance, std::placeholders::_1); +//////////////////////////////////////////////////////////////////////////////// +BlobDBOptions::BlobDBOptions() + : blob_dir("blob_dir"), + path_relative(true), + is_fifo(false), + blob_dir_size(1000ULL * 1024ULL * 1024ULL * 1024ULL), + ttl_range_secs(3600), + min_blob_size(512), + bytes_per_sync(0), + blob_file_size(256 * 1024 * 1024), + num_concurrent_simple_blobs(4), + default_ttl_extractor(false), + compression(kNoCompression) {} + +} // namespace blob_db } // namespace rocksdb -#else -namespace rocksdb { -Status NewBlobDB(Options options, std::string dbname, DB** blob_db) { - return Status::NotSupported(); -} -} // namespace rocksdb -#endif // ROCKSDB_LITE +#endif diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h index 43111fa0e..fea8063a4 100644 --- a/utilities/blob_db/blob_db.h +++ b/utilities/blob_db/blob_db.h @@ -7,12 +7,19 @@ #pragma once +#ifndef ROCKSDB_LITE + +#include #include +#include #include "rocksdb/db.h" #include "rocksdb/status.h" +#include "rocksdb/utilities/stackable_db.h" namespace rocksdb { -// EXPERIMENAL ONLY + +namespace blob_db { + // A wrapped database which puts values of KV pairs in a separate log // and store location to the log in the underlying DB. // It lacks lots of importatant functionalities, e.g. 
DB restarts, @@ -20,5 +27,177 @@ namespace rocksdb { // // The factory needs to be moved to include/rocksdb/utilities to allow // users to use blob DB. -extern Status NewBlobDB(Options options, std::string dbname, DB** blob_db); + +struct BlobDBOptions { + // name of the directory under main db, where blobs will be stored. + // default is "blob_dir" + std::string blob_dir; + + // whether the blob_dir path is relative or absolute. + bool path_relative; + + // is the eviction strategy fifo based + bool is_fifo; + + // maximum size of the blob dir. Once this gets used, up + // evict the blob file which is oldest (is_fifo ) + // 0 means no limits + uint64_t blob_dir_size; + + // a new bucket is opened, for ttl_range. So if ttl_range is 600seconds + // (10 minutes), and the first bucket starts at 1471542000 + // then the blob buckets will be + // first bucket is 1471542000 - 1471542600 + // second bucket is 1471542600 - 1471543200 + // and so on + uint32_t ttl_range_secs; + + // at what size will the blobs be stored in separate log rather than + // inline + uint64_t min_blob_size; + + // at what bytes will the blob files be synced to blob log. + uint64_t bytes_per_sync; + + // the target size of each blob file. File will become immutable + // after it exceeds that size + uint64_t blob_file_size; + + // how many files to use for simple blobs at one time + uint32_t num_concurrent_simple_blobs; + + // this function is to be provided by client if they intend to + // use Put API to provide TTL. + // the first argument is the value in the Put API + // in case you want to do some modifications to the value, + // return a new Slice in the second. + // otherwise just copy the input value into output. + // the ttl should be extracted and returned in last pointer. + // otherwise assign it to -1 + std::function extract_ttl_fn; + + // eviction callback. + // this function will be called for every blob that is getting + // evicted. 
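+  // Sketch of installing the callback (hypothetical names; the exact
+  // argument list is whatever the std::function signature below declares):
+  //   bdb_options.gc_evict_cb_fn =
+  //       [&](const ColumnFamilyHandle* cfh, const Slice& key,
+  //           const Slice& value) { ++evicted_blob_count; };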
+ std::function + gc_evict_cb_fn; + + // default ttl extactor + bool default_ttl_extractor; + + // what compression to use for Blob's + CompressionType compression; + + // default constructor + BlobDBOptions(); + + BlobDBOptions(const BlobDBOptions& in) = default; + + virtual ~BlobDBOptions() = default; +}; + +class BlobDB : public StackableDB { + public: + // the suffix to a blob value to represent "ttl:TTLVAL" + static const uint64_t kTTLSuffixLength = 8; + + public: + using rocksdb::StackableDB::Put; + + // This function needs to be called before destroying + // the base DB + static Status DestroyBlobDB(const std::string& dbname, const Options& options, + const BlobDBOptions& bdb_options); + + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override = 0; + + using rocksdb::StackableDB::Delete; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) override = 0; + + virtual Status PutWithTTL(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, int32_t ttl) = 0; + + virtual Status PutWithTTL(const WriteOptions& options, const Slice& key, + const Slice& value, int32_t ttl) { + return PutWithTTL(options, DefaultColumnFamily(), key, value, ttl); + } + + virtual Status PutUntil(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, int32_t expiration) = 0; + + virtual Status PutUntil(const WriteOptions& options, const Slice& key, + const Slice& value, int32_t expiration) { + return PutUntil(options, DefaultColumnFamily(), key, value, expiration); + } + + using rocksdb::StackableDB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) override = 0; + + using rocksdb::StackableDB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override = 0; + + using rocksdb::StackableDB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override = 0; + + using rocksdb::StackableDB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + return Status::NotSupported("Not supported operation in blob db."); + } + + virtual Status Write(const WriteOptions& opts, + WriteBatch* updates) override = 0; + + // Starting point for opening a Blob DB. + // changed_options - critical. Blob DB loads and inserts listeners + // into options which are necessary for recovery and atomicity + // Use this pattern if you need control on step 2, i.e. your + // BaseDB is not just a simple rocksdb but a stacked DB + // 1. ::OpenAndLoad + // 2. Open Base DB with the changed_options + // 3. ::LinkToBaseDB + static Status OpenAndLoad(const Options& options, + const BlobDBOptions& bdb_options, + const std::string& dbname, BlobDB** blob_db, + Options* changed_options); + + // This is another way to open BLOB DB which do not have other + // Stackable DB's in play + // Steps. + // 1. 
::Open + static Status Open(const Options& options, const BlobDBOptions& bdb_options, + const std::string& dbname, BlobDB** blob_db); + + static Status Open(const DBOptions& db_options, + const BlobDBOptions& bdb_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + BlobDB** blob_db, bool no_base_db = false); + + virtual ~BlobDB() {} + + virtual Status LinkToBaseDB(DB* db_base) = 0; + + protected: + explicit BlobDB(DB* db); +}; + +} // namespace blob_db } // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc new file mode 100644 index 000000000..72c5d0e7e --- /dev/null +++ b/utilities/blob_db/blob_db_impl.cc @@ -0,0 +1,2210 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_db_impl.h" +#include +#include +#include +#include +#include +#include +#include + +#include "db/db_impl.h" +#include "db/write_batch_internal.h" +#include "monitoring/instrumented_mutex.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/transaction.h" +#include "table/block.h" +#include "table/block_based_table_builder.h" +#include "table/block_builder.h" +#include "table/meta_blocks.h" +#include "util/crc32c.h" +#include "util/file_reader_writer.h" +#include "util/filename.h" +#include "util/random.h" +#include "util/timer_queue.h" +#include "utilities/transactions/optimistic_transaction_db_impl.h" +#include "utilities/transactions/optimistic_transaction_impl.h" + +namespace { +int kBlockBasedTableVersionFormat = 2; + +void extendTTL(rocksdb::blob_db::ttlrange_t* ttl_range, uint32_t ttl) { + ttl_range->first = std::min(ttl_range->first, ttl); + ttl_range->second = std::max(ttl_range->second, ttl); +} + +void extendTimestamps(rocksdb::blob_db::tsrange_t* ts_range, uint64_t ts) { + ts_range->first = std::min(ts_range->first, ts); + ts_range->second = std::max(ts_range->second, ts); +} + +void extendSN(rocksdb::blob_db::snrange_t* sn_range, + rocksdb::SequenceNumber sn) { + sn_range->first = std::min(sn_range->first, sn); + sn_range->second = std::max(sn_range->second, sn); +} +} // end namespace + +namespace rocksdb { + +namespace blob_db { + +struct GCStats { + uint64_t blob_count; + uint64_t num_deletes; + uint64_t deleted_size; + uint64_t num_relocs; + uint64_t succ_deletes_lsm; + uint64_t succ_relocs; + std::shared_ptr newfile; + GCStats() + : blob_count(0), + num_deletes(0), + deleted_size(0), + num_relocs(0), + succ_deletes_lsm(0), + succ_relocs(0) {} +}; + +// BlobHandle is a pointer to the blob that is stored in the LSM +class BlobHandle { + public: + BlobHandle() + : file_number_(std::numeric_limits::max()), + offset_(std::numeric_limits::max()), + size_(std::numeric_limits::max()), + compression_(kNoCompression) {} + + uint64_t filenumber() const { return file_number_; } + void set_filenumber(uint64_t fn) { file_number_ = fn; } + + // The offset of the block in the file. 
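+  // (The offset, together with the file number and size, is what EncodeTo()
+  // below serializes: three varint64s followed by one CompressionType byte.)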
+ uint64_t offset() const { return offset_; } + void set_offset(uint64_t _offset) { offset_ = _offset; } + + // The size of the stored block + uint64_t size() const { return size_; } + void set_size(uint64_t _size) { size_ = _size; } + + CompressionType compression() const { return compression_; } + void set_compression(CompressionType t) { compression_ = t; } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + + void clear(); + + private: + uint64_t file_number_; + uint64_t offset_; + uint64_t size_; + CompressionType compression_; +}; + +void BlobHandle::EncodeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != std::numeric_limits::max()); + assert(size_ != std::numeric_limits::max()); + assert(file_number_ != std::numeric_limits::max()); + + dst->reserve(30); + PutVarint64(dst, file_number_); + PutVarint64(dst, offset_); + PutVarint64(dst, size_); + dst->push_back(static_cast(compression_)); +} + +void BlobHandle::clear() { + file_number_ = std::numeric_limits::max(); + offset_ = std::numeric_limits::max(); + size_ = std::numeric_limits::max(); + compression_ = kNoCompression; +} + +Status BlobHandle::DecodeFrom(Slice* input) { + if (GetVarint64(input, &file_number_) && GetVarint64(input, &offset_) && + GetVarint64(input, &size_)) { + compression_ = static_cast(input->data()[0]); + return Status::OK(); + } else { + clear(); + return Status::Corruption("bad blob handle"); + } +} + +Random blob_rgen(static_cast(time(nullptr))); + +void BlobDBFlushBeginListener::OnFlushBegin(DB* db, const FlushJobInfo& info) { + if (impl_) impl_->OnFlushBeginHandler(db, info); +} + +WalFilter::WalProcessingOption BlobReconcileWalFilter::LogRecordFound( + unsigned long long log_number, const std::string& log_file_name, + const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) { + return WalFilter::WalProcessingOption::kContinueProcessing; +} + +bool blobf_compare_ttl::operator()(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) const { + if (lhs->ttl_range_.first < rhs->ttl_range_.first) return true; + + if (lhs->ttl_range_.first > rhs->ttl_range_.first) return false; + + return lhs->BlobFileNumber() > rhs->BlobFileNumber(); +} + +void EvictAllVersionsCompactionListener::InternalListener::OnCompaction( + int level, const Slice& key, + CompactionEventListener::CompactionListenerValueType value_type, + const Slice& existing_value, const SequenceNumber& sn, bool is_new) { + if (!is_new && + value_type == + CompactionEventListener::CompactionListenerValueType::kValue) { + BlobHandle handle; + Slice lsmval(existing_value); + Status s = handle.DecodeFrom(&lsmval); + if (s.ok()) { + if (impl_->debug_level_ >= 3) + Log(InfoLogLevel::INFO_LEVEL, impl_->db_options_.info_log, + "CALLBACK COMPACTED OUT KEY: %s SN: %d " + "NEW: %d FN: %" PRIu64 " OFFSET: %" PRIu64 " SIZE: %" PRIu64, + key.ToString().c_str(), sn, is_new, handle.filenumber(), + handle.offset(), handle.size()); + + impl_->override_vals_q_.enqueue({handle.filenumber(), key.size(), + handle.offset(), handle.size(), sn}); + } + } else { + if (impl_->debug_level_ >= 3) + Log(InfoLogLevel::INFO_LEVEL, impl_->db_options_.info_log, + "CALLBACK NEW KEY: %s SN: %d NEW: %d", key.ToString().c_str(), sn, + is_new); + } +} + +Status BlobDB::DestroyBlobDB(const std::string& dbname, const Options& options, + const BlobDBOptions& bdb_options) { + const ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); + Env* env = soptions.env; + + Status result; + std::string 
blobdir; + blobdir = (bdb_options.path_relative) ? dbname + "/" + bdb_options.blob_dir + : bdb_options.blob_dir; + + std::vector filenames; + Status status = env->GetChildren(blobdir, &filenames); + + for (const auto& f : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kBlobFile) { + Status del = env->DeleteFile(blobdir + "/" + f); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + + env->DeleteDir(blobdir); + return result; +} + +BlobDBImpl::BlobDBImpl(const std::string& dbname, + const BlobDBOptions& blob_db_options, + const DBOptions& db_options) + : BlobDB(nullptr), + db_impl_(nullptr), + myenv_(db_options.env), + wo_set_(false), + bdb_options_(blob_db_options), + db_options_(db_options), + env_options_(db_options), + dir_change_(false), + next_file_number_(1), + epoch_of_(0), + shutdown_(false), + current_epoch_(0), + open_file_count_(0), + last_period_write_(0), + last_period_ampl_(0), + total_periods_write_(0), + total_periods_ampl_(0), + total_blob_space_(0), + open_p1_done_(false), + debug_level_(0) { + const BlobDBOptionsImpl* options_impl = + dynamic_cast(&blob_db_options); + if (options_impl) { + bdb_options_ = *options_impl; + } + blob_dir_ = (bdb_options_.path_relative) + ? dbname + "/" + bdb_options_.blob_dir + : bdb_options_.blob_dir; + + if (bdb_options_.default_ttl_extractor) { + bdb_options_.extract_ttl_fn = &BlobDBImpl::ExtractTTLFromBlob; + } +} + +Status BlobDBImpl::LinkToBaseDB(DB* db) { + assert(db_ == nullptr); + assert(open_p1_done_); + + db_ = db; + + // the Base DB in-itself can be a stackable DB + StackableDB* sdb = dynamic_cast(db_); + if (sdb) { + db_impl_ = dynamic_cast(sdb->GetBaseDB()); + } else { + db_impl_ = dynamic_cast(db); + } + + myenv_ = db_->GetEnv(); + + opt_db_.reset(new OptimisticTransactionDBImpl(db, false)); + + Status s = myenv_->CreateDirIfMissing(blob_dir_); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Failed to create blob directory: %s status: '%s'", blob_dir_.c_str(), + s.ToString().c_str()); + } + s = myenv_->NewDirectory(blob_dir_, &dir_ent_); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Failed to open blob directory: %s status: '%s'", blob_dir_.c_str(), + s.ToString().c_str()); + } + + StartBackgroundTasks(); + return s; +} + +BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options) + : BlobDB(db), + db_impl_(dynamic_cast(db)), + opt_db_(new OptimisticTransactionDBImpl(db, false)), + wo_set_(false), + bdb_options_(blob_db_options), + db_options_(db->GetOptions()), + env_options_(db_->GetOptions()), + dir_change_(false), + next_file_number_(1), + epoch_of_(0), + shutdown_(false), + current_epoch_(0), + open_file_count_(0), + last_period_write_(0), + last_period_ampl_(0), + total_periods_write_(0), + total_periods_ampl_(0), + total_blob_space_(0) { + assert(db_impl_ != nullptr); + const BlobDBOptionsImpl* options_impl = + dynamic_cast(&blob_db_options); + if (options_impl) { + bdb_options_ = *options_impl; + } + + if (!bdb_options_.blob_dir.empty()) + blob_dir_ = (bdb_options_.path_relative) + ? 
db_->GetName() + "/" + bdb_options_.blob_dir + : bdb_options_.blob_dir; + + if (bdb_options_.default_ttl_extractor) { + bdb_options_.extract_ttl_fn = &BlobDBImpl::ExtractTTLFromBlob; + } +} + +BlobDBImpl::~BlobDBImpl() { + // CancelAllBackgroundWork(db_, true); + + Shutdown(); +} + +Status BlobDBImpl::OpenPhase1() { + assert(db_ == nullptr); + if (blob_dir_.empty()) + return Status::NotSupported("No blob directory in options"); + + std::unique_ptr dir_ent; + Status s = myenv_->NewDirectory(blob_dir_, &dir_ent); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Failed to open blob directory: %s status: '%s'", blob_dir_.c_str(), + s.ToString().c_str()); + open_p1_done_ = true; + return Status::OK(); + } + + s = OpenAllFiles(); + open_p1_done_ = true; + return s; +} + +void BlobDBImpl::StartBackgroundTasks() { + // store a call to a member function and object + tqueue_.add( + bdb_options_.reclaim_of_period_millisecs, + std::bind(&BlobDBImpl::ReclaimOpenFiles, this, std::placeholders::_1)); + tqueue_.add(bdb_options_.gc_check_period_millisecs, + std::bind(&BlobDBImpl::RunGC, this, std::placeholders::_1)); + tqueue_.add( + bdb_options_.deletion_check_period_millisecs, + std::bind(&BlobDBImpl::EvictDeletions, this, std::placeholders::_1)); + tqueue_.add( + bdb_options_.deletion_check_period_millisecs, + std::bind(&BlobDBImpl::EvictCompacted, this, std::placeholders::_1)); + tqueue_.add( + bdb_options_.delete_obsf_period_millisecs, + std::bind(&BlobDBImpl::DeleteObsFiles, this, std::placeholders::_1)); + tqueue_.add(bdb_options_.sanity_check_period_millisecs, + std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1)); + tqueue_.add(bdb_options_.wa_stats_period_millisecs, + std::bind(&BlobDBImpl::WaStats, this, std::placeholders::_1)); + tqueue_.add(bdb_options_.fsync_files_period_millisecs, + std::bind(&BlobDBImpl::FsyncFiles, this, std::placeholders::_1)); + tqueue_.add( + bdb_options_.check_seqf_period_millisecs, + std::bind(&BlobDBImpl::CheckSeqFiles, this, std::placeholders::_1)); +} + +void BlobDBImpl::Shutdown() { shutdown_.store(true); } + +void BlobDBImpl::OnFlushBeginHandler(DB* db, const FlushJobInfo& info) { + if (shutdown_.load()) return; + + // a callback that happens too soon needs to be ignored + if (!db_) return; + + FsyncFiles(false); +} + +Status BlobDBImpl::GetAllLogFiles( + std::set>* file_nums) { + std::vector all_files; + Status status = myenv_->GetChildren(blob_dir_, &all_files); + if (!status.ok()) { + return status; + } + + for (const auto& f : all_files) { + uint64_t number; + FileType type; + bool psucc = ParseFileName(f, &number, &type); + if (psucc && type == kBlobFile) { + file_nums->insert(std::make_pair(number, f)); + } else { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Skipping file in blob directory %s parse: %d type: %d", f.c_str(), + psucc, ((psucc) ? type : -1)); + } + } + + return status; +} + +Status BlobDBImpl::OpenAllFiles() { + WriteLock wl(&mutex_); + + std::set> file_nums; + Status status = GetAllLogFiles(&file_nums); + + if (!status.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to collect files from blob dir: %s status: '%s'", + blob_dir_.c_str(), status.ToString().c_str()); + return status; + } + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "BlobDir files path: %s count: %d min: %" PRIu64 " max: %" PRIu64, + blob_dir_.c_str(), static_cast(file_nums.size()), + (file_nums.empty()) ? -1 : (file_nums.begin())->first, + (file_nums.empty()) ? 
-1 : (file_nums.end())->first); + + if (!file_nums.empty()) + next_file_number_.store((file_nums.rbegin())->first + 1); + + for (auto f_iter : file_nums) { + std::string bfpath = BlobFileName(blob_dir_, f_iter.first); + uint64_t size_bytes; + Status s1 = myenv_->GetFileSize(bfpath, &size_bytes); + if (!s1.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Unable to get size of %s. File skipped from open status: '%s'", + bfpath.c_str(), s1.ToString().c_str()); + continue; + } + + if (debug_level_ >= 1) + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Blob File open: %s size: %" PRIu64, bfpath.c_str(), size_bytes); + + std::shared_ptr bfptr = + std::make_shared(this, blob_dir_, f_iter.first); + bfptr->SetFileSize(size_bytes); + + // since this file already existed, we will try to reconcile + // deleted count with LSM + bfptr->gc_once_after_open_ = true; + + // read header + std::shared_ptr reader; + reader = bfptr->OpenSequentialReader(myenv_, db_options_, env_options_); + s1 = reader->ReadHeader(&bfptr->header_); + if (!s1.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failure to read header for blob-file %s " + "status: '%s' size: %" PRIu64, + bfpath.c_str(), s1.ToString().c_str(), size_bytes); + continue; + } + bfptr->header_valid_ = true; + + std::shared_ptr ra_reader = + GetOrOpenRandomAccessReader(bfptr, myenv_, env_options_); + + BlobLogFooter bf; + s1 = bfptr->ReadFooter(&bf); + + bfptr->CloseRandomAccessLocked(); + if (s1.ok()) { + s1 = bfptr->SetFromFooterLocked(bf); + if (!s1.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Header Footer mismatch for blob-file %s " + "status: '%s' size: %" PRIu64, + bfpath.c_str(), s1.ToString().c_str(), size_bytes); + continue; + } + } else { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "File found incomplete (w/o footer) %s", bfpath.c_str()); + + // sequentially iterate over the file and read all the records + ttlrange_t ttl_range(std::numeric_limits::max(), + std::numeric_limits::min()); + tsrange_t ts_range(std::numeric_limits::max(), + std::numeric_limits::min()); + snrange_t sn_range(std::numeric_limits::max(), + std::numeric_limits::min()); + + uint64_t blob_count = 0; + BlobLogRecord record; + Reader::ReadLevel shallow = Reader::kReadHdrKeyFooter; + + uint64_t record_start = reader->GetNextByte(); + // TODO(arahut) - when we detect corruption, we should truncate + while (reader->ReadRecord(&record, shallow).ok()) { + ++blob_count; + if (bfptr->HasTTL()) { + extendTTL(&ttl_range, record.GetTTL()); + } + if (bfptr->HasTimestamp()) { + extendTimestamps(&ts_range, record.GetTimeVal()); + } + extendSN(&sn_range, record.GetSN()); + record_start = reader->GetNextByte(); + } + + if (record_start != bfptr->GetFileSize()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Blob file is corrupted or crashed during write %s" + " good_size: %" PRIu64 " file_size: %" PRIu64, + bfpath.c_str(), record_start, bfptr->GetFileSize()); + } + + if (!blob_count) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "BlobCount = 0 in file %s", bfpath.c_str()); + continue; + } + + bfptr->SetBlobCount(blob_count); + bfptr->SetSNRange(sn_range); + + if (bfptr->HasTimestamp()) bfptr->set_time_range(ts_range); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Blob File: %s blob_count: %" PRIu64 " size_bytes: %" PRIu64 + " sn_range: (%d, %d) ts: %d ttl: %d", + bfpath.c_str(), blob_count, size_bytes, sn_range.first, + sn_range.second, bfptr->HasTimestamp(), bfptr->HasTTL()); + + if 
(bfptr->HasTTL()) { + ttl_range.second = + std::max(ttl_range.second, + ttl_range.first + (uint32_t)bdb_options_.ttl_range_secs); + bfptr->set_ttl_range(ttl_range); + + std::time_t epoch_now = std::chrono::system_clock::to_time_t( + std::chrono::system_clock::now()); + if (ttl_range.second < epoch_now) { + Status fstatus = CreateWriterLocked(bfptr); + if (fstatus.ok()) fstatus = bfptr->WriteFooterAndCloseLocked(); + if (!fstatus.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to close Blob File: %s status: '%s'. Skipped", + bfpath.c_str(), fstatus.ToString().c_str()); + continue; + } else { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Blob File Closed: %s now: %d ttl_range: (%d, %d)", + bfpath.c_str(), epoch_now, ttl_range.first, ttl_range.second); + } + } else { + open_blob_files_.insert(bfptr); + } + } + } + + blob_files_.insert(std::make_pair(f_iter.first, bfptr)); + } + + return status; +} + +void BlobDBImpl::CloseRandomAccessLocked( + const std::shared_ptr& bfile) { + bfile->CloseRandomAccessLocked(); + open_file_count_--; +} + +std::shared_ptr BlobDBImpl::GetOrOpenRandomAccessReader( + const std::shared_ptr& bfile, Env* env, + const EnvOptions& env_options) { + bool fresh_open = false; + auto rar = bfile->GetOrOpenRandomAccessReader(env, env_options, &fresh_open); + if (fresh_open) open_file_count_++; + return rar; +} + +std::shared_ptr BlobDBImpl::NewBlobFile(const std::string& reason) { + uint64_t file_num = next_file_number_++; + auto bfile = std::make_shared(this, blob_dir_, file_num); + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "New blob file created: %s reason='%s'", bfile->PathName().c_str(), + reason.c_str()); + LogFlush(db_options_.info_log); + return bfile; +} + +Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { + std::string fpath(bfile->PathName()); + std::unique_ptr wfile; + + // We are having issue that we write duplicate blob to blob file and the bug + // is related to writable file buffer. Force no buffer until we fix the bug. 
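+  // The workaround: take a local copy of the env options and zero out
+  // writable_file_max_buffer_size, which should make the WritableFileWriter
+  // created below push each append straight to the file rather than staging
+  // it in a user-space buffer. The shared env_options_ member is untouched.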
+ EnvOptions env_options = env_options_; + env_options.writable_file_max_buffer_size = 0; + + Status s = myenv_->ReopenWritableFile(fpath, &wfile, env_options); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to open blob file for write: %s status: '%s'" + " exists: '%s'", + fpath.c_str(), s.ToString().c_str(), + myenv_->FileExists(fpath).ToString().c_str()); + return s; + } + + std::unique_ptr fwriter; + fwriter.reset(new WritableFileWriter(std::move(wfile), env_options)); + + uint64_t boffset = bfile->GetFileSize(); + if (debug_level_ >= 2 && boffset) { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Open blob file: %s with offset: %d", fpath.c_str(), boffset); + } + + Writer::ElemType et = Writer::kEtNone; + if (bfile->file_size_ == BlobLogHeader::kHeaderSize) + et = Writer::kEtFileHdr; + else if (bfile->file_size_ > BlobLogHeader::kHeaderSize) + et = Writer::kEtFooter; + else if (bfile->file_size_) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Open blob file: %s with wrong size: %d", fpath.c_str(), boffset); + return Status::Corruption("Invalid blob file size"); + } + + bfile->log_writer_ = std::make_shared( + std::move(fwriter), bfile->file_number_, bdb_options_.bytes_per_sync, + db_options_.use_fsync, boffset); + bfile->log_writer_->last_elem_type_ = et; + + return s; +} + +std::shared_ptr BlobDBImpl::FindBlobFileLocked( + uint32_t expiration) const { + if (open_blob_files_.empty()) return nullptr; + + std::shared_ptr tmp = std::make_shared(); + tmp->ttl_range_ = std::make_pair(expiration, 0); + + auto citr = open_blob_files_.equal_range(tmp); + if (citr.first == open_blob_files_.end()) { + assert(citr.second == open_blob_files_.end()); + + std::shared_ptr check = *(open_blob_files_.rbegin()); + return (check->ttl_range_.second < expiration) ? nullptr : check; + } + + if (citr.first != citr.second) return *(citr.first); + + auto finditr = citr.second; + if (finditr != open_blob_files_.begin()) --finditr; + + bool b2 = (*finditr)->ttl_range_.second < expiration; + bool b1 = (*finditr)->ttl_range_.first > expiration; + + return (b1 || b2) ? 
nullptr : (*finditr); +} + +std::shared_ptr BlobDBImpl::CheckOrCreateWriterLocked( + const std::shared_ptr& bfile) { + std::shared_ptr writer = bfile->GetWriter(); + if (writer) return writer; + + Status s = CreateWriterLocked(bfile); + if (!s.ok()) return nullptr; + + writer = bfile->GetWriter(); + return writer; +} + +void BlobDBImpl::UpdateWriteOptions(const WriteOptions& options) { + if (!wo_set_.load(std::memory_order_relaxed)) { + // DCLP + WriteLock wl(&mutex_); + if (!wo_set_.load(std::memory_order_acquire)) { + wo_set_.store(true, std::memory_order_release); + write_options_ = options; + } + } +} + +std::shared_ptr BlobDBImpl::SelectBlobFile() { + uint32_t val = blob_rgen.Next(); + { + ReadLock rl(&mutex_); + if (open_simple_files_.size() == bdb_options_.num_concurrent_simple_blobs) + return open_simple_files_[val % bdb_options_.num_concurrent_simple_blobs]; + } + + std::shared_ptr bfile = NewBlobFile("SelectBlobFile"); + assert(bfile); + + // file not visible, hence no lock + std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); + if (!writer) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to get writer from blob file: %s", bfile->PathName().c_str()); + return nullptr; + } + + bfile->file_size_ = BlobLogHeader::kHeaderSize; + bfile->header_.compression_ = bdb_options_.compression; + bfile->header_valid_ = true; + + // CHECK again + WriteLock wl(&mutex_); + if (open_simple_files_.size() == bdb_options_.num_concurrent_simple_blobs) { + return open_simple_files_[val % bdb_options_.num_concurrent_simple_blobs]; + } + + Status s = writer->WriteHeader(bfile->header_); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to write header to new blob file: %s" + " status: '%s'", + bfile->PathName().c_str(), s.ToString().c_str()); + return nullptr; + } + + dir_change_.store(true); + blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); + open_simple_files_.push_back(bfile); + return bfile; +} + +std::shared_ptr BlobDBImpl::SelectBlobFileTTL(uint32_t expiration) { + uint64_t epoch_read = 0; + std::shared_ptr bfile; + { + ReadLock rl(&mutex_); + bfile = FindBlobFileLocked(expiration); + epoch_read = epoch_of_.load(); + } + + if (bfile) { + assert(!bfile->Immutable()); + return bfile; + } + + uint32_t exp_low = + (expiration / bdb_options_.ttl_range_secs) * bdb_options_.ttl_range_secs; + uint32_t exp_high = exp_low + bdb_options_.ttl_range_secs; + ttlrange_t ttl_guess = std::make_pair(exp_low, exp_high); + + bfile = NewBlobFile("SelectBlobFileTTL"); + assert(bfile); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "New blob file TTL range: %s %d %d", bfile->PathName().c_str(), exp_low, + exp_high); + LogFlush(db_options_.info_log); + + // we don't need to take lock as no other thread is seeing bfile yet + std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); + if (!writer) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to get writer from blob file with TTL: %s", + bfile->PathName().c_str()); + return nullptr; + } + + bfile->header_.set_ttl_guess(ttl_guess); + bfile->header_.compression_ = bdb_options_.compression; + bfile->header_valid_ = true; + bfile->file_size_ = BlobLogHeader::kHeaderSize; + + // set the first value of the range, since that is + // concrete at this time. also necessary to add to open_blob_files_ + bfile->ttl_range_ = ttl_guess; + + WriteLock wl(&mutex_); + // in case the epoch has shifted in the interim, then check + // check condition again - should be rare. 
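+  // This is optimistic creation: the epoch was sampled under the read lock,
+  // the candidate file was built with no lock held, and only now, under the
+  // write lock, is the epoch compared again. If another thread won the race
+  // and already registered a file covering this expiration, that file is
+  // returned and the locally built one is simply dropped; it was never
+  // published in blob_files_ or open_blob_files_, so nothing needs undoing.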
+ if (epoch_of_.load() != epoch_read) { + auto bfile2 = FindBlobFileLocked(expiration); + if (bfile2) return bfile2; + } + + Status s = writer->WriteHeader(bfile->header_); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to write header to new blob file: %s" + " status: '%s'", + bfile->PathName().c_str(), s.ToString().c_str()); + return nullptr; + } + + dir_change_.store(true); + blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); + open_blob_files_.insert(bfile); + epoch_of_++; + + return bfile; +} + +bool BlobDBImpl::ExtractTTLFromBlob(const Slice& value, Slice* newval, + int32_t* ttl_val) { + *newval = value; + *ttl_val = -1; + if (value.size() <= BlobDB::kTTLSuffixLength) return false; + + int32_t ttl_tmp = + DecodeFixed32(value.data() + value.size() - sizeof(int32_t)); + std::string ttl_exp(value.data() + value.size() - BlobDB::kTTLSuffixLength, + 4); + if (ttl_exp != "ttl:") return false; + + newval->remove_suffix(BlobDB::kTTLSuffixLength); + *ttl_val = ttl_tmp; + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +// A specific pattern is looked up at the end of the value part. +// ttl:TTLVAL . if this pattern is found, PutWithTTL is called, otherwise +// regular Put is called. +//////////////////////////////////////////////////////////////////////////////// +Status BlobDBImpl::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + Slice newval; + int32_t ttl_val; + if (bdb_options_.extract_ttl_fn) { + bdb_options_.extract_ttl_fn(value, &newval, &ttl_val); + return PutWithTTL(options, column_family, key, newval, ttl_val); + } + + return PutWithTTL(options, column_family, key, value, -1); +} + +Status BlobDBImpl::Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key) { + SequenceNumber lsn = db_impl_->GetLatestSequenceNumber(); + Status s = db_->Delete(options, column_family, key); + + // add deleted key to list of keys that have been deleted for book-keeping + delete_keys_q_.enqueue({column_family, key.ToString(), lsn}); + return s; +} + +Status BlobDBImpl::SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + SequenceNumber lsn = db_impl_->GetLatestSequenceNumber(); + Status s = db_->SingleDelete(wopts, column_family, key); + + delete_keys_q_.enqueue({column_family, key.ToString(), lsn}); + return s; +} + +Status BlobDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) { + class Handler1 : public WriteBatch::Handler { + public: + explicit Handler1(BlobDBImpl* i) : impl(i), previous_put(false) {} + + BlobDBImpl* impl; + WriteBatch updates_blob; + Status batch_rewrite_status; + std::shared_ptr last_file; + bool previous_put; + + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value_unc) override { + Slice newval; + int32_t ttl_val = -1; + if (impl->bdb_options_.extract_ttl_fn) { + impl->bdb_options_.extract_ttl_fn(value_unc, &newval, &ttl_val); + } else { + newval = value_unc; + } + + int32_t expiration = -1; + if (ttl_val != -1) { + std::time_t cur_t = std::chrono::system_clock::to_time_t( + std::chrono::system_clock::now()); + expiration = ttl_val + static_cast(cur_t); + } + std::shared_ptr bfile = + (ttl_val != -1) ? impl->SelectBlobFileTTL(expiration) + : ((last_file) ? 
last_file : impl->SelectBlobFile()); + if (last_file && last_file != bfile) { + batch_rewrite_status = Status::NotFound("too many blob files"); + return batch_rewrite_status; + } + + if (!bfile) { + batch_rewrite_status = Status::NotFound("blob file not found"); + return batch_rewrite_status; + } + + Slice value = value_unc; + std::string compression_output; + if (impl->bdb_options_.compression != kNoCompression) { + CompressionType ct = impl->bdb_options_.compression; + CompressionOptions compression_opts; + value = CompressBlock(value_unc, compression_opts, &ct, + kBlockBasedTableVersionFormat, Slice(), + &compression_output); + } + + std::string headerbuf; + Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1); + + if (previous_put) { + impl->AppendSN(last_file, -1); + previous_put = false; + } + + last_file = bfile; + + std::string index_entry; + Status st = impl->AppendBlob(bfile, headerbuf, key, value, &index_entry); + + if (expiration != -1) + extendTTL(&(bfile->ttl_range_), (uint32_t)expiration); + + if (!st.ok()) { + batch_rewrite_status = st; + } else { + previous_put = true; + WriteBatchInternal::Put(&updates_blob, column_family_id, key, + index_entry); + } + return Status::OK(); + } + + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + batch_rewrite_status = + Status::NotSupported("Not supported operation in blob db."); + return batch_rewrite_status; + } + + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + WriteBatchInternal::Delete(&updates_blob, column_family_id, key); + return Status::OK(); + } + + virtual void LogData(const Slice& blob) override { + updates_blob.PutLogData(blob); + } + + private: + }; + + Handler1 handler1(this); + updates->Iterate(&handler1); + + Status s; + SequenceNumber lsn = db_impl_->GetLatestSequenceNumber(); + + if (!handler1.batch_rewrite_status.ok()) { + return handler1.batch_rewrite_status; + } else { + s = db_->Write(opts, &(handler1.updates_blob)); + } + + if (!s.ok()) return s; + + if (handler1.previous_put) { + // this is the sequence number of the write. + SequenceNumber sn = WriteBatchInternal::Sequence(&handler1.updates_blob); + AppendSN(handler1.last_file, sn); + + CloseIf(handler1.last_file); + } + + // add deleted key to list of keys that have been deleted for book-keeping + class Handler2 : public WriteBatch::Handler { + public: + explicit Handler2(BlobDBImpl* i, const SequenceNumber& sn) + : impl(i), lsn(sn) {} + + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + ColumnFamilyHandle* cfh = + impl->db_impl_->GetColumnFamilyHandleUnlocked(column_family_id); + + impl->delete_keys_q_.enqueue({cfh, key.ToString(), lsn}); + return Status::OK(); + } + + private: + BlobDBImpl* impl; + SequenceNumber lsn; + }; + + // add deleted key to list of keys that have been deleted for book-keeping + Handler2 handler2(this, lsn); + updates->Iterate(&handler2); + + return Status::OK(); +} + +Status BlobDBImpl::PutWithTTL(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + int32_t ttl) { + return PutUntil( + options, column_family, key, value, + (ttl != -1) + ? 
ttl + static_cast(std::chrono::system_clock::to_time_t( + std::chrono::system_clock::now())) + : -1); +} + +Status BlobDBImpl::PutUntil(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value_unc, int32_t expiration) { + UpdateWriteOptions(options); + + std::shared_ptr bfile = + (expiration != -1) ? SelectBlobFileTTL(expiration) : SelectBlobFile(); + + if (!bfile) return Status::NotFound("Blob file not found"); + + Slice value = value_unc; + std::string compression_output; + if (bdb_options_.compression != kNoCompression) { + CompressionType ct = bdb_options_.compression; + CompressionOptions compression_opts; + value = CompressBlock(value_unc, compression_opts, &ct, + kBlockBasedTableVersionFormat, Slice(), + &compression_output); + } + + std::string headerbuf; + Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1); + + // this is another more safer way to do it, where you keep the writeLock + // for the entire write path. this will increase latency and reduce + // throughput + // WriteLock lockbfile_w(&bfile->mutex_); + // std::shared_ptr writer = + // CheckOrCreateWriterLocked(bfile); + + if (debug_level_ >= 3) + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + ">Adding KEY FILE: %s: KEY: %s VALSZ: %d", bfile->PathName().c_str(), + key.ToString().c_str(), value.size()); + + std::string index_entry; + Status s = AppendBlob(bfile, headerbuf, key, value, &index_entry); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to append blob to FILE: %s: KEY: %s VALSZ: %d" + " status: '%s' blob_file: '%s'", + bfile->PathName().c_str(), key.ToString().c_str(), value.size(), + s.ToString().c_str(), bfile->DumpState().c_str()); + // Fallback just write to the LSM and get going + WriteBatch batch; + batch.Put(column_family, key, value); + return db_->Write(options, &batch); + } + + WriteBatch batch; + batch.Put(column_family, key, index_entry); + + // this goes to the base db and can be expensive + s = db_->Write(options, &batch); + + // this is the sequence number of the write. + SequenceNumber sn = WriteBatchInternal::Sequence(&batch); + + if (debug_level_ >= 3) + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "PathName().c_str(), + key.ToString().c_str(), sn); + + s = AppendSN(bfile, sn); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to append SN to FILE: %s: KEY: %s VALSZ: %d" + " status: '%s' blob_file: '%s'", + bfile->PathName().c_str(), key.ToString().c_str(), value.size(), + s.ToString().c_str(), bfile->DumpState().c_str()); + } + + if (expiration != -1) extendTTL(&(bfile->ttl_range_), (uint32_t)expiration); + + CloseIf(bfile); + + return s; +} + +Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, + const std::string& headerbuf, const Slice& key, + const Slice& value, std::string* index_entry) { + Status s; + + uint64_t blob_offset = 0; + uint64_t key_offset = 0; + { + WriteLock lockbfile_w(&bfile->mutex_); + std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); + if (!writer) return Status::IOError("Failed to create blob writer"); + + // write the blob to the blob log. 
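+    // On disk this lays the record out as [record header][key][blob]; the
+    // record footer carrying the sequence number is appended separately by
+    // AppendSN() once the LSM write has assigned one. key_offset and
+    // blob_offset come back as the absolute file offsets of the key and
+    // blob payloads (blob_offset is what goes into the index entry below).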
+ s = writer->EmitPhysicalRecord(headerbuf, key, value, &key_offset, + &blob_offset); + } + + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Invalid status in AppendBlob: %s status: '%s'", + bfile->PathName().c_str(), s.ToString().c_str()); + return s; + } + + // increment blob count + bfile->blob_count_++; + auto size_put = BlobLogRecord::kHeaderSize + key.size() + value.size(); + + bfile->file_size_ += size_put; + last_period_write_ += size_put; + total_blob_space_ += size_put; + + BlobHandle handle; + handle.set_filenumber(bfile->BlobFileNumber()); + handle.set_size(value.size()); + handle.set_offset(blob_offset); + handle.set_compression(bdb_options_.compression); + handle.EncodeTo(index_entry); + + if (debug_level_ >= 3) + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + ">Adding KEY FILE: %s: BC: %d OFFSET: %d SZ: %d", + bfile->PathName().c_str(), bfile->blob_count_.load(), blob_offset, + value.size()); + + return s; +} + +Status BlobDBImpl::AppendSN(const std::shared_ptr& bfile, + const SequenceNumber& sn) { + Status s; + { + WriteLock lockbfile_w(&bfile->mutex_); + std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); + if (!writer) return Status::IOError("Failed to create blob writer"); + + s = writer->AddRecordFooter(sn); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Invalid status in AppendSN: %s status: '%s'", + bfile->PathName().c_str(), s.ToString().c_str()); + return s; + } + + if (sn != std::numeric_limits::max()) + extendSN(&(bfile->sn_range_), sn); + } + + bfile->file_size_ += BlobLogRecord::kFooterSize; + last_period_write_ += BlobLogRecord::kFooterSize; + total_blob_space_ += BlobLogRecord::kFooterSize; + return s; +} + +std::vector BlobDBImpl::MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + std::vector values_lsm; + values_lsm.resize(keys.size()); + auto statuses = db_->MultiGet(options, column_family, keys, &values_lsm); + + for (size_t i = 0; i < keys.size(); ++i) { + if (!statuses[i].ok()) continue; + + auto cfh = reinterpret_cast(column_family[i]); + auto cfd = cfh->cfd(); + + Status s = CommonGet(cfd, keys[i], values_lsm[i], &((*values)[i])); + statuses[i] = s; + } + return statuses; +} + +Status BlobDBImpl::CommonGet(const ColumnFamilyData* cfd, const Slice& key, + const std::string& index_entry, + std::string* value) { + Slice index_entry_slice(index_entry); + BlobHandle handle; + Status s = handle.DecodeFrom(&index_entry_slice); + if (!s.ok()) return s; + + // offset has to have certain min, as we will read CRC + // later from the Blob Header, which needs to be also a + // valid offset. 
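+  // Concretely: the blob payload can only start after the file header, the
+  // record header and the key, so any offset below
+  // BlobLogHeader::kHeaderSize + BlobLogRecord::kHeaderSize + key.size()
+  // is malformed. The same layout is relied on further down, where the
+  // record CRC is read back from
+  // handle.offset() - (key.size() + sizeof(uint32_t)).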
+ if (handle.offset() < + (BlobLogHeader::kHeaderSize + BlobLogRecord::kHeaderSize + key.size())) { + if (debug_level_ >= 2) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Invalid blob handle file_number: %" PRIu64 " blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " key: %s", + handle.filenumber(), handle.offset(), handle.size(), key.data()); + } + return Status::NotFound("Blob Not Found, although found in LSM"); + } + + std::shared_ptr bfile; + { + ReadLock rl(&mutex_); + auto hitr = blob_files_.find(handle.filenumber()); + + // file was deleted + if (hitr == blob_files_.end()) { + return Status::NotFound("Blob Not Found as blob file missing"); + } + + bfile = hitr->second; + } + + if (bfile->Obsolete()) { + return Status::NotFound( + "Blob Not Found as blob file was garbage collected"); + } + + // 0 - size + if (!handle.size()) { + value->clear(); + return Status::OK(); + } + + // takes locks when called + std::shared_ptr reader = + GetOrOpenRandomAccessReader(bfile, myenv_, env_options_); + + std::string* valueptr = value; + std::string value_c; + if (bdb_options_.compression != kNoCompression) { + valueptr = &value_c; + } + + // allocate the buffer. This is safe in C++11 + valueptr->resize(handle.size()); + char* buffer = &(*valueptr)[0]; + + Slice blob_value; + s = reader->Read(handle.offset(), handle.size(), &blob_value, buffer); + if (!s.ok() || blob_value.size() != handle.size()) { + if (debug_level_ >= 2) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to read blob from file: %s blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " read: %d key: %s status: '%s'", + bfile->PathName().c_str(), handle.offset(), handle.size(), + static_cast(blob_value.size()), key.data(), + s.ToString().c_str()); + } + return Status::NotFound("Blob Not Found as couldnt retrieve Blob"); + } + + Slice crc_slice; + uint32_t crc_exp; + std::string crc_str; + crc_str.resize(sizeof(uint32_t)); + char* crc_buffer = &(crc_str[0]); + s = reader->Read(handle.offset() - (key.size() + sizeof(uint32_t)), + sizeof(uint32_t), &crc_slice, crc_buffer); + if (!s.ok() || !GetFixed32(&crc_slice, &crc_exp)) { + if (debug_level_ >= 2) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to fetch blob crc file: %s blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " key: %s status: '%s'", + bfile->PathName().c_str(), handle.offset(), handle.size(), key.data(), + s.ToString().c_str()); + } + return Status::NotFound("Blob Not Found as couldnt retrieve CRC"); + } + + uint32_t crc = crc32c::Extend(0, blob_value.data(), blob_value.size()); + crc = crc32c::Mask(crc); // Adjust for storage + if (crc != crc_exp) { + if (debug_level_ >= 2) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Blob crc mismatch file: %s blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " key: %s status: '%s'", + bfile->PathName().c_str(), handle.offset(), handle.size(), key.data(), + s.ToString().c_str()); + } + return Status::Corruption("Corruption. 
Blob CRC mismatch"); + } + + if (bdb_options_.compression != kNoCompression) { + BlockContents contents; + s = UncompressBlockContentsForCompressionType( + blob_value.data(), blob_value.size(), &contents, + kBlockBasedTableVersionFormat, Slice(), bdb_options_.compression, + *(cfd->ioptions())); + *value = contents.data.ToString(); + } + + return s; +} + +Status BlobDBImpl::Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + Status s; + std::string index_entry; + s = db_->Get(options, column_family, key, &index_entry); + if (!s.ok()) { + if (debug_level_ >= 3) + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Get Failed on LSM KEY: %s status: '%s'", key.ToString().c_str(), + s.ToString().c_str()); + return s; + } + + s = CommonGet(cfd, key, index_entry, value); + return s; +} + +Slice BlobDBIterator::value() const { + Slice index_entry = iter_->value(); + + auto cfh = reinterpret_cast(cfh_); + auto cfd = cfh->cfd(); + + Status s = db_impl_->CommonGet(cfd, iter_->key(), index_entry.ToString(false), + &vpart_); + return Slice(vpart_); +} + +std::pair BlobDBImpl::SanityCheck(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Starting Sanity Check"); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Number of files %" PRIu64, blob_files_.size()); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Number of open files %" PRIu64, open_blob_files_.size()); + + for (auto bfile : open_blob_files_) { + assert(!bfile->Immutable()); + } + + std::time_t epoch_now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + + for (auto bfile_pair : blob_files_) { + auto bfile = bfile_pair.second; + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Blob File %s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %d", + bfile->PathName().c_str(), bfile->GetFileSize(), bfile->BlobCount(), + bfile->deleted_count_, bfile->deleted_size_, + (bfile->ttl_range_.second - epoch_now)); + } + + // reschedule + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::CloseSeqWrite( + std::shared_ptr bfile, bool aborted) { + { + WriteLock wl(&mutex_); + + // this prevents others from picking up this file + open_blob_files_.erase(bfile); + + auto findit = + std::find(open_simple_files_.begin(), open_simple_files_.end(), bfile); + if (findit != open_simple_files_.end()) open_simple_files_.erase(findit); + } + + if (!bfile->closed_.load()) { + WriteLock lockbfile_w(&bfile->mutex_); + bfile->WriteFooterAndCloseLocked(); + } + + return std::make_pair(false, -1); +} + +void BlobDBImpl::CloseIf(const std::shared_ptr& bfile) { + // atomic read + bool close = bfile->GetFileSize() > bdb_options_.blob_file_size; + if (!close) return; + + if (debug_level_ >= 2) { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Scheduling file for close %s fsize: %" PRIu64 " limit: %" PRIu64, + bfile->PathName().c_str(), bfile->GetFileSize(), + bdb_options_.blob_file_size); + } + + { + WriteLock wl(&mutex_); + + open_blob_files_.erase(bfile); + auto findit = + std::find(open_simple_files_.begin(), open_simple_files_.end(), bfile); + if (findit != open_simple_files_.end()) { + open_simple_files_.erase(findit); + } else { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "File not found while closing %s fsize: %" PRIu64 + " Multithreaded Writes?", + bfile->PathName().c_str(), 
bfile->GetFileSize()); + } + } + + tqueue_.add(0, std::bind(&BlobDBImpl::CloseSeqWrite, this, bfile, + std::placeholders::_1)); +} + +bool BlobDBImpl::FileDeleteOk_SnapshotCheckLocked( + const std::shared_ptr& bfile) { + assert(bfile->Obsolete()); + + SequenceNumber esn = bfile->GetSNRange().first; + + // this is not correct. + // you want to check that there are no snapshots in the + bool notok = db_impl_->HasActiveSnapshotLaterThanSN(esn); + if (notok) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Could not delete file due to snapshot failure %s", + bfile->PathName().c_str()); + return false; + } else { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Will delete file due to snapshot success %s", + bfile->PathName().c_str()); + return true; + } +} + +bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size, + uint64_t blob_offset, + uint64_t blob_size) { + (void)blob_offset; + std::shared_ptr bfile; + { + ReadLock rl(&mutex_); + auto hitr = blob_files_.find(file_number); + + // file was deleted + if (hitr == blob_files_.end()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Could not find file_number %" PRIu64, file_number); + return false; + } + + bfile = hitr->second; + } + + WriteLock lockbfile_w(&bfile->mutex_); + + bfile->deleted_count_++; + bfile->deleted_size_ += key_size + blob_size + BlobLogRecord::kHeaderSize + + BlobLogRecord::kFooterSize; + return true; +} + +bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& lsmValue) { + Slice val(lsmValue); + BlobHandle handle; + Status s = handle.DecodeFrom(&val); + if (!s.ok()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Could not parse lsm val in MarkBlobDeleted %s", + lsmValue.ToString().c_str()); + return false; + } + bool succ = FindFileAndEvictABlob(handle.filenumber(), key.size(), + handle.offset(), handle.size()); + return succ; +} + +std::pair BlobDBImpl::EvictCompacted(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + override_packet_t packet; + while (override_vals_q_.dequeue(&packet)) { + bool succ = FindFileAndEvictABlob(packet.file_number_, packet.key_size_, + packet.blob_offset_, packet.blob_size_); + + if (!succ) + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "EVICT COMPACTION FAILURE SN: %d FN: %d OFFSET: %d SIZE: %d", + packet.dsn_, packet.file_number_, packet.blob_offset_, + packet.blob_size_); + + if (debug_level_ >= 3) + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "EVICT COMPACTED SN: %d FN: %d OFFSET: %d SIZE: %d SUCC: %d", + packet.dsn_, packet.file_number_, packet.blob_offset_, + packet.blob_size_, succ); + } + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::EvictDeletions(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + ColumnFamilyHandle* last_cfh = nullptr; + Options last_op; + + Arena arena; + ScopedArenaIterator iter; + + // we will use same RangeDelAggregator for all cf's. 
+ // essentially we do not support Range Deletes now + std::unique_ptr range_del_agg; + delete_packet_t dpacket; + while (delete_keys_q_.dequeue(&dpacket)) { + if (last_cfh != dpacket.cfh_) { + if (!range_del_agg) { + auto cfhi = reinterpret_cast(dpacket.cfh_); + auto cfd = cfhi->cfd(); + range_del_agg.reset(new RangeDelAggregator(cfd->internal_comparator(), + kMaxSequenceNumber)); + } + + // this can be expensive + last_cfh = dpacket.cfh_; + last_op = db_impl_->GetOptions(last_cfh); + iter.set(db_impl_->NewInternalIterator(&arena, range_del_agg.get(), + dpacket.cfh_)); + // this will not work for multiple CF's. + } + + Slice user_key(dpacket.key_); + InternalKey target(user_key, dpacket.dsn_, kTypeValue); + + Slice eslice = target.Encode(); + iter->Seek(eslice); + + if (!iter->status().ok()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Invalid iterator seek %s", dpacket.key_.c_str()); + continue; + } + + const Comparator* bwc = BytewiseComparator(); + while (iter->Valid()) { + if (!bwc->Equal(ExtractUserKey(iter->key()), ExtractUserKey(eslice))) + break; + + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + if (!ParseInternalKey(iter->key(), &ikey)) { + continue; + } + + // once you hit a DELETE, assume the keys below have been + // processed previously + if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion) break; + + Slice val = iter->value(); + MarkBlobDeleted(ikey.user_key, val); + + iter->Next(); + } + } + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::CheckSeqFiles(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + std::vector> process_files; + { + std::time_t epoch_now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + + ReadLock rl(&mutex_); + for (auto bfile : open_blob_files_) { + { + ReadLock lockbfile_r(&bfile->mutex_); + + if (bfile->ttl_range_.second > epoch_now) continue; + process_files.push_back(bfile); + } + } + } + + for (auto bfile : process_files) CloseSeqWrite(bfile, false); + + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::FsyncFiles(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + std::vector> process_files; + { + ReadLock rl(&mutex_); + for (auto fitr : open_blob_files_) { + if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync)) + process_files.push_back(fitr); + } + + for (auto fitr : open_simple_files_) { + if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync)) + process_files.push_back(fitr); + } + } + + for (auto fitr : process_files) { + if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync)) fitr->Fsync(); + } + + bool expected = true; + if (dir_change_.compare_exchange_weak(expected, false)) dir_ent_->Fsync(); + + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::ReclaimOpenFiles(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + if (open_file_count_.load() < bdb_options_.open_files_trigger) + return std::make_pair(true, -1); + + // in the future, we should sort by last_access_ + // instead of closing every file + ReadLock rl(&mutex_); + for (auto const& ent : blob_files_) { + auto bfile = ent.second; + if (bfile->last_access_.load() == -1) continue; + + WriteLock lockbfile_w(&bfile->mutex_); + CloseRandomAccessLocked(bfile); + } + + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::WaStats(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + WriteLock wl(&mutex_); + + if (all_periods_write_.size() < bdb_options_.wa_num_stats_periods) { + total_periods_write_ -= 
(*all_periods_write_.begin()); + total_periods_ampl_ = (*all_periods_ampl_.begin()); + + all_periods_write_.pop_front(); + all_periods_ampl_.pop_front(); + } + + uint64_t val1 = last_period_write_.load(); + uint64_t val2 = last_period_ampl_.load(); + + all_periods_write_.push_back(val1); + all_periods_ampl_.push_back(val2); + + last_period_write_ = 0; + last_period_ampl_ = 0; + + total_periods_write_ += val1; + total_periods_ampl_ += val2; + + return std::make_pair(true, -1); +} + +//////////////////////////////////////////////////////////////////////////////// +// iterate over the blobs sequentially and check if the blob sequence number +// is the latest. If it is the latest, preserve it, otherwise delete it +// if it is TTL based, and the TTL has expired, then +// we can blow the entity if the key is still the latest or the Key is not +// found +// WHAT HAPPENS IF THE KEY HAS BEEN OVERRIDEN. Then we can drop the blob +// without doing anything if the earliest snapshot is not +// referring to that sequence number, i.e. it is later than the sequence number +// of the new key +// +// if it is not TTL based, then we can blow the key if the key has been +// DELETED in the LSM +//////////////////////////////////////////////////////////////////////////////// +Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, + GCStats* gcstats) { + std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); + std::time_t tt = std::chrono::system_clock::to_time_t(now); + + std::shared_ptr reader = + bfptr->OpenSequentialReader(myenv_, db_options_, env_options_); + if (!reader) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "File sequential reader could not be opened", + bfptr->PathName().c_str()); + return Status::IOError("failed to create sequential reader"); + } + + BlobLogHeader header; + Status s = reader->ReadHeader(&header); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failure to read header for blob-file %s", bfptr->PathName().c_str()); + return s; + } + + bool first_gc = bfptr->gc_once_after_open_; + + ColumnFamilyHandle* cfh = bfptr->GetColumnFamily(db_); + auto cfhi = reinterpret_cast(cfh); + auto cfd = cfhi->cfd(); + bool has_ttl = header.HasTTL(); + + // this reads the key but skips the blob + Reader::ReadLevel shallow = Reader::kReadHdrKeyFooter; + + assert(opt_db_); + + bool no_relocation_ttl = (has_ttl && tt > bfptr->GetTTLRange().second); + + bool no_relocation_lsmdel = false; + { + ReadLock lockbfile_r(&bfptr->mutex_); + no_relocation_lsmdel = (bfptr->GetFileSize() == + (BlobLogHeader::kHeaderSize + bfptr->deleted_size_ + + BlobLogFooter::kFooterSize)); + } + + bool no_relocation = no_relocation_ttl || no_relocation_lsmdel; + if (!no_relocation) { + // read the blob because you have to write it back to new file + shallow = Reader::kReadHdrKeyBlobFooter; + } + + BlobLogRecord record; + std::shared_ptr newfile; + std::shared_ptr new_writer; + + while (reader->ReadRecord(&record, shallow).ok()) { + gcstats->blob_count++; + + bool del_this = false; + // this particular TTL has expired + if (no_relocation_ttl || (has_ttl && tt > record.GetTTL())) { + del_this = true; + } else { + SequenceNumber seq = kMaxSequenceNumber; + bool found_record_for_key = false; + SuperVersion* sv = db_impl_->GetAndRefSuperVersion(cfd); + if (sv == nullptr) { + Status result = + Status::InvalidArgument("Could not access column family 0"); + return result; + } + Status s1 = db_impl_->GetLatestSequenceForKey( + sv, record.Key(), false, &seq, 
&found_record_for_key); + if (s1.IsNotFound() || (!found_record_for_key || seq != record.GetSN())) { + del_this = true; + } + db_impl_->ReturnAndCleanupSuperVersion(cfd, sv); + } + + if (del_this) { + gcstats->num_deletes++; + gcstats->deleted_size += record.GetBlobSize(); + if (first_gc) continue; + + Transaction* txn = static_cast(opt_db_.get()) + ->BeginTransaction(write_options_); + txn->Delete(cfh, record.Key()); + Status s1 = txn->Commit(); + // chances that this DELETE will fail is low. If it fails, it would be + // because + // a new version of the key came in at this time, which will override + // the current version being iterated on. + if (s1.IsBusy()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Optimistic transaction failed delete: %s bn: %" PRIu32, + bfptr->PathName().c_str(), gcstats->blob_count); + } else { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Successfully added delete back into LSM: %s bn: %" PRIu32, + bfptr->PathName().c_str(), gcstats->blob_count); + + // assume that failures happen due to new writes. + gcstats->succ_deletes_lsm++; + } + delete txn; + continue; + } else if (first_gc) { + continue; + } + + if (!newfile) { + // new file + std::string reason("GC of "); + reason += bfptr->PathName(); + newfile = NewBlobFile(reason); + gcstats->newfile = newfile; + + new_writer = CheckOrCreateWriterLocked(newfile); + newfile->header_ = std::move(header); + // Can't use header beyond this point + newfile->header_valid_ = true; + newfile->file_size_ = BlobLogHeader::kHeaderSize; + s = new_writer->WriteHeader(newfile->header_); + + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "File: %s - header writing failed", newfile->PathName().c_str()); + return s; + } + + WriteLock wl(&mutex_); + + dir_change_.store(true); + blob_files_.insert(std::make_pair(newfile->BlobFileNumber(), newfile)); + } + + gcstats->num_relocs++; + std::string index_entry; + + uint64_t blob_offset = 0; + uint64_t key_offset = 0; + // write the blob to the blob log. + s = new_writer->AddRecord(record.Key(), record.Blob(), &key_offset, + &blob_offset, record.GetTTL()); + + BlobHandle handle; + handle.set_filenumber(newfile->BlobFileNumber()); + handle.set_size(record.Blob().size()); + handle.set_offset(blob_offset); + handle.set_compression(bdb_options_.compression); + handle.EncodeTo(&index_entry); + + new_writer->AddRecordFooter(record.GetSN()); + newfile->blob_count_++; + newfile->file_size_ += BlobLogRecord::kHeaderSize + record.Key().size() + + record.Blob().size() + BlobLogRecord::kFooterSize; + + Transaction* txn = static_cast(opt_db_.get()) + ->BeginTransaction(write_options_); + txn->Put(cfh, record.Key(), index_entry); + Status s1 = txn->Commit(); + // chances that this Put will fail is low. If it fails, it would be because + // a new version of the key came in at this time, which will override + // the current version being iterated on. 
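+    // An optimistic transaction validates only at Commit(), which reports a
+    // conflict as Status::Busy(). Losing that race here is harmless: a newer
+    // user write supersedes the relocated blob, so the miss is just logged
+    // and the GC pass moves on.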
+    if (s1.IsBusy()) {
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "Optimistic transaction failed: %s put bn: %" PRIu32,
+          bfptr->PathName().c_str(), gcstats->blob_count);
+    } else {
+      gcstats->succ_relocs++;
+      Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+          "Successfully added put back into LSM: %s bn: %" PRIu32,
+          bfptr->PathName().c_str(), gcstats->blob_count);
+    }
+    delete txn;
+  }
+
+  if (gcstats->newfile) total_blob_space_ += newfile->file_size_;
+
+  Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+      "File: %s Num deletes %" PRIu32 " Num relocs: %" PRIu32
+      " Succ Deletes: %" PRIu32 " Succ relocs: %" PRIu32,
+      bfptr->PathName().c_str(), gcstats->num_deletes, gcstats->num_relocs,
+      gcstats->succ_deletes_lsm, gcstats->succ_relocs);
+
+  return s;
+}
+
+// Ideally we should hold the lock during the entire function,
+// but under the assumption that this is only called when a
+// file is Immutable, we can reduce the critical section
+bool BlobDBImpl::ShouldGCFile(std::shared_ptr<BlobFile> bfile, std::time_t tt,
+                              uint64_t last_id, std::string* reason) {
+  if (bfile->HasTTL()) {
+    ttlrange_t ttl_range = bfile->GetTTLRange();
+    if (tt > ttl_range.second) {
+      *reason = "entire file ttl expired";
+      return true;
+    }
+
+    if (!bfile->file_size_.load()) {
+      Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+          "Invalid file size = 0 %s", bfile->PathName().c_str());
+      *reason = "file is empty";
+      return false;
+    }
+
+    if (bfile->gc_once_after_open_.load()) {
+      return true;
+    }
+
+    if (bdb_options_.ttl_range_secs <
+        bdb_options_.partial_expiration_gc_range_secs) {
+      *reason = "has ttl but partial expiration not turned on";
+      return false;
+    }
+
+    ReadLock lockbfile_r(&bfile->mutex_);
+    bool ret = ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) >
+                bdb_options_.partial_expiration_pct);
+    if (ret) {
+      *reason = "deleted blobs beyond threshold";
+    } else {
+      *reason = "deleted blobs below threshold";
+    }
+    return ret;
+  }
+
+  // when crash happens, we lose the in-memory account of deleted blobs.
+ // we are therefore forced to do one GC to make sure delete accounting + // is OK + if (bfile->gc_once_after_open_.load()) { + return true; + } + + ReadLock lockbfile_r(&bfile->mutex_); + + if ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) > + bdb_options_.partial_expiration_pct) { + *reason = "deleted simple blobs beyond threshold"; + return true; + } + + // if we haven't reached limits of disk space, don't DELETE + if (total_blob_space_.load() < bdb_options_.blob_dir_size) { + *reason = "disk space not exceeded"; + return false; + } + + bool ret = bfile->BlobFileNumber() == last_id; + if (ret) { + *reason = "eligible last simple blob file"; + } else { + *reason = "not eligible since not last simple blob file"; + } + return ret; +} + +std::pair BlobDBImpl::DeleteObsFiles(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + { + ReadLock rl(&mutex_); + if (obsolete_files_.empty()) return std::make_pair(true, -1); + } + + std::list> tobsolete; + { + WriteLock wl(&mutex_); + tobsolete.swap(obsolete_files_); + } + + bool file_deleted = false; + for (auto iter = tobsolete.begin(); iter != tobsolete.end();) { + auto bfile = *iter; + { + ReadLock lockbfile_r(&bfile->mutex_); + if (!FileDeleteOk_SnapshotCheckLocked(bfile)) { + ++iter; + continue; + } + } + + Status s = myenv_->DeleteFile(bfile->PathName()); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "File failed to be deleted as obsolete %s", + bfile->PathName().c_str()); + ++iter; + continue; + } + + file_deleted = true; + total_blob_space_ -= bfile->file_size_; + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "File deleted as obsolete from blob dir %s", bfile->PathName().c_str()); + + iter = tobsolete.erase(iter); + } + + // directory change. 
Fsync + if (file_deleted) dir_ent_->Fsync(); + + // put files back into obsolete if for some reason, delete failed + if (!tobsolete.empty()) { + WriteLock wl(&mutex_); + for (auto bfile : tobsolete) obsolete_files_.push_front(bfile); + } + + return std::make_pair(!aborted, -1); +} + +bool BlobDBImpl::CallbackEvictsImpl(std::shared_ptr bfile) { + std::shared_ptr reader = + bfile->OpenSequentialReader(myenv_, db_options_, env_options_); + if (!reader) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "File sequential reader could not be opened for evict callback: %s", + bfile->PathName().c_str()); + return false; + } + + ReadLock lockbfile_r(&bfile->mutex_); + + BlobLogHeader header; + Status s = reader->ReadHeader(&header); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failure to read header for blob-file during evict callback %s", + bfile->PathName().c_str()); + return false; + } + + ColumnFamilyHandle* cfh = bfile->GetColumnFamily(db_); + BlobLogRecord record; + Reader::ReadLevel full = Reader::kReadHdrKeyBlobFooter; + while (reader->ReadRecord(&record, full).ok()) { + bdb_options_.gc_evict_cb_fn(cfh, record.Key(), record.Blob()); + } + + return true; +} + +std::pair BlobDBImpl::RemoveTimerQ(TimerQueue* tq, + bool aborted) { + WriteLock wl(&mutex_); + for (auto itr = cb_threads_.begin(); itr != cb_threads_.end(); ++itr) { + if ((*itr).get() != tq) continue; + + cb_threads_.erase(itr); + break; + } + return std::make_pair(false, -1); +} + +std::pair BlobDBImpl::CallbackEvicts( + TimerQueue* tq, std::shared_ptr bfile, bool aborted) { + if (aborted) return std::make_pair(false, -1); + bool succ = CallbackEvictsImpl(bfile); + if (succ) { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Eviction callbacks completed %s", bfile->PathName().c_str()); + } + + WriteLock wl(&mutex_); + bfile->SetCanBeDeleted(); + obsolete_files_.push_front(bfile); + if (tq) { + // all of the callbacks have been processed + tqueue_.add(0, std::bind(&BlobDBImpl::RemoveTimerQ, this, tq, + std::placeholders::_1)); + } + return std::make_pair(false, -1); +} + +void BlobDBImpl::CopyBlobFiles( + std::vector>* bfiles_copy, uint64_t* last_id) { + ReadLock rl(&mutex_); + + // take a copy + bfiles_copy->reserve(blob_files_.size()); + for (auto const& ent : blob_files_) { + bfiles_copy->push_back(ent.second); + + // A. has ttl is immutable, once set, hence no locks required + // B. blob files are sorted based on number(i.e. index of creation ) + // so we will return the last blob file + if (!ent.second->HasTTL()) *last_id = ent.second->BlobFileNumber(); + } +} + +void BlobDBImpl::FilterSubsetOfFiles( + const std::vector>& blob_files, + std::vector>* to_process, uint64_t epoch, + uint64_t last_id, size_t files_to_collect) { + // 100.0 / 15.0 = 7 + uint64_t next_epoch_increment = static_cast( + std::ceil(100 / static_cast(bdb_options_.gc_file_pct))); + std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); + std::time_t tt = std::chrono::system_clock::to_time_t(now); + + size_t files_processed = 0; + for (auto bfile : blob_files) { + if (files_processed >= files_to_collect) break; + // if this is the first time processing the file + // i.e. gc_epoch == -1, process it. + // else process the file if its processing epoch matches + // the current epoch. 
Typically the #of epochs should be + // around 5-10 + if (bfile->gc_epoch_ != -1 && (uint64_t)bfile->gc_epoch_ != epoch) { + continue; + } + + files_processed++; + // reset the epoch + bfile->gc_epoch_ = epoch + next_epoch_increment; + + // file has already been GC'd or is still open for append, + // then it should not be GC'd + if (bfile->Obsolete() || !bfile->Immutable()) continue; + + std::string reason; + bool shouldgc = ShouldGCFile(bfile, tt, last_id, &reason); + if (!shouldgc) { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "File has been skipped for GC ttl %s %d %d reason='%s'", + bfile->PathName().c_str(), tt, bfile->GetTTLRange().second, + reason.c_str()); + continue; + } + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "File has been chosen for GC ttl %s %d %d reason='%s'", + bfile->PathName().c_str(), tt, bfile->GetTTLRange().second, + reason.c_str()); + to_process->push_back(bfile); + } +} + +std::pair BlobDBImpl::RunGC(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + current_epoch_++; + + // collect the ID of the last regular file, in case we need to GC it. + uint64_t last_id = std::numeric_limits::max(); + + std::vector> blob_files; + CopyBlobFiles(&blob_files, &last_id); + + if (!blob_files.size()) return std::make_pair(true, -1); + + // 15% of files are collected each call to space out the IO and CPU + // consumption. + size_t files_to_collect = + (bdb_options_.gc_file_pct * blob_files.size()) / 100; + + std::vector> to_process; + FilterSubsetOfFiles(blob_files, &to_process, current_epoch_, last_id, + files_to_collect); + + // in this collect the set of files, which became obsolete + std::vector> obsoletes; + for (auto bfile : to_process) { + GCStats gcstats; + Status s = GCFileAndUpdateLSM(bfile, &gcstats); + if (!s.ok()) continue; + + if (bfile->gc_once_after_open_.load()) { + WriteLock lockbfile_w(&bfile->mutex_); + + bfile->deleted_size_ = gcstats.deleted_size; + bfile->deleted_count_ = gcstats.num_deletes; + bfile->gc_once_after_open_ = false; + } else { + obsoletes.push_back(bfile); + } + } + + if (!obsoletes.empty()) { + bool evict_cb = (!!bdb_options_.gc_evict_cb_fn); + std::shared_ptr tq; + if (evict_cb) tq = std::make_shared(); + + // if evict callback is present, first schedule the callback thread + WriteLock wl(&mutex_); + for (auto bfile : obsoletes) { + bool last_file = (bfile == obsoletes.back()); + // remove from global list so writers + blob_files_.erase(bfile->BlobFileNumber()); + + if (!evict_cb) { + bfile->SetCanBeDeleted(); + obsolete_files_.push_front(bfile); + } else { + tq->add(0, std::bind(&BlobDBImpl::CallbackEvicts, this, + (last_file) ? tq.get() : nullptr, bfile, + std::placeholders::_1)); + } + } + if (evict_cb) cb_threads_.emplace_back(tq); + } + + // reschedule + return std::make_pair(true, -1); +} + +Iterator* BlobDBImpl::NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) { + return new BlobDBIterator(db_->NewIterator(opts, column_family), + column_family, this); +} + +} // namespace blob_db +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h new file mode 100644 index 000000000..5b9d1fba7 --- /dev/null +++ b/utilities/blob_db/blob_db_impl.h @@ -0,0 +1,657 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/wal_filter.h" +#include "util/file_reader_writer.h" +#include "util/mpsc.h" +#include "util/mutexlock.h" +#include "util/timer_queue.h" +#include "utilities/blob_db/blob_db.h" +#include "utilities/blob_db/blob_db_options_impl.h" +#include "utilities/blob_db/blob_log_format.h" +#include "utilities/blob_db/blob_log_reader.h" +#include "utilities/blob_db/blob_log_writer.h" + +namespace rocksdb { + +class DBImpl; +class ColumnFamilyHandle; +class ColumnFamilyData; +class OptimisticTransactionDBImpl; +struct FlushJobInfo; + +namespace blob_db { + +class BlobFile; +class BlobDBImpl; +struct GCStats; + +class BlobDBFlushBeginListener : public EventListener { + public: + explicit BlobDBFlushBeginListener() : impl_(nullptr) {} + + void OnFlushBegin(DB* db, const FlushJobInfo& info) override; + + void SetImplPtr(BlobDBImpl* p) { impl_ = p; } + + protected: + BlobDBImpl* impl_; +}; + +// this implements the callback from the WAL which ensures that the +// blob record is present in the blob log. If fsync/fdatasync in not +// happening on every write, there is the probability that keys in the +// blob log can lag the keys in blobs +class BlobReconcileWalFilter : public WalFilter { + public: + virtual WalFilter::WalProcessingOption LogRecordFound( + unsigned long long log_number, const std::string& log_file_name, + const WriteBatch& batch, WriteBatch* new_batch, + bool* batch_changed) override; + + virtual const char* Name() const override { return "BlobDBWalReconciler"; } + + void SetImplPtr(BlobDBImpl* p) { impl_ = p; } + + protected: + BlobDBImpl* impl_; +}; + +class EvictAllVersionsCompactionListener : public EventListener { + public: + class InternalListener : public CompactionEventListener { + friend class BlobDBImpl; + + public: + virtual void OnCompaction(int level, const Slice& key, + CompactionListenerValueType value_type, + const Slice& existing_value, + const SequenceNumber& sn, bool is_new) override; + + void SetImplPtr(BlobDBImpl* p) { impl_ = p; } + + private: + BlobDBImpl* impl_; + }; + + explicit EvictAllVersionsCompactionListener() + : internal_listener_(new InternalListener()) {} + + virtual CompactionEventListener* GetCompactionEventListener() override { + return internal_listener_.get(); + } + + void SetImplPtr(BlobDBImpl* p) { internal_listener_->SetImplPtr(p); } + + private: + std::unique_ptr internal_listener_; +}; + +#if 0 +class EvictAllVersionsFilterFactory : public CompactionFilterFactory { + private: + BlobDBImpl* impl_; + + public: + EvictAllVersionsFilterFactory() : impl_(nullptr) {} + + void SetImplPtr(BlobDBImpl* p) { impl_ = p; } + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override; + + virtual const char* Name() const override { + return "EvictAllVersionsFilterFactory"; + } +}; +#endif + +// Comparator to sort "TTL" aware Blob files based on the lower value of +// TTL range. +struct blobf_compare_ttl { + bool operator()(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) const; +}; + +/** + * The implementation class for BlobDB. This manages the value + * part in TTL aware sequentially written files. 
These files are
+ * Garbage Collected.
+ */
+class BlobDBImpl : public BlobDB {
+  friend class BlobDBFlushBeginListener;
+  friend class EvictAllVersionsCompactionListener;
+  friend class BlobDB;
+  friend class BlobFile;
+  friend class BlobDBIterator;
+
+ public:
+  using rocksdb::StackableDB::Put;
+  Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, const Slice& value) override;
+
+  using rocksdb::StackableDB::Delete;
+  Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
+                const Slice& key) override;
+
+  using rocksdb::StackableDB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& wopts,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) override;
+
+  using rocksdb::StackableDB::Get;
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, std::string* value) override;
+
+  using rocksdb::StackableDB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& opts,
+                                ColumnFamilyHandle* column_family) override;
+
+  using rocksdb::StackableDB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+  using BlobDB::PutWithTTL;
+  Status PutWithTTL(const WriteOptions& options,
+                    ColumnFamilyHandle* column_family, const Slice& key,
+                    const Slice& value, int32_t ttl) override;
+
+  using BlobDB::PutUntil;
+  Status PutUntil(const WriteOptions& options,
+                  ColumnFamilyHandle* column_family, const Slice& key,
+                  const Slice& value_unc, int32_t expiration) override;
+
+  Status LinkToBaseDB(DB* db) override;
+
+  BlobDBImpl(DB* db, const BlobDBOptions& bdb_options);
+
+  BlobDBImpl(const std::string& dbname, const BlobDBOptions& bdb_options,
+             const DBOptions& db_options);
+
+  ~BlobDBImpl();
+
+ private:
+  static bool ExtractTTLFromBlob(const Slice& value, Slice* newval,
+                                 int32_t* ttl_val);
+
+  Status OpenPhase1();
+
+  Status CommonGet(const ColumnFamilyData* cfd, const Slice& key,
+                   const std::string& index_entry, std::string* value);
+
+  // Just before flush starts acting on memtable files,
+  // this handler is called.
+  void OnFlushBeginHandler(DB* db, const FlushJobInfo& info);
+
+  // timer queue callback to close a file by appending a footer;
+  // removes the file from the open files list
+  std::pair<bool, int64_t> CloseSeqWrite(std::shared_ptr<BlobFile> bfile,
+                                         bool aborted);
+
+  // is this file ready for Garbage collection: true if the TTL of the file
+  // has expired or if the eviction threshold of the file has been reached
+  // tt - current time
+  // last_id - the id of the non-TTL file to evict
+  bool ShouldGCFile(std::shared_ptr<BlobFile> bfile, std::time_t tt,
+                    uint64_t last_id, std::string* reason);
+
+  // collect all the blob log files from the blob directory
+  Status GetAllLogFiles(std::set<std::pair<uint64_t, std::string>>* file_nums);
+
+  // appends a task into timer queue to close the file
+  void CloseIf(const std::shared_ptr<BlobFile>& bfile);
+
+  Status AppendBlob(const std::shared_ptr<BlobFile>& bfile,
+                    const std::string& headerbuf, const Slice& key,
+                    const Slice& value, std::string* index_entry);
+
+  Status AppendSN(const std::shared_ptr<BlobFile>& bfile,
+                  const SequenceNumber& sn);
+
+  // find an existing blob log file based on the expiration unix epoch
+  // if such a file does not exist, return nullptr
+  std::shared_ptr<BlobFile> SelectBlobFileTTL(uint32_t expiration);
+
+  // find an existing blob log file to append the value to
+  std::shared_ptr<BlobFile> SelectBlobFile();
+
+  std::shared_ptr<BlobFile> FindBlobFileLocked(uint32_t expiration) const;
+
+  void UpdateWriteOptions(const WriteOptions& options);
+
+  void Shutdown();
+
+  // periodic sanity check. Bunch of checks
+  std::pair<bool, int64_t> SanityCheck(bool aborted);
+
+  // delete files which have been garbage collected and marked
+  // obsolete. Check whether any snapshots exist which refer to
+  // the same
+  std::pair<bool, int64_t> DeleteObsFiles(bool aborted);
+
+  // Major task to garbage collect expired and deleted blobs
+  std::pair<bool, int64_t> RunGC(bool aborted);
+
+  // asynchronous task to fsync/fdatasync the open blob files
+  std::pair<bool, int64_t> FsyncFiles(bool aborted);
+
+  // periodically check if the TTLs of open blob files have expired;
+  // if expired, close the sequential writer and make the file immutable
+  std::pair<bool, int64_t> CheckSeqFiles(bool aborted);
+
+  // if the number of open files approaches the ULIMIT, this
+  // task will close random readers, which are kept around for
+  // efficiency
+  std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
+
+  // periodically print write amplification statistics
+  std::pair<bool, int64_t> WaStats(bool aborted);
+
+  // background task to do book-keeping of deleted keys
+  std::pair<bool, int64_t> EvictDeletions(bool aborted);
+
+  std::pair<bool, int64_t> EvictCompacted(bool aborted);
+
+  bool CallbackEvictsImpl(std::shared_ptr<BlobFile> bfile);
+
+  std::pair<bool, int64_t> RemoveTimerQ(TimerQueue* tq, bool aborted);
+
+  std::pair<bool, int64_t> CallbackEvicts(TimerQueue* tq,
+                                          std::shared_ptr<BlobFile> bfile,
+                                          bool aborted);
+
+  // Adds the background tasks to the timer queue
+  void StartBackgroundTasks();
+
+  // add a new Blob File
+  std::shared_ptr<BlobFile> NewBlobFile(const std::string& reason);
+
+  Status OpenAllFiles();
+
+  // hold write mutex on file and call;
+  // creates a Random Access reader for GET call
+  std::shared_ptr<RandomAccessFileReader> GetOrOpenRandomAccessReader(
+      const std::shared_ptr<BlobFile>& bfile, Env* env,
+      const EnvOptions& env_options);
+
+  // hold write mutex on file and call.
+  // Close the above Random Access reader
+  void CloseRandomAccessLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  // hold write mutex on file and call;
+  // creates a sequential (append) writer for this blobfile
+  Status CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  // returns a Writer object for the file. If writer is not
+  // already present, creates one. Needs write mutex to be held
+  std::shared_ptr<Writer> CheckOrCreateWriterLocked(
+      const std::shared_ptr<BlobFile>& bfile);
+
+  // Iterate through keys and values on Blob and write into
+  // separate file the remaining blobs and delete/update pointers
+  // in LSM atomically
+  Status GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
+                            GCStats* gcstats);
+
+  // checks if there is no snapshot which is referencing the
+  // blobs
+  bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  bool MarkBlobDeleted(const Slice& key, const Slice& lsmValue);
+
+  bool FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
+                             uint64_t blob_offset, uint64_t blob_size);
+
+  void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy,
+                     uint64_t* last_id);
+
+  void FilterSubsetOfFiles(
+      const std::vector<std::shared_ptr<BlobFile>>& blob_files,
+      std::vector<std::shared_ptr<BlobFile>>* to_process, uint64_t epoch,
+      uint64_t last_id, size_t files_to_collect);
+
+ private:
+  // the base DB
+  DBImpl* db_impl_;
+
+  Env* myenv_;
+
+  // Optimistic Transaction DB used during Garbage collection
+  // for atomicity
+  std::unique_ptr<OptimisticTransactionDBImpl> opt_db_;
+
+  // a boolean to capture whether write_options has been set
+  std::atomic<bool> wo_set_;
+  WriteOptions write_options_;
+
+  // the options that govern the behavior of Blob Storage
+  BlobDBOptionsImpl bdb_options_;
+  DBOptions db_options_;
+  EnvOptions env_options_;
+
+  // name of the database directory
+  std::string dbname_;
+
+  // by default this is "blob_dir" under dbname_
+  // but can be configured
+  std::string blob_dir_;
+
+  // pointer to directory
+  std::unique_ptr<Directory> dir_ent_;
+
+  std::atomic<bool> dir_change_;
+
+  // Read Write Mutex, which protects all the data structures
+  // HEAVILY TRAFFICKED
+  port::RWMutex mutex_;
+
+  // counter for blob file number
+  std::atomic<uint64_t> next_file_number_;
+
+  // entire metadata of all the BLOB files, held in memory
+  std::unordered_map<uint64_t, std::shared_ptr<BlobFile>> blob_files_;
+
+  // epoch or version of the open files.
+  std::atomic<uint64_t> epoch_of_;
+
+  // typically we keep 4 open blob files (simple i.e. no TTL)
+  std::vector<std::shared_ptr<BlobFile>> open_simple_files_;
+
+  // all the blob files which are currently being appended to based
+  // on variety of incoming TTL's
+  std::multiset<std::shared_ptr<BlobFile>, blobf_compare_ttl> open_blob_files_;
+
+  // packet of information to put in the lockless delete(s) queue
+  struct delete_packet_t {
+    ColumnFamilyHandle* cfh_;
+    std::string key_;
+    SequenceNumber dsn_;
+  };
+
+  struct override_packet_t {
+    uint64_t file_number_;
+    uint64_t key_size_;
+    uint64_t blob_offset_;
+    uint64_t blob_size_;
+    SequenceNumber dsn_;
+  };
+
+  // LOCKLESS multiple producer single consumer queue to quickly append
+  // deletes without taking lock. Can rapidly grow in size!!
+  // deletes happen in LSM, but minor book-keeping needs to happen on
+  // BLOB side (for triggering eviction)
+  mpsc_queue_t<delete_packet_t> delete_keys_q_;
+
+  // LOCKLESS multiple producer single consumer queue for values
+  // that are being compacted
+  mpsc_queue_t<override_packet_t> override_vals_q_;
+
+  // atomic bool to represent shutdown
+  std::atomic<bool> shutdown_;
+
+  // timer based queue to execute tasks
+  TimerQueue tqueue_;
+
+  // timer queues to call eviction callbacks.
+  std::vector<std::shared_ptr<TimerQueue>> cb_threads_;
+
+  // only accessed in GC thread, hence not atomic. The epoch of the
+  // GC task. Each execution is one epoch. Helps us in allocating
+  // files to one execution
+  uint64_t current_epoch_;
+
+  // number of files opened for random access/GET;
+  // this counter is used to monitor and close excess RA files.
+  std::atomic<uint32_t> open_file_count_;
+
+  // should hold mutex to modify.
+  // STATISTICS for WA of Blob Files due to GC;
+  // by default we keep 24 hourly periods
+  std::list<uint64_t> all_periods_write_;
+  std::list<uint64_t> all_periods_ampl_;
+
+  std::atomic<uint64_t> last_period_write_;
+  std::atomic<uint64_t> last_period_ampl_;
+
+  uint64_t total_periods_write_;
+  uint64_t total_periods_ampl_;
+
+  // total size of all blob files at a given time
+  std::atomic<uint64_t> total_blob_space_;
+  std::list<std::shared_ptr<BlobFile>> obsolete_files_;
+  bool open_p1_done_;
+
+  uint32_t debug_level_;
+};
+
+class BlobFile {
+  friend class BlobDBImpl;
+  friend struct blobf_compare_ttl;
+
+ private:
+  // access to parent
+  const BlobDBImpl* parent_;
+
+  // path to blob directory
+  std::string path_to_dir_;
+
+  // the id of the file.
+  // the above 2 are created during file creation and never changed
+  // after that
+  uint64_t file_number_;
+
+  // number of blobs in the file
+  std::atomic<uint64_t> blob_count_;
+
+  // the file will be selected for GC in this future epoch
+  std::atomic<int64_t> gc_epoch_;
+
+  // size of the file
+  std::atomic<uint64_t> file_size_;
+
+  // number of blobs in this particular file which have been evicted
+  uint64_t deleted_count_;
+
+  // size of deleted blobs (used by heuristic to select file for GC)
+  uint64_t deleted_size_;
+
+  BlobLogHeader header_;
+
+  // closed_ = true implies the file is no longer mutable:
+  // no more blobs will be appended and the footer has been written out
+  std::atomic<bool> closed_;
+
+  // has a pass of garbage collection successfully finished on this file;
+  // can_be_deleted_ still needs to do iterator/snapshot checks
+  std::atomic<bool> can_be_deleted_;
+
+  // should this file be gc'd once to reconcile lost deletes/compactions
+  std::atomic<bool> gc_once_after_open_;
+
+  // et - lt of the blobs
+  ttlrange_t ttl_range_;
+
+  // et - lt of the timestamp of the KV pairs.
+  tsrange_t time_range_;
+
+  // ESN - LSN of the blobs
+  snrange_t sn_range_;
+
+  // Sequential/Append writer for blobs
+  std::shared_ptr<Writer> log_writer_;
+
+  // random access file reader for GET calls
+  std::shared_ptr<RandomAccessFileReader> ra_file_reader_;
+
+  // This Read-Write mutex is per file specific and protects
+  // all the data structures
+  port::RWMutex mutex_;
+
+  // time when the random access reader was last created.
+  std::atomic<std::time_t> last_access_;
+
+  // last time the file was fsync'd/fdatasync'd
+  std::atomic<uint64_t> last_fsync_;
+
+  bool header_valid_;
+
+ public:
+  BlobFile();
+
+  BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum);
+
+  ~BlobFile();
+
+  ColumnFamilyHandle* GetColumnFamily(DB* db);
+
+  // Returns log file's pathname relative to the main db dir
+  // Eg. For a live-log-file = blob_dir/000003.blob
+  std::string PathName() const;
+
+  // Primary identifier for blob file.
+  // once the file is created, this never changes
+  uint64_t BlobFileNumber() const { return file_number_; }
+
+  // the following functions are atomic, and don't need
+  // read lock
+  uint64_t BlobCount() const {
+    return blob_count_.load(std::memory_order_acquire);
+  }
+
+  std::string DumpState() const;
+
+  // if the file has gone through GC and blobs have been relocated
+  bool Obsolete() const { return can_be_deleted_.load(); }
+
+  // if the file is not taking any more appends.
+  bool Immutable() const { return closed_.load(); }
+
+  // we will assume this is atomic
+  bool NeedsFsync(bool hard, uint64_t bytes_per_sync) const;
+
+  uint64_t GetFileSize() const {
+    return file_size_.load(std::memory_order_acquire);
+  }
+
+  // All Get functions which are not atomic, will need ReadLock on the mutex
+  tsrange_t GetTimeRange() const {
+    assert(HasTimestamp());
+    return time_range_;
+  }
+
+  ttlrange_t GetTTLRange() const { return ttl_range_; }
+
+  snrange_t GetSNRange() const { return sn_range_; }
+
+  bool HasTTL() const {
+    assert(header_valid_);
+    return header_.HasTTL();
+  }
+
+  bool HasTimestamp() const {
+    assert(header_valid_);
+    return header_.HasTimestamp();
+  }
+
+  std::shared_ptr<Writer> GetWriter() const { return log_writer_; }
+
+  void Fsync();
+
+ private:
+  std::shared_ptr<Reader> OpenSequentialReader(
+      Env* env, const DBOptions& db_options,
+      const EnvOptions& env_options) const;
+
+  Status ReadFooter(BlobLogFooter* footer);
+
+  Status WriteFooterAndCloseLocked();
+
+  std::shared_ptr<RandomAccessFileReader> GetOrOpenRandomAccessReader(
+      Env* env, const EnvOptions& env_options, bool* fresh_open);
+
+  void CloseRandomAccessLocked();
+
+  // this is used when you are reading only the footer of a
+  // previously closed file
+  Status SetFromFooterLocked(const BlobLogFooter& footer);
+
+  void set_time_range(const tsrange_t& tr) { time_range_ = tr; }
+
+  void set_ttl_range(const ttlrange_t& ttl) { ttl_range_ = ttl; }
+
+  void SetSNRange(const snrange_t& snr) { sn_range_ = snr; }
+
+  // The following functions are atomic, and don't need locks
+  void SetFileSize(uint64_t fs) { file_size_ = fs; }
+
+  void SetBlobCount(uint64_t bc) { blob_count_ = bc; }
+
+  void SetCanBeDeleted() { can_be_deleted_ = true; }
+};
+
+class BlobDBIterator : public Iterator {
+ public:
+  explicit BlobDBIterator(Iterator* iter, ColumnFamilyHandle* column_family,
+                          BlobDBImpl* impl)
+      : iter_(iter), cfh_(column_family), db_impl_(impl) {
+    assert(iter_);
+  }
+
+  ~BlobDBIterator() { delete iter_; }
+
+  bool Valid() const override { return iter_->Valid(); }
+
+  void SeekToFirst() override { iter_->SeekToFirst(); }
+
+  void SeekToLast() override { iter_->SeekToLast(); }
+
+  void Seek(const Slice& target) override { iter_->Seek(target); }
+
+  void SeekForPrev(const Slice& target) override { iter_->SeekForPrev(target); }
+
+  void Next() override { iter_->Next(); }
+
+  void Prev() override { iter_->Prev(); }
+
+  Slice key() const override { return iter_->key(); }
+
+  Slice value() const override;
+
+  Status status() const override { return iter_->status(); }
+
+ private:
+  Iterator* iter_;
+  ColumnFamilyHandle* cfh_;
+  BlobDBImpl* db_impl_;
+  mutable std::string vpart_;
+};
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
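To make the surface declared above concrete, here is a minimal usage sketch of the public interface that BlobDBImpl implements. It is illustrative only: it relies on the Open/PutWithTTL/Get signatures introduced by this change, and the database path and TTL value are made up.

// Illustrative sketch; assumes the BlobDB::Open, PutWithTTL and Get
// signatures declared by this patch. Path and TTL are hypothetical.
#include <string>
#include "utilities/blob_db/blob_db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::blob_db::BlobDBOptions bdb_options;

  rocksdb::blob_db::BlobDB* bdb = nullptr;
  rocksdb::Status s = rocksdb::blob_db::BlobDB::Open(
      options, bdb_options, "/tmp/blobdb_example", &bdb);
  if (!s.ok()) {
    return 1;
  }

  rocksdb::ColumnFamilyHandle* cfh = bdb->DefaultColumnFamily();
  // The value lands in a TTL-bucketed blob file and becomes eligible for
  // GC roughly 60 seconds from now.
  s = bdb->PutWithTTL(rocksdb::WriteOptions(), cfh, "key1", "value1", 60);

  std::string value;
  if (s.ok()) {
    s = bdb->Get(rocksdb::ReadOptions(), cfh, "key1", &value);
  }

  delete bdb;
  return s.ok() ? 0 : 1;
}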
diff --git a/utilities/blob_db/blob_db_options_impl.cc b/utilities/blob_db/blob_db_options_impl.cc
new file mode 100644
index 000000000..fff85a92a
--- /dev/null
+++ b/utilities/blob_db/blob_db_options_impl.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_db_options_impl.h"
+
+namespace rocksdb {
+
+namespace blob_db {
+
+BlobDBOptionsImpl::BlobDBOptionsImpl(const BlobDBOptions& in)
+    : BlobDBOptions(in),
+      deletion_check_period_millisecs(2 * 1000),
+      gc_file_pct(20),
+      gc_check_period_millisecs(60 * 1000),
+      sanity_check_period_millisecs(20 * 60 * 1000),
+      open_files_trigger(100),
+      wa_num_stats_periods(24),
+      wa_stats_period_millisecs(3600 * 1000),
+      partial_expiration_gc_range_secs(4 * 3600),
+      partial_expiration_pct(75),
+      fsync_files_period_millisecs(10 * 1000),
+      reclaim_of_period_millisecs(1 * 1000),
+      delete_obsf_period_millisecs(10 * 1000),
+      check_seqf_period_millisecs(10 * 1000) {}
+
+BlobDBOptionsImpl::BlobDBOptionsImpl()
+    : deletion_check_period_millisecs(2 * 1000),
+      gc_file_pct(20),
+      gc_check_period_millisecs(60 * 1000),
+      sanity_check_period_millisecs(20 * 60 * 1000),
+      open_files_trigger(100),
+      wa_num_stats_periods(24),
+      wa_stats_period_millisecs(3600 * 1000),
+      partial_expiration_gc_range_secs(4 * 3600),
+      partial_expiration_pct(75),
+      fsync_files_period_millisecs(10 * 1000),
+      reclaim_of_period_millisecs(1 * 1000),
+      delete_obsf_period_millisecs(10 * 1000),
+      check_seqf_period_millisecs(10 * 1000) {}
+
+BlobDBOptionsImpl& BlobDBOptionsImpl::operator=(const BlobDBOptionsImpl& in) {
+  if (this != &in) {
+    BlobDBOptions::operator=(in);
+    deletion_check_period_millisecs = in.deletion_check_period_millisecs;
+    gc_file_pct = in.gc_file_pct;
+    gc_check_period_millisecs = in.gc_check_period_millisecs;
+    sanity_check_period_millisecs = in.sanity_check_period_millisecs;
+    open_files_trigger = in.open_files_trigger;
+    wa_num_stats_periods = in.wa_num_stats_periods;
+    wa_stats_period_millisecs = in.wa_stats_period_millisecs;
+    partial_expiration_gc_range_secs = in.partial_expiration_gc_range_secs;
+    partial_expiration_pct = in.partial_expiration_pct;
+    fsync_files_period_millisecs = in.fsync_files_period_millisecs;
+    reclaim_of_period_millisecs = in.reclaim_of_period_millisecs;
+    delete_obsf_period_millisecs = in.delete_obsf_period_millisecs;
+    check_seqf_period_millisecs = in.check_seqf_period_millisecs;
+  }
+  return *this;
+}
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/utilities/blob_db/blob_db_options_impl.h b/utilities/blob_db/blob_db_options_impl.h
new file mode 100644
index 000000000..9cc887ee2
--- /dev/null
+++ b/utilities/blob_db/blob_db_options_impl.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_db.h"
+
+namespace rocksdb {
+
+namespace blob_db {
+
+struct BlobDBOptionsImpl : public BlobDBOptions {
+  // deletions check period
+  uint32_t deletion_check_period_millisecs;
+
+  // gc percentage each check period
+  uint32_t gc_file_pct;
+
+  // gc period
+  uint32_t gc_check_period_millisecs;
+
+  // sanity check task period
+  uint32_t sanity_check_period_millisecs;
+
+  // how many random access open files can we tolerate
+  uint32_t open_files_trigger;
+
+  // how many periods of stats do we keep.
+  uint32_t wa_num_stats_periods;
+
+  // the length of any one period
+  uint32_t wa_stats_period_millisecs;
+
+  // we will garbage collect blob files in which all blobs have
+  // expired. However, if the ttl_range of a file is very large,
+  // say a day, we would have to wait the entire day before we
+  // recover most of the space.
+  uint32_t partial_expiration_gc_range_secs;
+
+  // this should be based on allowed Write Amplification:
+  // if this percentage of the space of a blob file has been
+  // deleted/expired, the file becomes a candidate for GC
+  uint32_t partial_expiration_pct;
+
+  // how often should we schedule a job to fsync open files
+  uint32_t fsync_files_period_millisecs;
+
+  // how often to schedule the reclaim-open-files task
+  uint32_t reclaim_of_period_millisecs;
+
+  // how often to schedule the delete-obsolete-files task
+  uint32_t delete_obsf_period_millisecs;
+
+  // how often to schedule the check-seq-files task
+  uint32_t check_seqf_period_millisecs;
+
+  // default constructor
+  BlobDBOptionsImpl();
+
+  explicit BlobDBOptionsImpl(const BlobDBOptions& in);
+
+  BlobDBOptionsImpl& operator=(const BlobDBOptionsImpl& in);
+};
+
+}  // namespace blob_db
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index 4d26ef0e4..17a5ddca7 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -1,66 +1,567 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
 #ifndef ROCKSDB_LITE
 
 #include "utilities/blob_db/blob_db.h"
+#include <cstdlib>
+#include "db/db_test_util.h"
 #include "util/random.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
+#include "utilities/blob_db/blob_db_options_impl.h"
 
 namespace rocksdb {
+
+namespace blob_db {
+Random s_rnd(301);
+
+void gen_random(char *s, const int len) {
+  static const char alphanum[] =
+      "0123456789"
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+      "abcdefghijklmnopqrstuvwxyz";
+
+  for (int i = 0; i < len; ++i) {
+    s[i] = alphanum[s_rnd.Next() % (sizeof(alphanum) - 1)];
+  }
+
+  s[len] = 0;
+}
+
 class BlobDBTest : public testing::Test {
  public:
-  BlobDBTest() {
+  BlobDBTest() : blobdb_(nullptr) {
     dbname_ = test::TmpDir() + "/blob_db_test";
-    Options options;
-    options.create_if_missing = true;
-    EXPECT_TRUE(NewBlobDB(options, dbname_, &db_).ok());
+    // Reopen1(BlobDBOptionsImpl());
+  }
+
+  ~BlobDBTest() {
+    if (blobdb_) {
+      delete blobdb_;
+      blobdb_ = nullptr;
+    }
   }
 
-  ~BlobDBTest() { delete db_; }
+  void Reopen1(const BlobDBOptionsImpl &bdboptions,
+               const Options &options = Options()) {
+    if (blobdb_) {
+      delete blobdb_;
+      blobdb_ = nullptr;
+    }
+
+    BlobDBOptionsImpl bblobdb_options = bdboptions;
+    Options myoptions = options;
+    BlobDB::DestroyBlobDB(dbname_, myoptions, bblobdb_options);
+
+    DestroyDB(dbname_, myoptions);
+
+    myoptions.create_if_missing = true;
+    EXPECT_TRUE(
+        BlobDB::Open(myoptions, bblobdb_options, dbname_, &blobdb_).ok());
+  }
+
+  void insert_blobs() {
+    WriteOptions wo;
+    ReadOptions ro;
+    std::string value;
+
+    ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+    Random rnd(301);
+    for (size_t i = 0; i < 100000; i++) {
+      int len = rnd.Next() % 16384;
+      if (!len) continue;
+
+      char *val = new char[len + 1];
+      gen_random(val, len);
+
+      std::string key("key");
+      key += std::to_string(i % 500);
+
+      Slice keyslice(key);
+      Slice valslice(val, len + 1);
+
+      int ttl = rnd.Next() % 86400;
+
+      ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, keyslice, valslice, ttl));
+      delete[] val;
+    }
+
+    for (size_t i = 0; i < 10; i++) {
+      std::string key("key");
+      key += std::to_string(i % 500);
+      Slice keyslice(key);
+      blobdb_->Delete(wo, dcfh, keyslice);
+    }
+  }
 
-  DB* db_;
+  BlobDB *blobdb_;
   std::string dbname_;
 };  // class BlobDBTest
 
-TEST_F(BlobDBTest, Basic) {
+TEST_F(BlobDBTest, DeleteComplex) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.partial_expiration_pct = 75;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.blob_file_size = 219 * 1024;
+
+  Reopen1(bdboptions);
+
   WriteOptions wo;
   ReadOptions ro;
   std::string value;
 
-  ASSERT_OK(db_->Put(wo, "foo", "v1"));
-  ASSERT_OK(db_->Put(wo, "bar", "v2"));
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  Random rnd(301);
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    char *val = new char[len + 1];
+    gen_random(val, len);
 
-  ASSERT_OK(db_->Get(ro, "foo", &value));
-  ASSERT_EQ("v1", value);
-  ASSERT_OK(db_->Get(ro, "bar", &value));
-  ASSERT_EQ("v2", value);
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + 1);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  for (size_t i = 0; i < 99; i++) {
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    blobdb_->Delete(wo, dcfh, keyslice);
+  }
+
+  Env::Default()->SleepForMicroseconds(60 * 1000 * 1000);
 }
 
+TEST_F(BlobDBTest, OverrideTest) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.num_concurrent_simple_blobs = 2;
+  bdboptions.blob_file_size = 876 * 1024 * 10;
+
+  Options options;
+  options.write_buffer_size = 256 * 1024;
+  options.info_log_level = INFO_LEVEL;
+
+  Reopen1(bdboptions, options);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+
+  Random rnd(301);
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  for (int i = 0; i < 10000; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    char *val = new char[len + 1];
+    gen_random(val, len);
+
+    std::string key("key");
+    char x[10];
+    std::sprintf(x, "%04d", i);
+    key += std::string(x);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + 1);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  // override all the keys
+  for (int i = 0; i < 10000; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    char *val = new char[len + 1];
+    gen_random(val, len);
+
+    std::string key("key");
+    char x[10];
+    std::sprintf(x, "%04d", i);
+    key += std::string(x);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + 1);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  blobdb_->Flush(FlushOptions());
+
+#if 1
+  blobdb_->GetBaseDB()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  reinterpret_cast<DBImpl*>(blobdb_->GetBaseDB())->TEST_WaitForFlushMemTable();
+  reinterpret_cast<DBImpl*>(blobdb_->GetBaseDB())->TEST_WaitForCompact();
+#endif
+
+  Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, DeleteTest) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.partial_expiration_pct = 18;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.num_concurrent_simple_blobs = 1;
+  bdboptions.blob_file_size = 876 * 1024;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+
+  Random rnd(301);
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    char *val = new char[len + 1];
+    gen_random(val, len);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + 1);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  for (size_t i = 0; i < 100; i += 5) {
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    blobdb_->Delete(wo, dcfh, keyslice);
+  }
+
+  Env::Default()->SleepForMicroseconds(60 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, GCTestWithWrite) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.default_ttl_extractor = true;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  WriteBatch WB;
+
+  Random rnd(301);
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    int ttl = 30;
+
+    char *val = new char[len + BlobDB::kTTLSuffixLength];
+    gen_random(val, len);
+    strncpy(val + len, "ttl:", 4);
+    EncodeFixed32(val + len + 4, ttl);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + BlobDB::kTTLSuffixLength);
+
+    WB.Put(dcfh, keyslice, valslice);
+    delete[] val;
+  }
+
+  ASSERT_OK(blobdb_->Write(wo, &WB));
+
+  Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
+}
+
+void cb_evict(const ColumnFamilyHandle *cfh, const Slice &key,
+              const Slice &val) {
+  fprintf(stderr, "key evicted: %s\n", key.ToString().c_str());
+}
+
+static const char *LONG_STRING =
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFJFJFJTWFNLLFKFFMFMFMFMFMFMFMFMFMFMFMFMFMMF "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJAJFJFJFJFJTWBFNMFLLWMFMFMFMWKWMFMFMFMFMFMFM "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH ";
+
+TEST_F(BlobDBTest, GetWithCompression) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.default_ttl_extractor = true;
+  bdboptions.gc_evict_cb_fn = &cb_evict;
+  bdboptions.compression = CompressionType::kLZ4Compression;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+  Random rnd(301);
+
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  std::string orig(LONG_STRING);
+
+  for (size_t i = 0; i < 10000; i++) {
+    int len = orig.length();
+    int ttl = 3000 * (rnd.Next() % 10);
+
+    char *val = new char[len + BlobDB::kTTLSuffixLength];
+    strncpy(val, LONG_STRING, len);
+    strncpy(val + len, "ttl:", 4);
+    EncodeFixed32(val + len + 4, ttl);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + BlobDB::kTTLSuffixLength);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  for (size_t i = 0; i < 10000; i++) {
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    std::string val;
+    Status s = blobdb_->Get(ro, dcfh, keyslice, &val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(orig == val);
+  }
+
+  Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, GCTestWithPutAndCompression) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.default_ttl_extractor = true;
+  bdboptions.gc_evict_cb_fn = &cb_evict;
+  bdboptions.compression = CompressionType::kLZ4Compression;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+  Random rnd(301);
+
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    int ttl = 30;
+
+    char *val = new char[len + BlobDB::kTTLSuffixLength];
+    gen_random(val, len);
+    strncpy(val + len, "ttl:", 4);
+    EncodeFixed32(val + len + 4, ttl);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + BlobDB::kTTLSuffixLength);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, GCTestWithPut) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.default_ttl_extractor = true;
+  bdboptions.gc_evict_cb_fn = &cb_evict;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+  Random rnd(301);
+
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    int ttl = 30;
+
+    char *val = new char[len + BlobDB::kTTLSuffixLength];
+    gen_random(val, len);
+    strncpy(val + len, "ttl:", 4);
+    EncodeFixed32(val + len + 4, ttl);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + BlobDB::kTTLSuffixLength);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, GCTest) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+  Random rnd(301);
+
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    char *val = new char[len + 1];
+    gen_random(val, len);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + 1);
+
+    int ttl = 30;
+
+    ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, keyslice, valslice, ttl));
+    delete[] val;
+  }
+
+  Env::Default()->SleepForMicroseconds(240 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, DISABLED_MultipleWriters) {
+  BlobDBOptionsImpl bdboptions;
+  Reopen1(bdboptions);
+
+  ASSERT_TRUE(blobdb_ != nullptr);
+
+  std::vector<std::thread> workers;
+  for (size_t ii = 0; ii < 10; ii++)
+    workers.push_back(std::thread(&BlobDBTest::insert_blobs, this));
+
+  for (std::thread &t : workers) {
+    if (t.joinable()) {
+      t.join();
+    }
+  }
+
+  Env::Default()->SleepForMicroseconds(180 * 1000 * 1000);
+  // ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "bar", "v2", 60));
+  // ASSERT_OK(blobdb_->Get(ro, dcfh, "foo", &value));
+  // ASSERT_EQ("v1", value);
+  // ASSERT_OK(blobdb_->Get(ro, dcfh, "bar", &value));
+  // ASSERT_EQ("v2", value);
+}
+
+#if 0
 TEST_F(BlobDBTest, Large) {
+  ASSERT_TRUE(blobdb_ != nullptr);
+
   WriteOptions wo;
   ReadOptions ro;
   std::string value1, value2, value3;
   Random rnd(301);
+  ColumnFamilyHandle* dcfh = blobdb_->DefaultColumnFamily();
 
   value1.assign(8999, '1');
-  ASSERT_OK(db_->Put(wo, "foo", value1));
+  ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "foo", value1, 3600));
   value2.assign(9001, '2');
-  ASSERT_OK(db_->Put(wo, "bar", value2));
+  ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "bar", value2, 3600));
   test::RandomString(&rnd, 13333, &value3);
-  ASSERT_OK(db_->Put(wo, "barfoo", value3));
+  ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "barfoo", value3, 3600));
 
   std::string value;
-  ASSERT_OK(db_->Get(ro, "foo", &value));
+  ASSERT_OK(blobdb_->Get(ro, dcfh, "foo", &value));
   ASSERT_EQ(value1, value);
-  ASSERT_OK(db_->Get(ro, "bar", &value));
+  ASSERT_OK(blobdb_->Get(ro, dcfh, "bar", &value));
   ASSERT_EQ(value2, value);
-  ASSERT_OK(db_->Get(ro, "barfoo", &value));
+  ASSERT_OK(blobdb_->Get(ro, dcfh, "barfoo", &value));
   ASSERT_EQ(value3, value);
 }
+#endif
 
+}  // namespace blob_db
 }  // namespace rocksdb
 
 // A black-box test for the ttl wrapper around rocksdb
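Several of the GC tests above hand-roll the default-TTL-extractor value format: the payload is followed by an 8-byte suffix made of the 4 characters "ttl:" plus a fixed32 TTL, which is what BlobDB::kTTLSuffixLength accounts for. A condensed sketch of that convention, equivalent to the inline strncpy/EncodeFixed32 dance in the tests (the helper name is made up):

// Sketch of the value-with-TTL-suffix convention used by the tests when
// default_ttl_extractor is enabled; assumes BlobDB::kTTLSuffixLength == 8
// ("ttl:" + EncodeFixed32(ttl)).
#include <cstdint>
#include <string>
#include "util/coding.h"

std::string AppendTTLSuffix(const std::string& payload, uint32_t ttl) {
  std::string out(payload);
  out.append("ttl:", 4);           // 4-byte tag
  rocksdb::PutFixed32(&out, ttl);  // 4-byte little-endian TTL
  return out;  // pass the result to Put(); BlobDB strips the suffix on write
}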
diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc
new file mode 100644
index 000000000..5c8d6864f
--- /dev/null
+++ b/utilities/blob_db/blob_file.cc
@@ -0,0 +1,225 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include <chrono>
+#include <cinttypes>
+#include <cstdio>
+#include "utilities/blob_db/blob_db_impl.h"
+
+#include "util/filename.h"
+
+namespace rocksdb {
+
+namespace blob_db {
+
+BlobFile::BlobFile()
+    : parent_(nullptr),
+      file_number_(0),
+      blob_count_(0),
+      gc_epoch_(-1),
+      file_size_(0),
+      deleted_count_(0),
+      deleted_size_(0),
+      closed_(false),
+      can_be_deleted_(false),
+      gc_once_after_open_(false),
+      ttl_range_(std::make_pair(0, 0)),
+      time_range_(std::make_pair(0, 0)),
+      sn_range_(std::make_pair(0, 0)),
+      last_access_(-1),
+      last_fsync_(0),
+      header_valid_(false) {}
+
+BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn)
+    : parent_(p),
+      path_to_dir_(bdir),
+      file_number_(fn),
+      blob_count_(0),
+      gc_epoch_(-1),
+      file_size_(0),
+      deleted_count_(0),
+      deleted_size_(0),
+      closed_(false),
+      can_be_deleted_(false),
+      gc_once_after_open_(false),
+      ttl_range_(std::make_pair(0, 0)),
+      time_range_(std::make_pair(0, 0)),
+      sn_range_(std::make_pair(0, 0)),
+      last_access_(-1),
+      last_fsync_(0),
+      header_valid_(false) {}
+
+BlobFile::~BlobFile() {
+  if (can_be_deleted_) {
+    std::string pn(PathName());
+    Status s = Env::Default()->DeleteFile(PathName());
+    if (!s.ok()) {
+      // Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+      // "File could not be deleted %s", pn.c_str());
+    }
+  }
+}
+
+std::string BlobFile::PathName() const {
+  return BlobFileName(path_to_dir_, file_number_);
+}
+
+std::shared_ptr<Reader> BlobFile::OpenSequentialReader(
+    Env* env, const DBOptions& db_options,
+    const EnvOptions& env_options) const {
+  std::unique_ptr<SequentialFile> sfile;
+  Status s = env->NewSequentialFile(PathName(), &sfile, env_options);
+  if (!s.ok()) {
+    // report something here.
+    return nullptr;
+  }
+
+  std::unique_ptr<SequentialFileReader> sfile_reader;
+  sfile_reader.reset(new SequentialFileReader(std::move(sfile)));
+
+  std::shared_ptr<Reader> log_reader = std::make_shared<Reader>(
+      db_options.info_log, std::move(sfile_reader));
+
+  return log_reader;
+}
+
+std::string BlobFile::DumpState() const {
+  char str[1000];
+  std::snprintf(str, sizeof(str),
+                "path: %s fn: %" PRIu64 " blob_count: %" PRIu64
+                " gc_epoch: %" PRIu64 " file_size: %" PRIu64
+                " deleted_count: %" PRIu64 " deleted_size: %" PRIu64
+                " closed: %d can_be_deleted: %d ttl_range: (%d, %d)"
+                " sn_range: (%" PRIu64 " %" PRIu64 "), writer: %d reader: %d",
+                path_to_dir_.c_str(), file_number_, blob_count_.load(),
+                gc_epoch_.load(), file_size_.load(), deleted_count_,
+                deleted_size_, closed_.load(), can_be_deleted_.load(),
+                ttl_range_.first, ttl_range_.second, sn_range_.first,
+                sn_range_.second, (!!log_writer_), (!!ra_file_reader_));
+  return str;
+}
+
+bool BlobFile::NeedsFsync(bool hard, uint64_t bytes_per_sync) const {
+  assert(last_fsync_ <= file_size_);
+  return (hard) ? file_size_ > last_fsync_
+                : (file_size_ - last_fsync_) >= bytes_per_sync;
+}
+
+Status BlobFile::WriteFooterAndCloseLocked() {
+  Log(InfoLogLevel::INFO_LEVEL, parent_->db_options_.info_log,
+      "File is being closed after footer %s", PathName().c_str());
+
+  BlobLogFooter footer;
+  footer.blob_count_ = blob_count_;
+  if (HasTTL()) footer.set_ttl_range(ttl_range_);
+
+  footer.sn_range_ = sn_range_;
+  if (HasTimestamp()) footer.set_time_range(time_range_);
+
+  // this will close the file and reset the Writable File Pointer.
+  Status s = log_writer_->AppendFooter(footer);
+  if (s.ok()) {
+    closed_ = true;
+    file_size_ += BlobLogFooter::kFooterSize;
+  } else {
+    Log(InfoLogLevel::ERROR_LEVEL, parent_->db_options_.info_log,
+        "Failure to write footer for blob-file %s", PathName().c_str());
+  }
+  // delete the sequential writer
+  log_writer_.reset();
+  return s;
+}
+
+Status BlobFile::ReadFooter(BlobLogFooter* bf) {
+  if (file_size_ < (BlobLogHeader::kHeaderSize + BlobLogFooter::kFooterSize)) {
+    return Status::IOError("File does not have footer", PathName());
+  }
+
+  uint64_t footer_offset = file_size_ - BlobLogFooter::kFooterSize;
+  // assume that ra_file_reader_ is valid before we enter this
+  assert(ra_file_reader_);
+
+  Slice result;
+  char scratch[BlobLogFooter::kFooterSize + 10];
+  Status s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kFooterSize,
+                                   &result, scratch);
+  if (!s.ok()) return s;
+  if (result.size() != BlobLogFooter::kFooterSize) {
+    // should not happen
+    return Status::IOError("EOF reached before footer");
+  }
+
+  s = bf->DecodeFrom(&result);
+  return s;
+}
+
+Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) {
+  if (footer.HasTTL() != header_.HasTTL()) {
+    return Status::Corruption("has_ttl mismatch");
+  }
+  if (footer.HasTimestamp() != header_.HasTimestamp()) {
+    return Status::Corruption("has_ts mismatch");
+  }
+
+  // assume that file has been fully fsync'd
+  last_fsync_.store(file_size_);
+  blob_count_ = footer.GetBlobCount();
+  ttl_range_ = footer.GetTTLRange();
+  time_range_ = footer.GetTimeRange();
+  sn_range_ = footer.GetSNRange();
+  closed_ = true;
+
+  return Status::OK();
+}
+
+void BlobFile::Fsync() {
+  if (log_writer_.get()) {
+    log_writer_->Sync();
+    last_fsync_.store(file_size_.load());
+  }
+}
+
+void BlobFile::CloseRandomAccessLocked() {
+  ra_file_reader_.reset();
+  last_access_ = -1;
+}
+
+std::shared_ptr<RandomAccessFileReader> BlobFile::GetOrOpenRandomAccessReader(
+    Env* env, const EnvOptions& env_options, bool* fresh_open) {
+  *fresh_open = false;
+  last_access_ =
+      std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+  {
+    ReadLock lockbfile_r(&mutex_);
+    if (ra_file_reader_) return ra_file_reader_;
+  }
+
+  WriteLock lockbfile_w(&mutex_);
+  if (ra_file_reader_) return ra_file_reader_;
+
+  std::unique_ptr<RandomAccessFile> rfile;
+  Status s = env->NewRandomAccessFile(PathName(), &rfile, env_options);
+  if (!s.ok()) {
+    Log(InfoLogLevel::ERROR_LEVEL, parent_->db_options_.info_log,
+        "Failed to open blob file for random-read: %s status: '%s'"
+        " exists: '%s'",
+        PathName().c_str(), s.ToString().c_str(),
+        env->FileExists(PathName()).ToString().c_str());
+    return nullptr;
+  }
+
+  ra_file_reader_ = std::make_shared<RandomAccessFileReader>(std::move(rfile));
+  *fresh_open = true;
+  return ra_file_reader_;
+}
+
+ColumnFamilyHandle* BlobFile::GetColumnFamily(DB* db) {
+  return db->DefaultColumnFamily();
+}
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
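ReadFooter above locates the footer at file_size_ - BlobLogFooter::kFooterSize, so the fixed sizes declared in blob_log_format.h (the next file) have to match the encoded layout exactly. As a sanity sketch, the arithmetic can be restated at compile time, mirroring the field order the encoder writes:

// Layout sanity sketch for the on-disk format introduced by this patch.
// Field widths mirror BlobLogFooter::EncodeTo and BlobLogRecord's header.
static_assert(4 /* flags */ + 8 /* blob count */ + 4 + 4 /* ttl range */ +
                      8 + 8 /* sn range */ + 8 + 8 /* ts range */ +
                      4 /* magic */ ==
                  56,  // == BlobLogFooter::kFooterSize
              "blob log footer layout");
static_assert(4 /* key len */ + 8 /* blob len */ + 4 /* ttl */ +
                      8 /* timestamp */ + 1 /* type */ + 1 /* subtype */ +
                      4 /* header crc */ + 4 /* blob crc */ ==
                  34,  // == BlobLogRecord::kHeaderSize
              "blob record header layout");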
diff --git a/utilities/blob_db/blob_log_format.cc b/utilities/blob_db/blob_log_format.cc
new file mode 100644
index 000000000..051e9bb01
--- /dev/null
+++ b/utilities/blob_db/blob_log_format.cc
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_log_format.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace rocksdb {
+namespace blob_db {
+
+const uint32_t kMagicNumber = 2395959;
+const uint32_t kVersion1 = 1;
+const size_t kBlockSize = 32768;
+
+BlobLogHeader::BlobLogHeader()
+    : magic_number_(kMagicNumber), compression_(kNoCompression) {}
+
+BlobLogHeader& BlobLogHeader::operator=(BlobLogHeader&& in) noexcept {
+  if (this != &in) {
+    magic_number_ = in.magic_number_;
+    version_ = in.version_;
+    ttl_guess_ = std::move(in.ttl_guess_);
+    ts_guess_ = std::move(in.ts_guess_);
+    compression_ = in.compression_;
+  }
+  return *this;
+}
+
+BlobLogFooter::BlobLogFooter() : magic_number_(kMagicNumber), blob_count_(0) {}
+
+Status BlobLogFooter::DecodeFrom(Slice* input) {
+  uint32_t val;
+  if (!GetFixed32(input, &val)) {
+    return Status::Corruption("Invalid Blob Footer: flags");
+  }
+
+  bool has_ttl = false;
+  bool has_ts = false;
+  val >>= 8;
+  RecordSubType st = static_cast<RecordSubType>(val);
+  switch (st) {
+    case kRegularType:
+      break;
+    case kTTLType:
+      has_ttl = true;
+      break;
+    case kTimestampType:
+      has_ts = true;
+      break;
+    default:
+      return Status::Corruption("Invalid Blob Footer: flags_val");
+  }
+
+  if (!GetFixed64(input, &blob_count_)) {
+    return Status::Corruption("Invalid Blob Footer: blob_count");
+  }
+
+  ttlrange_t temp_ttl;
+  if (!GetFixed32(input, &temp_ttl.first) ||
+      !GetFixed32(input, &temp_ttl.second)) {
+    return Status::Corruption("Invalid Blob Footer: ttl_range");
+  }
+  if (has_ttl) {
+    ttl_range_.reset(new ttlrange_t(temp_ttl));
+  }
+
+  if (!GetFixed64(input, &sn_range_.first) ||
+      !GetFixed64(input, &sn_range_.second)) {
+    return Status::Corruption("Invalid Blob Footer: sn_range");
+  }
+
+  tsrange_t temp_ts;
+  if (!GetFixed64(input, &temp_ts.first) ||
+      !GetFixed64(input, &temp_ts.second)) {
+    return Status::Corruption("Invalid Blob Footer: ts_range");
+  }
+  if (has_ts) ts_range_.reset(new tsrange_t(temp_ts));
+
+  if (!GetFixed32(input, &magic_number_) || magic_number_ != kMagicNumber) {
+    return Status::Corruption("Invalid Blob Footer: magic");
+  }
+
+  return Status::OK();
+}
+
+void BlobLogFooter::EncodeTo(std::string* dst) const {
+  dst->reserve(kFooterSize);
+
+  RecordType rt = kFullType;
+  RecordSubType st = kRegularType;
+  if (HasTTL()) {
+    st = kTTLType;
+  } else if (HasTimestamp()) {
+    st = kTimestampType;
+  }
+  uint32_t val =
+      static_cast<uint32_t>(rt) | (static_cast<uint32_t>(st) << 8);
+  PutFixed32(dst, val);
+
+  PutFixed64(dst, blob_count_);
+  bool has_ttl = HasTTL();
+  bool has_ts = HasTimestamp();
+
+  if (has_ttl) {
+    PutFixed32(dst, ttl_range_.get()->first);
+    PutFixed32(dst, ttl_range_.get()->second);
+  } else {
+    PutFixed32(dst, 0);
+    PutFixed32(dst, 0);
+  }
+  PutFixed64(dst, sn_range_.first);
+  PutFixed64(dst, sn_range_.second);
+
+  if (has_ts) {
+    PutFixed64(dst, ts_range_.get()->first);
+    PutFixed64(dst, ts_range_.get()->second);
+  } else {
+    PutFixed64(dst, 0);
+    PutFixed64(dst, 0);
+  }
+
+  PutFixed32(dst, magic_number_);
+}
+
+void BlobLogHeader::EncodeTo(std::string* dst) const {
+  dst->reserve(kHeaderSize);
+
+  PutFixed32(dst, magic_number_);
+
+  PutFixed32(dst, version_);
+
+  RecordSubType st = kRegularType;
+  bool has_ttl = HasTTL();
+  bool has_ts = HasTimestamp();
+
+  if (has_ttl) {
+    st = kTTLType;
+  } else if (has_ts) {
+    st = kTimestampType;
+  }
+  uint32_t val = static_cast<uint32_t>(st) |
+                 (static_cast<uint32_t>(compression_) << 8);
+  PutFixed32(dst, val);
+
+  if (has_ttl) {
+    PutFixed32(dst, ttl_guess_.get()->first);
+    PutFixed32(dst, ttl_guess_.get()->second);
+  } else {
+    PutFixed32(dst, 0);
+    PutFixed32(dst, 0);
+  }
+
+  if (has_ts) {
+    PutFixed64(dst, ts_guess_.get()->first);
+    PutFixed64(dst, ts_guess_.get()->second);
+  } else {
+    PutFixed64(dst, 0);
+    PutFixed64(dst, 0);
+  }
+}
+
+Status BlobLogHeader::DecodeFrom(Slice* input) {
+  if (!GetFixed32(input, &magic_number_) || magic_number_ != kMagicNumber) {
+    return Status::Corruption("Invalid Blob Log Header: magic");
+  }
+
+  // as of today, we only support 1 version
+  if (!GetFixed32(input, &version_) || version_ != kVersion1) {
+    return Status::Corruption("Invalid Blob Log Header: version");
+  }
+
+  uint32_t val;
+  if (!GetFixed32(input, &val)) {
+    return Status::Corruption("Invalid Blob Log Header: subtype");
+  }
+
+  bool has_ttl = false;
+  bool has_ts = false;
+  RecordSubType st = static_cast<RecordSubType>(val & 0xff);
+  compression_ = static_cast<CompressionType>((val >> 8) & 0xff);
+  switch (st) {
+    case kRegularType:
+      break;
+    case kTTLType:
+      has_ttl = true;
+      break;
+    case kTimestampType:
+      has_ts = true;
+      break;
+    default:
+      return Status::Corruption("Invalid Blob Log Header: subtype_2");
+  }
+
+  ttlrange_t temp_ttl;
+  if (!GetFixed32(input, &temp_ttl.first) ||
+      !GetFixed32(input, &temp_ttl.second)) {
+    return Status::Corruption("Invalid Blob Log Header: ttl");
+  }
+  if (has_ttl) set_ttl_guess(temp_ttl);
+
+  tsrange_t temp_ts;
+  if (!GetFixed64(input, &temp_ts.first) ||
+      !GetFixed64(input, &temp_ts.second)) {
+    return Status::Corruption("Invalid Blob Log Header: timestamp");
+  }
+  if (has_ts) set_ts_guess(temp_ts);
+
+  return Status::OK();
+}
+
+BlobLogRecord::BlobLogRecord()
+    : checksum_(0),
+      header_cksum_(0),
+      key_size_(0),
+      blob_size_(0),
+      time_val_(0),
+      ttl_val_(0),
+      sn_(0),
+      type_(0),
+      subtype_(0) {}
+
+BlobLogRecord::~BlobLogRecord() {}
+
+void BlobLogRecord::ResizeKeyBuffer(size_t kbs) {
+  if (kbs > key_buffer_.size()) {
+    key_buffer_.resize(kbs);
+  }
+}
+
+void BlobLogRecord::ResizeBlobBuffer(size_t bbs) {
+  if (bbs > blob_buffer_.size()) {
+    blob_buffer_.resize(bbs);
+  }
+}
+
+void BlobLogRecord::Clear() {
+  checksum_ = 0;
+  header_cksum_ = 0;
+  key_size_ = 0;
+  blob_size_ = 0;
+  time_val_ = 0;
+  ttl_val_ = 0;
+  sn_ = 0;
+  type_ = subtype_ = 0;
+  key_.clear();
+  blob_.clear();
+}
+
+Status BlobLogRecord::DecodeHeaderFrom(const Slice& hdrslice) {
+  Slice input = hdrslice;
+  if (input.size() < kHeaderSize) {
+    return Status::Corruption("Invalid Blob Record Header: size");
+  }
+
+  if (!GetFixed32(&input, &key_size_)) {
+    return Status::Corruption("Invalid Blob Record Header: key_size");
+  }
+  if (!GetFixed64(&input, &blob_size_)) {
+    return Status::Corruption("Invalid Blob Record Header: blob_size");
+  }
+  if (!GetFixed32(&input, &ttl_val_)) {
+    return Status::Corruption("Invalid Blob Record Header: ttl_val");
+  }
+  if (!GetFixed64(&input, &time_val_)) {
+    return Status::Corruption("Invalid Blob Record Header: time_val");
+  }
+
+  type_ = *(input.data());
+  input.remove_prefix(1);
+  subtype_ = *(input.data());
+  input.remove_prefix(1);
+
+  if (!GetFixed32(&input, &header_cksum_)) {
+    return Status::Corruption("Invalid Blob Record Header: header_cksum");
+  }
+  if (!GetFixed32(&input, &checksum_)) {
+    return Status::Corruption("Invalid Blob Record Header: checksum");
+  }
+
+  return Status::OK();
+}
+
+Status BlobLogRecord::DecodeFooterFrom(const Slice& footerslice) {
+  Slice input = footerslice;
+  if (input.size() < kFooterSize) {
+    return Status::Corruption("Invalid Blob Record Footer: size");
+  }
+
+  uint32_t f_crc = crc32c::Extend(0, input.data(), 8);
+  f_crc = crc32c::Mask(f_crc);
+
+  if (!GetFixed64(&input, &sn_)) {
+    return Status::Corruption("Invalid Blob Record Footer: sn");
+  }
+
+  if (!GetFixed32(&input, &footer_cksum_)) {
+    return Status::Corruption("Invalid Blob Record Footer: cksum");
+  }
+
+  if (f_crc != footer_cksum_) {
+    return Status::Corruption("Record Checksum mismatch: footer_cksum");
+  }
+
+  return Status::OK();
+}
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/utilities/blob_db/blob_log_format.h b/utilities/blob_db/blob_log_format.h
new file mode 100644
index 000000000..c688ed400
--- /dev/null
+++ b/utilities/blob_db/blob_log_format.h
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Log format information shared by reader and writer.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace rocksdb {
+
+namespace blob_db {
+class BlobFile;
+class BlobDBImpl;
+
+enum RecordType : uint8_t {
+  // Zero is reserved for preallocated files
+  kFullType = 0,
+
+  // For fragments
+  kFirstType = 1,
+  kMiddleType = 2,
+  kLastType = 3,
+  kMaxRecordType = kLastType
+};
+
+enum RecordSubType : uint8_t {
+  kRegularType = 0,
+  kTTLType = 1,
+  kTimestampType = 2,
+};
+
+extern const uint32_t kMagicNumber;
+
+class Reader;
+
+typedef std::pair<uint32_t, uint32_t> ttlrange_t;
+typedef std::pair<uint64_t, uint64_t> tsrange_t;
+typedef std::pair<SequenceNumber, SequenceNumber> snrange_t;
+
+class BlobLogHeader {
+  friend class BlobFile;
+  friend class BlobDBImpl;
+
+ private:
+  uint32_t magic_number_ = 0;
+  uint32_t version_ = 1;
+  CompressionType compression_;
+  std::unique_ptr<ttlrange_t> ttl_guess_;
+  std::unique_ptr<tsrange_t> ts_guess_;
+
+ private:
+  void set_ttl_guess(const ttlrange_t& ttl) {
+    ttl_guess_.reset(new ttlrange_t(ttl));
+  }
+
+  void set_version(uint32_t v) { version_ = v; }
+
+  void set_ts_guess(const tsrange_t& ts) { ts_guess_.reset(new tsrange_t(ts)); }
+
+ public:
+  // magic number + version + flags + ttl guess + timestamp range
+  // = 32
+  static const size_t kHeaderSize = 4 + 4 + 4 + 4 * 2 + 8 * 2;
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* input);
+
+  BlobLogHeader();
+
+  bool HasTTL() const { return !!ttl_guess_; }
+
+  bool HasTimestamp() const { return !!ts_guess_; }
+
+  BlobLogHeader& operator=(BlobLogHeader&& in) noexcept;
+};
+
+// Footer encapsulates the fixed information stored at the tail
+// end of every blob log file.
+class BlobLogFooter {
+  friend class BlobFile;
+
+ public:
+  // Use this constructor when you plan to write out the footer using
+  // EncodeTo(). Never use this constructor with DecodeFrom().
+  BlobLogFooter();
+
+  uint64_t magic_number() const { return magic_number_; }
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* input);
+
+  // convert this object to a human readable form
+  std::string ToString() const;
+
+  // footer size = 4 byte flags
+  //               8 bytes count
+  //               4, 4 - ttl range
+  //               8, 8 - sn range
+  //               8, 8 - ts range
+  //               4 byte magic number
+  //             = 56
+  static const size_t kFooterSize = 4 + 4 + 8 + (4 * 2) + (8 * 2) + (8 * 2);
+
+  bool HasTTL() const { return !!ttl_range_; }
+
+  bool HasTimestamp() const { return !!ts_range_; }
+
+  uint64_t GetBlobCount() const { return blob_count_; }
+
+  ttlrange_t GetTTLRange() const {
+    if (ttl_range_) {
+      return *ttl_range_;
+    }
+    return {0, 0};
+  }
+
+  tsrange_t GetTimeRange() const {
+    if (ts_range_) {
+      return *ts_range_;
+    }
+    return {0, 0};
+  }
+
+  const snrange_t& GetSNRange() const { return sn_range_; }
+
+ private:
+  uint32_t magic_number_ = 0;
+  uint64_t blob_count_ = 0;
+
+  std::unique_ptr<ttlrange_t> ttl_range_;
+  std::unique_ptr<tsrange_t> ts_range_;
+  snrange_t sn_range_;
+
+ private:
+  void set_ttl_range(const ttlrange_t& ttl) {
+    ttl_range_.reset(new ttlrange_t(ttl));
+  }
+  void set_time_range(const tsrange_t& ts) {
+    ts_range_.reset(new tsrange_t(ts));
+  }
+};
+
+extern const size_t kBlockSize;
+
+class BlobLogRecord {
+  friend class Reader;
+
+ private:
+  // this might not be set.
+  uint32_t checksum_;
+  uint32_t header_cksum_;
+  uint32_t key_size_;
+  uint64_t blob_size_;
+  uint64_t time_val_;
+  uint32_t ttl_val_;
+  SequenceNumber sn_;
+  uint32_t footer_cksum_;
+  char type_;
+  char subtype_;
+  Slice key_;
+  Slice blob_;
+  std::string key_buffer_;
+  std::string blob_buffer_;
+
+ private:
+  void Clear();
+
+  char* GetKeyBuffer() { return &(key_buffer_[0]); }
+
+  char* GetBlobBuffer() { return &(blob_buffer_[0]); }
+
+  void ResizeKeyBuffer(size_t kbs);
+
+  void ResizeBlobBuffer(size_t bbs);
+
+ public:
+  // Header is
+  // key length (4 bytes), blob length (8 bytes),
+  // ttl (4 bytes), timestamp (8 bytes),
+  // type (1 byte), subtype (1 byte),
+  // header checksum (4 bytes), blob checksum (4 bytes)
+  // = 34
+  static const size_t kHeaderSize = 4 + 4 + 4 + 8 + 4 + 8 + 1 + 1;
+
+  static const size_t kFooterSize = 8 + 4;
+
+ public:
+  BlobLogRecord();
+
+  ~BlobLogRecord();
+
+  const Slice& Key() const { return key_; }
+
+  const Slice& Blob() const { return blob_; }
+
+  uint32_t GetKeySize() const { return key_size_; }
+
+  uint64_t GetBlobSize() const { return blob_size_; }
+
+  uint32_t GetTTL() const { return ttl_val_; }
+
+  uint64_t GetTimeVal() const { return time_val_; }
+
+  SequenceNumber GetSN() const { return sn_; }
+
+  Status DecodeHeaderFrom(const Slice& hdrslice);
+
+  Status DecodeFooterFrom(const Slice& footerslice);
+};
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
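The Reader introduced in the next two files is the consumer of this format. As a hypothetical sketch of how a full scan of one blob log could drive that API (the Reader construction glue is assumed, and end-of-file surfaces as the IOError statuses seen in blob_log_reader.cc):

// Hypothetical full scan of a single blob log using this patch's Reader.
// Assumes `reader` was already constructed from a SequentialFileReader.
rocksdb::Status DumpBlobLog(rocksdb::blob_db::Reader* reader) {
  rocksdb::blob_db::BlobLogHeader header;
  rocksdb::Status s = reader->ReadHeader(&header);

  rocksdb::blob_db::BlobLogRecord record;
  while (s.ok()) {
    // kReadHdrKeyBlobFooter materializes key and blob and verifies both CRCs.
    s = reader->ReadRecord(&record,
                           rocksdb::blob_db::Reader::kReadHdrKeyBlobFooter);
    if (!s.ok()) {
      break;  // an IOError here is the normal end-of-file signal
    }
    // record.Key() / record.Blob() stay valid until the next ReadRecord.
  }
  return s.IsIOError() ? rocksdb::Status::OK() : s;
}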
+// +#ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_log_reader.h" + +#include +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { +namespace blob_db { + +Reader::Reader(std::shared_ptr info_log, + unique_ptr&& _file) + : info_log_(info_log), file_(std::move(_file)), buffer_(), next_byte_(0) { + backing_store_.resize(kBlockSize); +} + +Reader::~Reader() {} + +Status Reader::ReadHeader(BlobLogHeader* header) { + assert(file_.get() != nullptr); + assert(next_byte_ == 0); + Status status = + file_->Read(BlobLogHeader::kHeaderSize, &buffer_, GetReadBuffer()); + next_byte_ += buffer_.size(); + if (!status.ok()) return status; + + if (buffer_.size() != BlobLogHeader::kHeaderSize) { + return Status::IOError("EOF reached before file header"); + } + + status = header->DecodeFrom(&buffer_); + return status; +} + +Status Reader::ReadRecord(BlobLogRecord* record, ReadLevel level, + WALRecoveryMode wal_recovery_mode) { + record->Clear(); + buffer_.clear(); + backing_store_[0] = '\0'; + + Status status = + file_->Read(BlobLogRecord::kHeaderSize, &buffer_, GetReadBuffer()); + next_byte_ += buffer_.size(); + if (!status.ok()) return status; + if (buffer_.size() != BlobLogRecord::kHeaderSize) { + return Status::IOError("EOF reached before record header"); + } + + status = record->DecodeHeaderFrom(buffer_); + if (!status.ok()) return status; + + uint32_t header_crc = 0; + uint32_t blob_crc = 0; + size_t crc_data_size = BlobLogRecord::kHeaderSize - 2 * sizeof(uint32_t); + header_crc = crc32c::Extend(header_crc, buffer_.data(), crc_data_size); + + uint64_t kb_size = record->GetKeySize() + record->GetBlobSize(); + switch (level) { + case kReadHdrFooter: + file_->Skip(kb_size); + next_byte_ += kb_size; + status = + file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer()); + next_byte_ += buffer_.size(); + if (!status.ok()) return status; + if (buffer_.size() != BlobLogRecord::kFooterSize) { + return Status::IOError("EOF reached before record footer"); + } + + status = record->DecodeFooterFrom(buffer_); + return status; + + case kReadHdrKeyFooter: + record->ResizeKeyBuffer(record->GetKeySize()); + status = file_->Read(record->GetKeySize(), &record->key_, + record->GetKeyBuffer()); + next_byte_ += record->key_.size(); + if (!status.ok()) return status; + if (record->key_.size() != record->GetKeySize()) { + return Status::IOError("EOF reached before key read"); + } + + header_crc = + crc32c::Extend(header_crc, record->key_.data(), record->GetKeySize()); + header_crc = crc32c::Mask(header_crc); + if (header_crc != record->header_cksum_) { + return Status::Corruption("Record Checksum mismatch: header_cksum"); + } + + file_->Skip(record->GetBlobSize()); + next_byte_ += record->GetBlobSize(); + + status = + file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer()); + next_byte_ += buffer_.size(); + if (!status.ok()) return status; + if (buffer_.size() != BlobLogRecord::kFooterSize) { + return Status::IOError("EOF reached during footer read"); + } + + status = record->DecodeFooterFrom(buffer_); + return status; + + case kReadHdrKeyBlobFooter: + record->ResizeKeyBuffer(record->GetKeySize()); + status = file_->Read(record->GetKeySize(), &record->key_, + record->GetKeyBuffer()); + next_byte_ += record->key_.size(); + if (!status.ok()) return status; + if (record->key_.size() != record->GetKeySize()) { + return Status::IOError("EOF reached before key read"); + } + + header_crc = + crc32c::Extend(header_crc, 
record->key_.data(), record->GetKeySize()); + header_crc = crc32c::Mask(header_crc); + if (header_crc != record->header_cksum_) { + return Status::Corruption("Record Checksum mismatch: header_cksum"); + } + + record->ResizeBlobBuffer(record->GetBlobSize()); + status = file_->Read(record->GetBlobSize(), &record->blob_, + record->GetBlobBuffer()); + next_byte_ += record->blob_.size(); + if (!status.ok()) return status; + if (record->blob_.size() != record->GetBlobSize()) { + return Status::IOError("EOF reached during blob read"); + } + + blob_crc = + crc32c::Extend(blob_crc, record->blob_.data(), record->blob_.size()); + blob_crc = crc32c::Mask(blob_crc); + if (blob_crc != record->checksum_) { + return Status::Corruption("Blob Checksum mismatch"); + } + + status = + file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer()); + next_byte_ += buffer_.size(); + if (!status.ok()) return status; + if (buffer_.size() != BlobLogRecord::kFooterSize) { + return Status::IOError("EOF reached during blob footer read"); + } + + status = record->DecodeFooterFrom(buffer_); + return status; + default: + assert(0); + return status; + } +} + +} // namespace blob_db +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_log_reader.h b/utilities/blob_db/blob_log_reader.h new file mode 100644 index 000000000..5805ceb5e --- /dev/null +++ b/utilities/blob_db/blob_log_reader.h @@ -0,0 +1,93 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "utilities/blob_db/blob_log_format.h" + +namespace rocksdb { + +class SequentialFileReader; +class Logger; + +namespace blob_db { + +/** + * Reader is a general purpose log stream reader implementation. The actual job + * of reading from the device is implemented by the SequentialFile interface. + * + * Please see Writer for details on the file and record layout. + */ +class Reader { + public: + enum ReadLevel { + kReadHdrFooter, + kReadHdrKeyFooter, + kReadHdrKeyBlobFooter, + }; + + // Create a reader that will return log records from "*file". + // "*file" must remain live while this Reader is in use. + // + // If "reporter" is non-nullptr, it is notified whenever some data is + // dropped due to a detected corruption. "*reporter" must remain + // live while this Reader is in use. + // + // If "checksum" is true, verify checksums if available. + // + // The Reader will start reading at the first record located at physical + // position >= initial_offset within the file. + Reader(std::shared_ptr info_log, + std::unique_ptr&& file); + + ~Reader(); + + Status ReadHeader(BlobLogHeader* header); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. May use + // "*scratch" as temporary storage. The contents filled in *record + // will only be valid until the next mutating operation on this + // reader or the next mutation to *scratch. 
diff --git a/utilities/blob_db/blob_log_writer.cc b/utilities/blob_db/blob_log_writer.cc
new file mode 100644
index 000000000..295624ddc
--- /dev/null
+++ b/utilities/blob_db/blob_log_writer.cc
@@ -0,0 +1,172 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_log_writer.h"
+
+#include <cstdint>
+#include <limits>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/file_reader_writer.h"
+
+namespace rocksdb {
+namespace blob_db {
+
+Writer::Writer(unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
+               uint64_t bpsync, bool use_fs, uint64_t boffset)
+    : dest_(std::move(dest)),
+      log_number_(log_number),
+      block_offset_(boffset),
+      bytes_per_sync_(bpsync),
+      next_sync_offset_(0),
+      use_fsync_(use_fs),
+      last_elem_type_(kEtNone) {
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
+}
+
+Writer::~Writer() {}
+
+void Writer::Sync() { dest_->Sync(use_fsync_); }
+
+Status Writer::WriteHeader(const BlobLogHeader& header) {
+  assert(block_offset_ == 0);
+  assert(last_elem_type_ == kEtNone);
+  std::string str;
+  header.EncodeTo(&str);
+
+  Status s = dest_->Append(Slice(str));
+  if (s.ok()) {
+    block_offset_ += str.size();
+    s = dest_->Flush();
+  }
+  last_elem_type_ = kEtFileHdr;
+  return s;
+}
+
+Status Writer::AppendFooter(const BlobLogFooter& footer) {
+  assert(block_offset_ != 0);
+  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);
+
+  std::string str;
+  footer.EncodeTo(&str);
+
+  Status s = dest_->Append(Slice(str));
+  if (s.ok()) {
+    block_offset_ += str.size();
+    s = dest_->Close();
+    dest_.reset();
+  }
+
+  last_elem_type_ = kEtFileFooter;
+  return s;
+}
+
+Status Writer::AddRecord(const Slice& key, const Slice& val,
+                         uint64_t* key_offset, uint64_t* blob_offset,
+                         uint32_t ttl) {
+  assert(block_offset_ != 0);
+  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);
+
+  std::string buf;
+  ConstructBlobHeader(&buf, key, val, ttl, -1);
+
+  Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+  return s;
+}
+
+Status Writer::AddRecord(const Slice& key, const Slice& val,
+                         uint64_t* key_offset, uint64_t* blob_offset) {
+  assert(block_offset_ != 0);
+  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);
+
+  std::string buf;
+  ConstructBlobHeader(&buf, key, val, -1, -1);
+
+  Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+  return s;
+}
+
+void Writer::ConstructBlobHeader(std::string* headerbuf, const Slice& key,
+                                 const Slice& val, int32_t ttl, int64_t ts) {
+  headerbuf->reserve(BlobLogRecord::kHeaderSize);
+
+  uint32_t key_size = static_cast<uint32_t>(key.size());
+  PutFixed32(headerbuf, key_size);
+  PutFixed64(headerbuf, val.size());
+
+  uint32_t ttl_write = (ttl != -1) ? static_cast<uint32_t>(ttl)
+                                   : std::numeric_limits<uint32_t>::max();
+  PutFixed32(headerbuf, ttl_write);
+
+  uint64_t ts_write = (ts != -1) ? static_cast<uint64_t>(ts)
+                                 : std::numeric_limits<uint64_t>::max();
+  PutFixed64(headerbuf, ts_write);
+
+  RecordType t = kFullType;
+  headerbuf->push_back(static_cast<char>(t));
+
+  RecordSubType st = kRegularType;
+  if (ttl != -1) st = kTTLType;
+  headerbuf->push_back(static_cast<char>(st));
+
+  uint32_t header_crc = 0;
+  header_crc =
+      crc32c::Extend(header_crc, headerbuf->c_str(), headerbuf->size());
+  header_crc = crc32c::Extend(header_crc, key.data(), key.size());
+  header_crc = crc32c::Mask(header_crc);
+  PutFixed32(headerbuf, header_crc);
+
+  uint32_t crc = 0;
+  // Compute the crc of the record type and the payload.
+  crc = crc32c::Extend(crc, val.data(), val.size());
+  crc = crc32c::Mask(crc);  // Adjust for storage
+  PutFixed32(headerbuf, crc);
+}
+
+Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
+                                  const Slice& key, const Slice& val,
+                                  uint64_t* key_offset,
+                                  uint64_t* blob_offset) {
+  Status s = dest_->Append(Slice(headerbuf));
+  if (s.ok()) {
+    s = dest_->Append(key);
+    if (s.ok()) s = dest_->Append(val);
+  }
+
+  *key_offset = block_offset_ + BlobLogRecord::kHeaderSize;
+  *blob_offset = *key_offset + key.size();
+  block_offset_ = *blob_offset + val.size();
+  last_elem_type_ = kEtRecord;
+  return s;
+}
+
+Status Writer::AddRecordFooter(const SequenceNumber& seq) {
+  assert(last_elem_type_ == kEtRecord);
+
+  std::string buf;
+  PutFixed64(&buf, seq);
+
+  uint32_t footer_crc = crc32c::Extend(0, buf.c_str(), buf.size());
+  footer_crc = crc32c::Mask(footer_crc);
+  PutFixed32(&buf, footer_crc);
+
+  Status s = dest_->Append(Slice(buf));
+  block_offset_ += BlobLogRecord::kFooterSize;
+
+  if (s.ok()) dest_->Flush();
+
+  last_elem_type_ = kEtFooter;
+  return s;
+}
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
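A minimal sketch of the intended call sequence, matching the asserts on
last_elem_type_ above. The helper name is hypothetical; it assumes
WriteHeader() has already been called on a fresh file and that the caller
checks every Status.

    // File header first, then (record, record footer) pairs, and finally
    // AppendFooter(), which also closes the file.
    rocksdb::Status WriteOneBlob(rocksdb::blob_db::Writer* writer,
                                 const rocksdb::Slice& key,
                                 const rocksdb::Slice& blob,
                                 rocksdb::SequenceNumber seq) {
      uint64_t key_offset = 0;
      uint64_t blob_offset = 0;
      rocksdb::Status s =
          writer->AddRecord(key, blob, &key_offset, &blob_offset);
      if (s.ok()) {
        // Every AddRecord() must be followed by AddRecordFooter(), which
        // appends the sequence number plus its CRC after the payload.
        s = writer->AddRecordFooter(seq);
      }
      return s;
    }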
diff --git a/utilities/blob_db/blob_log_writer.h b/utilities/blob_db/blob_log_writer.h
new file mode 100644
index 000000000..4443c4eeb
--- /dev/null
+++ b/utilities/blob_db/blob_log_writer.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "utilities/blob_db/blob_log_format.h"
+
+namespace rocksdb {
+
+class WritableFileWriter;
+
+namespace blob_db {
+
+/**
+ * Writer is the blob log stream writer. It provides an append-only
+ * abstraction for writing blob data.
+ *
+ * Look at blob_log_format.h to see the details of the record formats.
+ */
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
+                  uint64_t log_number, uint64_t bpsync, bool use_fsync,
+                  uint64_t boffset = 0);
+  ~Writer();
+
+  static void ConstructBlobHeader(std::string* headerbuf, const Slice& key,
+                                  const Slice& val, int32_t ttl, int64_t ts);
+
+  Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
+                   uint64_t* blob_offset);
+
+  Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
+                   uint64_t* blob_offset, uint32_t ttl);
+
+  Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key,
+                            const Slice& val, uint64_t* key_offset,
+                            uint64_t* blob_offset);
+
+  Status AddRecordFooter(const SequenceNumber& sn);
+
+  Status AppendFooter(const BlobLogFooter& footer);
+
+  Status WriteHeader(const BlobLogHeader& header);
+
+  WritableFileWriter* file() { return dest_.get(); }
+
+  const WritableFileWriter* file() const { return dest_.get(); }
+
+  uint64_t get_log_number() const { return log_number_; }
+
+  bool ShouldSync() const { return block_offset_ > next_sync_offset_; }
+
+  void Sync();
+
+  void ResetSyncPointer() { next_sync_offset_ += bytes_per_sync_; }
+
+ private:
+  std::unique_ptr<WritableFileWriter> dest_;
+  uint64_t log_number_;
+  uint64_t block_offset_;  // Current offset in block
+  uint64_t bytes_per_sync_;
+  uint64_t next_sync_offset_;
+  bool use_fsync_;
+
+  // crc32c values for all supported record types. These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  // No copying allowed
+  Writer(const Writer&) = delete;
+  Writer& operator=(const Writer&) = delete;
+
+ public:
+  enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFooter, kEtFileFooter };
+  ElemType last_elem_type_;
+};
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
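For reference while reviewing the offset arithmetic in EmitPhysicalRecord(),
the record header that ConstructBlobHeader() emits lays out as follows.
The field widths are read off the PutFixed calls above; the 34-byte total
and the static_assert are a derivation, not something stated by the patch,
so treat them as an assumption to verify against blob_log_format.h.

    // key size        fixed32   4 bytes
    // blob size       fixed64   8 bytes
    // ttl             fixed32   4 bytes  (UINT32_MAX when no TTL is set)
    // timestamp       fixed64   8 bytes  (UINT64_MAX when unset)
    // record type     char      1 byte
    // record subtype  char      1 byte
    // header crc      fixed32   4 bytes  (covers the fields above + key)
    // blob crc        fixed32   4 bytes  (covers the blob bytes only)
    //
    // The reader's "kHeaderSize - 2 * sizeof(uint32_t)" CRC span is then
    // exactly the 26 bytes preceding the two CRC fields.
    #include "utilities/blob_db/blob_log_format.h"
    static_assert(rocksdb::blob_db::BlobLogRecord::kHeaderSize ==
                      4 + 8 + 4 + 8 + 1 + 1 + 4 + 4,
                  "blob record header fields should sum to kHeaderSize");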
diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h
index 51d2950ed..5721e499e 100644
--- a/utilities/transactions/optimistic_transaction_db_impl.h
+++ b/utilities/transactions/optimistic_transaction_db_impl.h
@@ -16,10 +16,14 @@ namespace rocksdb {
 class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
  public:
-  explicit OptimisticTransactionDBImpl(DB* db)
-      : OptimisticTransactionDB(db), db_(db) {}
+  explicit OptimisticTransactionDBImpl(DB* db, bool take_ownership = true)
+      : OptimisticTransactionDB(db), db_(db), db_owner_(take_ownership) {}
 
-  ~OptimisticTransactionDBImpl() {}
+  ~OptimisticTransactionDBImpl() {
+    if (!db_owner_) {
+      db_.release();
+    }
+  }
 
   Transaction* BeginTransaction(const WriteOptions& write_options,
                                 const OptimisticTransactionOptions& txn_options,
@@ -29,6 +33,7 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
 
  private:
   std::unique_ptr<DB> db_;
+  bool db_owner_;
 
   void ReinitializeTransaction(Transaction* txn,
                                const WriteOptions& write_options,