From da7990950787257cb312ca562ce5977749afc3e9 Mon Sep 17 00:00:00 2001 From: "dgrogan@chromium.org" Date: Sat, 21 May 2011 02:17:43 +0000 Subject: [PATCH] sync with upstream @ 21409451 Check the NEWS file for details of what changed. git-svn-id: https://leveldb.googlecode.com/svn/trunk@28 62dab493-f737-651d-591e-8d6aee1b9529 --- NEWS | 17 ++ db/db_bench.cc | 66 +++++-- db/db_impl.cc | 64 +++++-- db/db_test.cc | 109 ++++++++--- db/dbformat.h | 10 ++ db/log_reader.cc | 116 ++++++++++-- db/log_reader.h | 37 +++- db/log_test.cc | 143 ++++++++++++++- db/log_writer.cc | 3 +- db/memtable.cc | 14 +- db/memtable.h | 18 +- db/repair.cc | 12 +- db/snapshot.h | 22 +-- db/version_set.cc | 309 ++++++++++++++++++-------------- db/version_set.h | 38 ++-- db/write_batch.cc | 136 +++++++------- db/write_batch_internal.h | 24 --- db/write_batch_test.cc | 8 +- doc/impl.html | 26 ++- doc/index.html | 16 +- include/leveldb/comparator.h | 4 +- include/leveldb/db.h | 20 ++- include/leveldb/env.h | 12 ++ include/leveldb/iterator.h | 5 + include/leveldb/slice.h | 5 + include/leveldb/status.h | 36 ++-- include/leveldb/table.h | 3 +- include/leveldb/table_builder.h | 5 + include/leveldb/write_batch.h | 15 ++ table/block_builder.cc | 2 +- table/table_test.cc | 14 +- util/env_chromium.cc | 7 + util/env_posix.cc | 7 + util/status.cc | 36 ++-- 34 files changed, 953 insertions(+), 406 deletions(-) create mode 100644 NEWS diff --git a/NEWS b/NEWS new file mode 100644 index 000000000..3fd99242d --- /dev/null +++ b/NEWS @@ -0,0 +1,17 @@ +Release 1.2 2011-05-16 +---------------------- + +Fixes for larger databases (tested up to one billion 100-byte entries, +i.e., ~100GB). + +(1) Place hard limit on number of level-0 files. This fixes errors +of the form "too many open files". + +(2) Fixed memtable management. Before the fix, a heavy write burst +could cause unbounded memory usage. + +A fix for a logging bug where the reader would incorrectly complain +about corruption. 
+ +Allow public access to WriteBatch contents so that users can easily +wrap a DB. diff --git a/db/db_bench.cc b/db/db_bench.cc index d1cbdc01c..b5fd679da 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -24,9 +24,10 @@ // overwrite -- overwrite N values in random key order in async mode // fillsync -- write N/100 values in random key order in sync mode // fill100K -- write N/1000 100K values in random order in async mode -// readseq -- read N values sequentially -// readreverse -- read N values in reverse order -// readrandom -- read N values in random order +// readseq -- read N times sequentially +// readreverse -- read N times in reverse order +// readrandom -- read N times in random order +// readhot -- read N times in random order from 1% section of DB // crc32c -- repeated crc32c of 4K of data // Meta operations: // compact -- Compact the entire DB @@ -54,6 +55,9 @@ static const char* FLAGS_benchmarks = // Number of key/values to place in database static int FLAGS_num = 1000000; +// Number of read operations to do. If negative, do FLAGS_num reads. +static int FLAGS_reads = -1; + // Size of each value static int FLAGS_value_size = 100; @@ -72,6 +76,14 @@ static int FLAGS_write_buffer_size = 0; // Negative means use default settings. static int FLAGS_cache_size = -1; +// Maximum number of files to keep open at the same time (use default if == 0) +static int FLAGS_open_files = 0; + +// If true, do not destroy the existing database. If you set this +// flag and also specify a benchmark that wants a fresh database, that +// benchmark will fail. +static bool FLAGS_use_existing_db = false; + namespace leveldb { // Helper for quickly generating random data. @@ -126,6 +138,7 @@ class Benchmark { Cache* cache_; DB* db_; int num_; + int reads_; int heap_counter_; double start_; double last_op_finish_; @@ -298,6 +311,7 @@ class Benchmark { : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), db_(NULL), num_(FLAGS_num), + reads_(FLAGS_reads < 0 ? 
FLAGS_num : FLAGS_reads), heap_counter_(0), bytes_(0), rand_(301) { @@ -308,7 +322,9 @@ class Benchmark { Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]); } } - DestroyDB("/tmp/dbbench", Options()); + if (!FLAGS_use_existing_db) { + DestroyDB("/tmp/dbbench", Options()); + } } ~Benchmark() { @@ -355,11 +371,13 @@ class Benchmark { ReadReverse(); } else if (name == Slice("readrandom")) { ReadRandom(); + } else if (name == Slice("readhot")) { + ReadHot(); } else if (name == Slice("readrandomsmall")) { - int n = num_; - num_ /= 1000; + int n = reads_; + reads_ /= 1000; ReadRandom(); - num_ = n; + reads_ = n; } else if (name == Slice("compact")) { Compact(); } else if (name == Slice("crc32c")) { @@ -449,7 +467,7 @@ class Benchmark { void Open() { assert(db_ == NULL); Options options; - options.create_if_missing = true; + options.create_if_missing = !FLAGS_use_existing_db; options.block_cache = cache_; options.write_buffer_size = FLAGS_write_buffer_size; Status s = DB::Open(options, "/tmp/dbbench", &db_); @@ -462,6 +480,10 @@ class Benchmark { void Write(const WriteOptions& options, Order order, DBState state, int num_entries, int value_size, int entries_per_batch) { if (state == FRESH) { + if (FLAGS_use_existing_db) { + message_ = "skipping (--use_existing_db is true)"; + return; + } delete db_; db_ = NULL; DestroyDB("/tmp/dbbench", Options()); @@ -499,7 +521,7 @@ class Benchmark { void ReadSequential() { Iterator* iter = db_->NewIterator(ReadOptions()); int i = 0; - for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) { + for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { bytes_ += iter->key().size() + iter->value().size(); FinishedSingleOp(); ++i; @@ -510,7 +532,7 @@ class Benchmark { void ReadReverse() { Iterator* iter = db_->NewIterator(ReadOptions()); int i = 0; - for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) { + for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { bytes_ += 
iter->key().size() + iter->value().size(); FinishedSingleOp(); ++i; @@ -521,7 +543,7 @@ class Benchmark { void ReadRandom() { ReadOptions options; std::string value; - for (int i = 0; i < num_; i++) { + for (int i = 0; i < reads_; i++) { char key[100]; const int k = rand_.Next() % FLAGS_num; snprintf(key, sizeof(key), "%016d", k); @@ -530,6 +552,19 @@ class Benchmark { } } + void ReadHot() { + ReadOptions options; + std::string value; + const int range = (FLAGS_num + 99) / 100; + for (int i = 0; i < reads_; i++) { + char key[100]; + const int k = rand_.Next() % range; + snprintf(key, sizeof(key), "%016d", k); + db_->Get(options, key, &value); + FinishedSingleOp(); + } + } + void Compact() { DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); @@ -582,6 +617,8 @@ class Benchmark { int main(int argc, char** argv) { FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; + FLAGS_open_files = leveldb::Options().max_open_files; + for (int i = 1; i < argc; i++) { double d; int n; @@ -593,14 +630,21 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_histogram = n; + } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_use_existing_db = n; } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { FLAGS_num = n; + } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) { + FLAGS_reads = n; } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { FLAGS_value_size = n; } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { FLAGS_write_buffer_size = n; } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { FLAGS_cache_size = n; + } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) { + FLAGS_open_files = n; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/db/db_impl.cc b/db/db_impl.cc index 3b9e04e83..baf9299f6 100644 --- a/db/db_impl.cc 
+++ b/db/db_impl.cc @@ -126,6 +126,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) log_(NULL), bg_compaction_scheduled_(false), compacting_(false) { + mem_->Ref(); has_imm_.Release_Store(NULL); // Reserve ten files or so for other uses and give the rest to TableCache. @@ -152,8 +153,8 @@ DBImpl::~DBImpl() { } delete versions_; - delete mem_; - delete imm_; + if (mem_ != NULL) mem_->Unref(); + if (imm_ != NULL) imm_->Unref(); delete log_; delete logfile_; delete table_cache_; @@ -344,7 +345,8 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, // paranoid_checks==false so that corruptions cause entire commits // to be skipped instead of propagating bad information (like overly // large sequence numbers). - log::Reader reader(file, &reporter, true/*checksum*/); + log::Reader reader(file, &reporter, true/*checksum*/, + 0/*initial_offset*/); Log(env_, options_.info_log, "Recovering log #%llu", (unsigned long long) log_number); @@ -364,6 +366,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, if (mem == NULL) { mem = new MemTable(internal_comparator_); + mem->Ref(); } status = WriteBatchInternal::InsertInto(&batch, mem); MaybeIgnoreError(&status); @@ -384,7 +387,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, // file-systems cause the DB::Open() to fail. break; } - delete mem; + mem->Unref(); mem = NULL; } } @@ -395,7 +398,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, // file-systems cause the DB::Open() to fail. 
} - delete mem; + if (mem != NULL) mem->Unref(); delete file; return status; } @@ -443,11 +446,12 @@ Status DBImpl::CompactMemTable() { // Replace immutable memtable with the generated Table if (s.ok()) { edit.SetPrevLogNumber(0); - s = versions_->LogAndApply(&edit, imm_); + s = versions_->LogAndApply(&edit); } if (s.ok()) { // Commit to the new state + imm_->Unref(); imm_ = NULL; has_imm_.Release_Store(NULL); DeleteObsoleteFiles(); @@ -556,7 +560,7 @@ void DBImpl::BackgroundCompaction() { c->edit()->DeleteFile(c->level(), f->number); c->edit()->AddFile(c->level() + 1, f->number, f->file_size, f->smallest, f->largest); - status = versions_->LogAndApply(c->edit(), NULL); + status = versions_->LogAndApply(c->edit()); Log(env_, options_.info_log, "Moved #%lld to level-%d %lld bytes %s\n", static_cast(f->number), c->level() + 1, @@ -697,7 +701,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { } compact->outputs.clear(); - Status s = versions_->LogAndApply(compact->compaction->edit(), NULL); + Status s = versions_->LogAndApply(compact->compaction->edit()); if (s.ok()) { compact->compaction->ReleaseInputs(); DeleteObsoleteFiles(); @@ -754,9 +758,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } Slice key = input->key(); - InternalKey tmp_internal_key; - tmp_internal_key.DecodeFrom(key); - if (compact->compaction->ShouldStopBefore(tmp_internal_key) && + if (compact->compaction->ShouldStopBefore(key) && compact->builder != NULL) { status = FinishCompactionOutputFile(compact, input); if (!status.ok()) { @@ -867,6 +869,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } compacting_ = false; compacting_cv_.SignalAll(); + VersionSet::LevelSummaryStorage tmp; + Log(env_, options_.info_log, + "compacted to: %s", versions_->LevelSummary(&tmp)); return status; } @@ -925,10 +930,11 @@ Status DBImpl::Get(const ReadOptions& options, Iterator* DBImpl::NewIterator(const ReadOptions& options) { SequenceNumber latest_snapshot; 
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); - SequenceNumber sequence = - (options.snapshot ? options.snapshot->number_ : latest_snapshot); - return NewDBIterator(&dbname_, env_, - user_comparator(), internal_iter, sequence); + return NewDBIterator( + &dbname_, env_, user_comparator(), internal_iter, + (options.snapshot != NULL + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot)); } void DBImpl::Unref(void* arg1, void* arg2) { @@ -945,7 +951,7 @@ const Snapshot* DBImpl::GetSnapshot() { void DBImpl::ReleaseSnapshot(const Snapshot* s) { MutexLock l(&mutex_); - snapshots_.Delete(s); + snapshots_.Delete(reinterpret_cast(s)); } // Convenience methods @@ -985,12 +991,26 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { Status DBImpl::MakeRoomForWrite(bool force) { mutex_.AssertHeld(); + bool allow_delay = !force; Status s; while (true) { if (!bg_error_.ok()) { // Yield previous error s = bg_error_; break; + } else if ( + allow_delay && + versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) { + // We are getting close to hitting a hard limit on the number of + // L0 files. Rather than delaying a single write by several + // seconds when we hit the hard limit, start delaying each + // individual write by 1ms to reduce latency variance. Also, + // this delay hands over some CPU to the compaction thread in + // case it is sharing the same core as the writer. + mutex_.Unlock(); + env_->SleepForMicroseconds(1000); + allow_delay = false; // Do not delay a single write more than once + mutex_.Lock(); } else if (!force && (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { // There is room in current memtable @@ -999,6 +1019,9 @@ Status DBImpl::MakeRoomForWrite(bool force) { // We have filled up the current memtable, but the previous // one is still being compacted, so we wait. 
compacting_cv_.Wait(); + } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) { + // There are too many level-0 files. + compacting_cv_.Wait(); } else { // Attempt to switch to a new memtable and trigger compaction of old assert(versions_->PrevLogNumber() == 0); @@ -1011,7 +1034,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { VersionEdit edit; edit.SetPrevLogNumber(versions_->LogNumber()); edit.SetLogNumber(new_log_number); - s = versions_->LogAndApply(&edit, NULL); + s = versions_->LogAndApply(&edit); if (!s.ok()) { delete lfile; env_->DeleteFile(LogFileName(dbname_, new_log_number)); @@ -1024,6 +1047,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { imm_ = mem_; has_imm_.Release_Store(imm_); mem_ = new MemTable(internal_comparator_); + mem_->Ref(); force = false; // Do not force another compaction if have room MaybeScheduleCompaction(); } @@ -1141,10 +1165,11 @@ Status DB::Open(const Options& options, const std::string& dbname, edit.SetLogNumber(new_log_number); impl->logfile_ = lfile; impl->log_ = new log::Writer(lfile); - s = impl->versions_->LogAndApply(&edit, NULL); + s = impl->versions_->LogAndApply(&edit); } if (s.ok()) { impl->DeleteObsoleteFiles(); + impl->MaybeScheduleCompaction(); } } impl->mutex_.Unlock(); @@ -1156,6 +1181,9 @@ Status DB::Open(const Options& options, const std::string& dbname, return s; } +Snapshot::~Snapshot() { +} + Status DestroyDB(const std::string& dbname, const Options& options) { Env* env = options.env; std::vector filenames; diff --git a/db/db_test.cc b/db/db_test.cc index f828e3dce..06565b27a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3,7 +3,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "leveldb/db.h" - #include "db/db_impl.h" #include "db/filename.h" #include "db/version_set.h" @@ -802,8 +801,17 @@ TEST(DBTest, DBOpen_Options) { db = NULL; } +namespace { +typedef std::map KVMap; +} + class ModelDB: public DB { public: + class ModelSnapshot : public Snapshot { + public: + KVMap map_; + }; + explicit ModelDB(const Options& options): options_(options) { } ~ModelDB() { } virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { @@ -824,35 +832,34 @@ class ModelDB: public DB { return new ModelIter(saved, true); } else { const KVMap* snapshot_state = - reinterpret_cast(options.snapshot->number_); + &(reinterpret_cast(options.snapshot)->map_); return new ModelIter(snapshot_state, false); } } virtual const Snapshot* GetSnapshot() { - KVMap* saved = new KVMap; - *saved = map_; - return snapshots_.New( - reinterpret_cast(saved)); + ModelSnapshot* snapshot = new ModelSnapshot; + snapshot->map_ = map_; + return snapshot; } virtual void ReleaseSnapshot(const Snapshot* snapshot) { - const KVMap* saved = reinterpret_cast(snapshot->number_); - delete saved; - snapshots_.Delete(snapshot); + delete reinterpret_cast(snapshot); } virtual Status Write(const WriteOptions& options, WriteBatch* batch) { assert(options.post_write_snapshot == NULL); // Not supported - for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeValue: - map_[it.key().ToString()] = it.value().ToString(); - break; - case kTypeDeletion: - map_.erase(it.key().ToString()); - break; + class Handler : public WriteBatch::Handler { + public: + KVMap* map_; + virtual void Put(const Slice& key, const Slice& value) { + (*map_)[key.ToString()] = value.ToString(); } - } - return Status::OK(); + virtual void Delete(const Slice& key) { + map_->erase(key.ToString()); + } + }; + Handler handler; + handler.map_ = &map_; + return batch->Iterate(&handler); } virtual bool GetProperty(const Slice& property, std::string* value) { @@ -864,7 
+871,6 @@ class ModelDB: public DB { } } private: - typedef std::map KVMap; class ModelIter: public Iterator { public: ModelIter(const KVMap* map, bool owned) @@ -897,7 +903,6 @@ class ModelDB: public DB { }; const Options options_; KVMap map_; - SnapshotList snapshots_; }; static std::string RandomKey(Random* rnd) { @@ -1023,8 +1028,70 @@ TEST(DBTest, Randomized) { if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); } +std::string MakeKey(unsigned int num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%016u", num); + return std::string(buf); +} + +void BM_LogAndApply(int iters, int num_base_files) { + std::string dbname = test::TmpDir() + "/leveldb_test_benchmark"; + DestroyDB(dbname, Options()); + + DB* db = NULL; + Options opts; + opts.create_if_missing = true; + Status s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != NULL); + + delete db; + db = NULL; + + Env* env = Env::Default(); + + InternalKeyComparator cmp(BytewiseComparator()); + Options options; + VersionSet vset(dbname, &options, NULL, &cmp); + ASSERT_OK(vset.Recover()); + VersionEdit vbase; + uint64_t fnum = 1; + for (int i = 0; i < num_base_files; i++) { + InternalKey start(MakeKey(2*fnum), 1, kTypeValue); + InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); + vbase.AddFile(2, fnum++, 1 /* file size */, start, limit); + } + ASSERT_OK(vset.LogAndApply(&vbase)); + + uint64_t start_micros = env->NowMicros(); + + for (int i = 0; i < iters; i++) { + VersionEdit vedit; + vedit.DeleteFile(2, fnum); + InternalKey start(MakeKey(2*fnum), 1, kTypeValue); + InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); + vedit.AddFile(2, fnum++, 1 /* file size */, start, limit); + vset.LogAndApply(&vedit); + } + uint64_t stop_micros = env->NowMicros(); + unsigned int us = stop_micros - start_micros; + char buf[16]; + snprintf(buf, sizeof(buf), "%d", num_base_files); + fprintf(stderr, + "BM_LogAndApply/%-6s %8d iters : %9u us (%7.0f us / iter)\n", + buf, iters, us, ((float)us) / iters); +} + } 
int main(int argc, char** argv) { + if (argc > 1 && std::string(argv[1]) == "--benchmark") { + leveldb::BM_LogAndApply(1000, 1); + leveldb::BM_LogAndApply(1000, 100); + leveldb::BM_LogAndApply(1000, 10000); + leveldb::BM_LogAndApply(100, 100000); + return 0; + } + return leveldb::test::RunAllTests(); } diff --git a/db/dbformat.h b/db/dbformat.h index d58366526..89c4afb34 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -19,6 +19,16 @@ namespace leveldb { // parameters set via options. namespace config { static const int kNumLevels = 7; + +// Level-0 compaction is started when we hit this many files. +static const int kL0_CompactionTrigger = 4; + +// Soft limit on number of level-0 files. We slow down writes at this point. +static const int kL0_SlowdownWritesTrigger = 8; + +// Maximum number of level-0 files. We stop writes at this point. +static const int kL0_StopWritesTrigger = 12; + } class InternalKey; diff --git a/db/log_reader.cc b/db/log_reader.cc index 75e1d2850..87210719c 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -4,7 +4,6 @@ #include "db/log_reader.h" -#include #include "leveldb/env.h" #include "util/coding.h" #include "util/crc32c.h" @@ -15,46 +14,104 @@ namespace log { Reader::Reporter::~Reporter() { } -Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum) +Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum, + uint64_t initial_offset) : file_(file), reporter_(reporter), checksum_(checksum), backing_store_(new char[kBlockSize]), buffer_(), - eof_(false) { + eof_(false), + last_record_offset_(0), + end_of_buffer_offset_(0), + initial_offset_(initial_offset) { } Reader::~Reader() { delete[] backing_store_; } +bool Reader::SkipToInitialBlock() { + size_t offset_in_block = initial_offset_ % kBlockSize; + uint64_t block_start_location = initial_offset_ - offset_in_block; + + // Don't search a block if we'd be in the trailer + if (offset_in_block > kBlockSize - 6) { + offset_in_block = 0; + 
block_start_location += kBlockSize; + } + + end_of_buffer_offset_ = block_start_location; + + // Skip to start of first block that can contain the initial record + if (block_start_location > 0) { + Status skip_status = file_->Skip(block_start_location); + if (!skip_status.ok()) { + ReportDrop(block_start_location, skip_status); + return false; + } + } + + return true; +} + bool Reader::ReadRecord(Slice* record, std::string* scratch) { + if (last_record_offset_ < initial_offset_) { + if (!SkipToInitialBlock()) { + return false; + } + } + scratch->clear(); record->clear(); bool in_fragmented_record = false; + // Record offset of the logical record that we're reading + // 0 is a dummy value to make compilers happy + uint64_t prospective_record_offset = 0; Slice fragment; while (true) { + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); switch (ReadPhysicalRecord(&fragment)) { case kFullType: if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. + if (scratch->empty()) { + in_fragmented_record = false; + } else { + ReportCorruption(scratch->size(), "partial record without end(1)"); + } } + prospective_record_offset = physical_record_offset; scratch->clear(); *record = fragment; + last_record_offset_ = prospective_record_offset; return true; case kFirstType: if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. 
+ if (scratch->empty()) { + in_fragmented_record = false; + } else { + ReportCorruption(scratch->size(), "partial record without end(2)"); + } } + prospective_record_offset = physical_record_offset; scratch->assign(fragment.data(), fragment.size()); in_fragmented_record = true; break; case kMiddleType: if (!in_fragmented_record) { - ReportDrop(fragment.size(), "missing start of fragmented record"); + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); } else { scratch->append(fragment.data(), fragment.size()); } @@ -62,31 +119,33 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) { case kLastType: if (!in_fragmented_record) { - ReportDrop(fragment.size(), "missing start of fragmented record"); + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); } else { scratch->append(fragment.data(), fragment.size()); *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; return true; } break; case kEof: if (in_fragmented_record) { - ReportDrop(scratch->size(), "partial record without end"); + ReportCorruption(scratch->size(), "partial record without end(3)"); scratch->clear(); } return false; case kBadRecord: if (in_fragmented_record) { - ReportDrop(scratch->size(), "error in middle of record"); + ReportCorruption(scratch->size(), "error in middle of record"); in_fragmented_record = false; scratch->clear(); } break; default: - ReportDrop( + ReportCorruption( (fragment.size() + (in_fragmented_record ? 
scratch->size() : 0)), "unknown record type"); in_fragmented_record = false; @@ -97,9 +156,18 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) { return false; } -void Reader::ReportDrop(size_t bytes, const char* reason) { - if (reporter_ != NULL) { - reporter_->Corruption(bytes, Status::Corruption(reason)); +uint64_t Reader::LastRecordOffset() { + return last_record_offset_; +} + +void Reader::ReportCorruption(size_t bytes, const char* reason) { + ReportDrop(bytes, Status::Corruption(reason)); +} + +void Reader::ReportDrop(size_t bytes, const Status& reason) { + if (reporter_ != NULL && + end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) { + reporter_->Corruption(bytes, reason); } } @@ -110,11 +178,10 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { // Last read was a full read, so this is a trailer to skip buffer_.clear(); Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + end_of_buffer_offset_ += buffer_.size(); if (!status.ok()) { - if (reporter_ != NULL) { - reporter_->Corruption(kBlockSize, status); - } buffer_.clear(); + ReportDrop(kBlockSize, status); eof_ = true; return kEof; } else if (buffer_.size() < kBlockSize) { @@ -125,8 +192,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { // End of file return kEof; } else { - ReportDrop(buffer_.size(), "truncated record at end of file"); + size_t drop_size = buffer_.size(); buffer_.clear(); + ReportCorruption(drop_size, "truncated record at end of file"); return kEof; } } @@ -138,8 +206,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { const unsigned int type = header[6]; const uint32_t length = a | (b << 8); if (kHeaderSize + length > buffer_.size()) { - ReportDrop(buffer_.size(), "bad record length"); + size_t drop_size = buffer_.size(); buffer_.clear(); + ReportCorruption(drop_size, "bad record length"); return kBadRecord; } @@ -160,13 +229,22 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) { // been corrupted and if 
we trust it, we could find some // fragment of a real log record that just happens to look // like a valid log record. - ReportDrop(buffer_.size(), "checksum mismatch"); + size_t drop_size = buffer_.size(); buffer_.clear(); + ReportCorruption(drop_size, "checksum mismatch"); return kBadRecord; } } buffer_.remove_prefix(kHeaderSize + length); + + // Skip physical record that started before initial_offset_ + if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length < + initial_offset_) { + result->clear(); + return kBadRecord; + } + *result = Slice(header + kHeaderSize, length); return type; } diff --git a/db/log_reader.h b/db/log_reader.h index baf147560..61cc41496 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -5,6 +5,8 @@ #ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ #define STORAGE_LEVELDB_DB_LOG_READER_H_ +#include + #include "db/log_format.h" #include "leveldb/slice.h" #include "leveldb/status.h" @@ -35,7 +37,11 @@ class Reader { // live while this Reader is in use. // // If "checksum" is true, verify checksums if available. - Reader(SequentialFile* file, Reporter* reporter, bool checksum); + // + // The Reader will start reading at the first record located at physical + // position >= initial_offset within the file. + Reader(SequentialFile* file, Reporter* reporter, bool checksum, + uint64_t initial_offset); ~Reader(); @@ -46,6 +52,11 @@ class Reader { // reader or the next mutation to *scratch. bool ReadRecord(Slice* record, std::string* scratch); + // Returns the physical offset of the last record returned by ReadRecord. + // + // Undefined before the first call to ReadRecord. + uint64_t LastRecordOffset(); + private: SequentialFile* const file_; Reporter* const reporter_; @@ -54,15 +65,37 @@ class Reader { Slice buffer_; bool eof_; // Last Read() indicated EOF by returning < kBlockSize + // Offset of the last record returned by ReadRecord. + uint64_t last_record_offset_; + // Offset of the first location past the end of buffer_. 
+ uint64_t end_of_buffer_offset_; + + // Offset at which to start looking for the first record to return + uint64_t const initial_offset_; + // Extend record types with the following special values enum { kEof = kMaxRecordType + 1, + // Returned whenever we find an invalid physical record. + // Currently there are three situations in which this happens: + // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) + // * The record is a 0-length record (No drop is reported) + // * The record is below constructor's initial_offset (No drop is reported) kBadRecord = kMaxRecordType + 2 }; + // Skips all blocks that are completely before "initial_offset_". + // + // Returns true on success. Handles reporting. + bool SkipToInitialBlock(); + // Return type, or one of the preceding special values unsigned int ReadPhysicalRecord(Slice* result); - void ReportDrop(size_t bytes, const char* reason); + + // Reports dropped bytes to the reporter. + // buffer_ must be updated to remove the dropped bytes prior to invocation. 
+ void ReportCorruption(size_t bytes, const char* reason); + void ReportDrop(size_t bytes, const Status& reason); // No copying allowed Reader(const Reader&); diff --git a/db/log_test.cc b/db/log_test.cc index 025a5ff84..040bdff8a 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -60,7 +60,6 @@ class LogTest { virtual Status Read(size_t n, Slice* result, char* scratch) { ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; - ASSERT_EQ(kBlockSize, n); if (force_error_) { force_error_ = false; @@ -76,6 +75,17 @@ class LogTest { contents_.remove_prefix(n); return Status::OK(); } + + virtual Status Skip(size_t n) { + if (n > contents_.size()) { + contents_.clear(); + return Status::NotFound("in-memory file skipepd past end"); + } + + contents_.remove_prefix(n); + + return Status::OK(); + } }; class ReportCollector : public Reader::Reporter { @@ -97,10 +107,15 @@ class LogTest { Writer writer_; Reader reader_; + // Record metadata for testing initial offset functionality + static size_t initial_offset_record_sizes_[]; + static uint64_t initial_offset_last_record_offsets_[]; + public: LogTest() : reading_(false), writer_(&dest_), - reader_(&source_, &report_, true/*checksum*/) { + reader_(&source_, &report_, true/*checksum*/, + 0/*initial_offset*/) { } void Write(const std::string& msg) { @@ -153,6 +168,10 @@ class LogTest { return report_.dropped_bytes_; } + std::string ReportMessage() const { + return report_.message_; + } + // Returns OK iff recorded error message contains "msg" std::string MatchError(const std::string& msg) const { if (report_.message_.find(msg) == std::string::npos) { @@ -161,8 +180,61 @@ class LogTest { return "OK"; } } + + void WriteInitialOffsetLog() { + for (int i = 0; i < 4; i++) { + std::string record(initial_offset_record_sizes_[i], + static_cast('a' + i)); + Write(record); + } + } + + void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { + WriteInitialOffsetLog(); + reading_ = true; + source_.contents_ = 
Slice(dest_.contents_); + Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/, + WrittenBytes() + offset_past_end); + Slice record; + std::string scratch; + ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); + delete offset_reader; + } + + void CheckInitialOffsetRecord(uint64_t initial_offset, + int expected_record_offset) { + WriteInitialOffsetLog(); + reading_ = true; + source_.contents_ = Slice(dest_.contents_); + Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/, + initial_offset); + Slice record; + std::string scratch; + ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); + ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], + record.size()); + ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], + offset_reader->LastRecordOffset()); + ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); + delete offset_reader; + } + }; +size_t LogTest::initial_offset_record_sizes_[] = + {10000, // Two sizable records in first block + 10000, + 2 * log::kBlockSize - 1000, // Span three blocks + 1}; + +uint64_t LogTest::initial_offset_last_record_offsets_[] = + {0, + kHeaderSize + 10000, + 2 * (kHeaderSize + 10000), + 2 * (kHeaderSize + 10000) + + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; + + TEST(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } @@ -213,6 +285,19 @@ TEST(LogTest, MarginalTrailer) { ASSERT_EQ("EOF", Read()); } +TEST(LogTest, MarginalTrailer2) { + // Make a trailer that is exactly the same length as an empty record. 
+ const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes()); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + TEST(LogTest, ShortTrailer) { const int n = kBlockSize - 2*kHeaderSize + 4; Write(BigString("foo", n)); @@ -353,6 +438,60 @@ TEST(LogTest, ErrorJoinsRecords) { ASSERT_GE(dropped, 2*kBlockSize); } +TEST(LogTest, ReadStart) { + CheckInitialOffsetRecord(0, 0); +} + +TEST(LogTest, ReadSecondOneOff) { + CheckInitialOffsetRecord(1, 1); +} + +TEST(LogTest, ReadSecondTenThousand) { + CheckInitialOffsetRecord(10000, 1); +} + +TEST(LogTest, ReadSecondStart) { + CheckInitialOffsetRecord(10007, 1); +} + +TEST(LogTest, ReadThirdOneOff) { + CheckInitialOffsetRecord(10008, 2); +} + +TEST(LogTest, ReadThirdStart) { + CheckInitialOffsetRecord(20014, 2); +} + +TEST(LogTest, ReadFourthOneOff) { + CheckInitialOffsetRecord(20015, 3); +} + +TEST(LogTest, ReadFourthFirstBlockTrailer) { + CheckInitialOffsetRecord(log::kBlockSize - 4, 3); +} + +TEST(LogTest, ReadFourthMiddleBlock) { + CheckInitialOffsetRecord(log::kBlockSize + 1, 3); +} + +TEST(LogTest, ReadFourthLastBlock) { + CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3); +} + +TEST(LogTest, ReadFourthStart) { + CheckInitialOffsetRecord( + 2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize, + 3); +} + +TEST(LogTest, ReadEnd) { + CheckOffsetPastEndReturnsNoRecords(0); +} + +TEST(LogTest, ReadPastEnd) { + CheckOffsetPastEndReturnsNoRecords(5); +} + } } diff --git a/db/log_writer.cc b/db/log_writer.cc index 169685105..0887f6c8b 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -32,6 +32,7 @@ Status Writer::AddRecord(const Slice& slice) { // is empty, we still want to iterate once to emit a single // zero-length record Status s; + bool begin = true; do { const int leftover = kBlockSize - 
block_offset_; assert(leftover >= 0); @@ -52,7 +53,6 @@ Status Writer::AddRecord(const Slice& slice) { const size_t fragment_length = (left < avail) ? left : avail; RecordType type; - const bool begin = (ptr == slice.data()); const bool end = (left == fragment_length); if (begin && end) { type = kFullType; @@ -67,6 +67,7 @@ Status Writer::AddRecord(const Slice& slice) { s = EmitPhysicalRecord(type, ptr, fragment_length); ptr += fragment_length; left -= fragment_length; + begin = false; } while (s.ok() && left > 0); return s; } diff --git a/db/memtable.cc b/db/memtable.cc index a3b618a8a..9c25f6d9e 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -20,10 +20,12 @@ static Slice GetLengthPrefixedSlice(const char* data) { MemTable::MemTable(const InternalKeyComparator& cmp) : comparator_(cmp), + refs_(0), table_(comparator_, &arena_) { } MemTable::~MemTable() { + assert(refs_ == 0); } size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); } @@ -48,10 +50,15 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: - explicit MemTableIterator(MemTable::Table* table) { + explicit MemTableIterator(MemTable* mem, MemTable::Table* table) { + mem_ = mem; iter_ = new MemTable::Table::Iterator(table); + mem->Ref(); + } + virtual ~MemTableIterator() { + delete iter_; + mem_->Unref(); } - virtual ~MemTableIterator() { delete iter_; } virtual bool Valid() const { return iter_->Valid(); } virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); } @@ -68,6 +75,7 @@ class MemTableIterator: public Iterator { virtual Status status() const { return Status::OK(); } private: + MemTable* mem_; MemTable::Table::Iterator* iter_; std::string tmp_; // For passing to EncodeKey @@ -77,7 +85,7 @@ class MemTableIterator: public Iterator { }; Iterator* MemTable::NewIterator() { - return new MemTableIterator(&table_); + return new MemTableIterator(this, &table_); } void MemTable::Add(SequenceNumber 
s, ValueType type, diff --git a/db/memtable.h b/db/memtable.h index 45b3342ae..2e9bd618c 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -19,8 +19,21 @@ class MemTableIterator; class MemTable { public: + // MemTables are reference counted. The initial reference count + // is zero and the caller must call Ref() at least once. explicit MemTable(const InternalKeyComparator& comparator); - ~MemTable(); + + // Increase reference count. + void Ref() { ++refs_; } + + // Drop reference count. Delete if no more references exist. + void Unref() { + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + delete this; + } + } // Returns an estimate of the number of bytes of data in use by this // data structure. @@ -45,6 +58,8 @@ class MemTable { const Slice& value); private: + ~MemTable(); // Private since only Unref() should be used to delete it + struct KeyComparator { const InternalKeyComparator comparator; explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } @@ -56,6 +71,7 @@ class MemTable { typedef SkipList Table; KeyComparator comparator_; + int refs_; Arena arena_; Table table_; diff --git a/db/repair.cc b/db/repair.cc index c8e7b9e5f..4b57169a3 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -183,13 +183,15 @@ class Repairer { // corruptions cause entire commits to be skipped instead of // propagating bad information (like overly large sequence // numbers). 
- log::Reader reader(lfile, &reporter, false/*do not checksum*/); + log::Reader reader(lfile, &reporter, false/*do not checksum*/, + 0/*initial_offset*/); // Read all the records and add to a memtable std::string scratch; Slice record; WriteBatch batch; - MemTable mem(icmp_); + MemTable* mem = new MemTable(icmp_); + mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { if (record.size() < 12) { @@ -198,7 +200,7 @@ class Repairer { continue; } WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, &mem); + status = WriteBatchInternal::InsertInto(&batch, mem); if (status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { @@ -215,10 +217,12 @@ class Repairer { VersionEdit skipped; FileMetaData meta; meta.number = next_file_number_++; - Iterator* iter = mem.NewIterator(); + Iterator* iter = mem->NewIterator(); status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta, &skipped); delete iter; + mem->Unref(); + mem = NULL; if (status.ok()) { if (meta.file_size > 0) { table_numbers_.push_back(meta.number); diff --git a/db/snapshot.h b/db/snapshot.h index 9a9075672..a08dbd31b 100644 --- a/db/snapshot.h +++ b/db/snapshot.h @@ -12,17 +12,17 @@ namespace leveldb { class SnapshotList; // Snapshots are kept in a doubly-linked list in the DB. -// Each Snapshot corresponds to a particular sequence number. -class Snapshot { +// Each SnapshotImpl corresponds to a particular sequence number. 
+class SnapshotImpl : public Snapshot { public: SequenceNumber number_; // const after creation private: friend class SnapshotList; - // Snapshot is kept in a doubly-linked circular list - Snapshot* prev_; - Snapshot* next_; + // SnapshotImpl is kept in a doubly-linked circular list + SnapshotImpl* prev_; + SnapshotImpl* next_; SnapshotList* list_; // just for sanity checks }; @@ -35,11 +35,11 @@ class SnapshotList { } bool empty() const { return list_.next_ == &list_; } - Snapshot* oldest() const { assert(!empty()); return list_.next_; } - Snapshot* newest() const { assert(!empty()); return list_.prev_; } + SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; } + SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } - const Snapshot* New(SequenceNumber seq) { - Snapshot* s = new Snapshot; + const SnapshotImpl* New(SequenceNumber seq) { + SnapshotImpl* s = new SnapshotImpl; s->number_ = seq; s->list_ = this; s->next_ = &list_; @@ -49,7 +49,7 @@ class SnapshotList { return s; } - void Delete(const Snapshot* s) { + void Delete(const SnapshotImpl* s) { assert(s->list_ == this); s->prev_->next_ = s->next_; s->next_->prev_ = s->prev_; @@ -58,7 +58,7 @@ class SnapshotList { private: // Dummy head of doubly-linked list of snapshots - Snapshot list_; + SnapshotImpl list_; }; } diff --git a/db/version_set.cc b/db/version_set.cc index c439f49ef..f64ac8d1a 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -57,17 +57,22 @@ std::string IntSetToString(const std::set& s) { Version::~Version() { assert(refs_ == 0); + + // Remove from linked list + prev_->next_ = next_; + next_->prev_ = prev_; + + // Drop references to files for (int level = 0; level < config::kNumLevels; level++) { for (size_t i = 0; i < files_[level].size(); i++) { FileMetaData* f = files_[level][i]; - assert(f->refs >= 0); + assert(f->refs > 0); f->refs--; if (f->refs <= 0) { delete f; } } } - delete cleanup_mem_; } // An internal iterator. 
For a given version/level pair, yields @@ -77,9 +82,9 @@ Version::~Version() { // encoded using EncodeFixed64. class Version::LevelFileNumIterator : public Iterator { public: - LevelFileNumIterator(const Version* version, + LevelFileNumIterator(const InternalKeyComparator& icmp, const std::vector* flist) - : icmp_(version->vset_->icmp_.user_comparator()), + : icmp_(icmp), flist_(flist), index_(flist->size()) { // Marks as invalid } @@ -157,7 +162,7 @@ static Iterator* GetFileIterator(void* arg, Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, int level) const { return NewTwoLevelIterator( - new LevelFileNumIterator(this, &files_[level]), + new LevelFileNumIterator(vset_->icmp_, &files_[level]), &GetFileIterator, vset_->table_cache_, options); } @@ -185,11 +190,11 @@ void Version::Ref() { } void Version::Unref() { + assert(this != &vset_->dummy_versions_); assert(refs_ >= 1); --refs_; if (refs_ == 0) { - vset_->MaybeDeleteOldVersions(); - // TODO: try to delete obsolete files + delete this; } } @@ -222,37 +227,58 @@ std::string Version::DebugString() const { // Versions that contain full copies of the intermediate state. 
class VersionSet::Builder { private: - typedef std::map<uint64_t, FileMetaData*> FileMap; + // Helper to sort by v->files_[file_number].smallest + struct BySmallestKey { + const InternalKeyComparator* internal_comparator; + + bool operator()(FileMetaData* f1, FileMetaData* f2) const { + int r = internal_comparator->Compare(f1->smallest, f2->smallest); + if (r != 0) { + return (r < 0); + } else { + // Break ties by file number + return (f1->number < f2->number); + } + } + }; + + typedef std::set<FileMetaData*, BySmallestKey> FileSet; + struct LevelState { + std::set<uint64_t> deleted_files; + FileSet* added_files; + }; + VersionSet* vset_; - FileMap files_[config::kNumLevels]; + Version* base_; + LevelState levels_[config::kNumLevels]; public: // Initialize a builder with the files from *base and other info from *vset Builder(VersionSet* vset, Version* base) - : vset_(vset) { + : vset_(vset), + base_(base) { + base_->Ref(); + BySmallestKey cmp; + cmp.internal_comparator = &vset_->icmp_; for (int level = 0; level < config::kNumLevels; level++) { - const std::vector<FileMetaData*>& files = base->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - FileMetaData* f = files[i]; - f->refs++; - files_[level].insert(std::make_pair(f->number, f)); - } + levels_[level].added_files = new FileSet(cmp); } } ~Builder() { for (int level = 0; level < config::kNumLevels; level++) { - const FileMap& fmap = files_[level]; - for (FileMap::const_iterator iter = fmap.begin(); - iter != fmap.end(); - ++iter) { - FileMetaData* f = iter->second; + std::vector<FileMetaData*> to_unref(levels_[level].added_files->begin(), + levels_[level].added_files->end()); + delete levels_[level].added_files; + for (int i = 0; i < to_unref.size(); i++) { + FileMetaData* f = to_unref[i]; f->refs--; if (f->refs <= 0) { delete f; } } } + base_->Unref(); } // Apply all of the edits in *edit to the current state. 
@@ -271,16 +297,7 @@ class VersionSet::Builder { ++iter) { const int level = iter->first; const uint64_t number = iter->second; - FileMap::iterator fiter = files_[level].find(number); - assert(fiter != files_[level].end()); // Sanity check for debug mode - if (fiter != files_[level].end()) { - FileMetaData* f = fiter->second; - f->refs--; - if (f->refs <= 0) { - delete f; - } - files_[level].erase(fiter); - } + levels_[level].deleted_files.insert(number); } // Add new files @@ -288,22 +305,66 @@ class VersionSet::Builder { const int level = edit->new_files_[i].first; FileMetaData* f = new FileMetaData(edit->new_files_[i].second); f->refs = 1; - assert(files_[level].count(f->number) == 0); - files_[level].insert(std::make_pair(f->number, f)); + levels_[level].deleted_files.erase(f->number); + levels_[level].added_files->insert(f); } } // Save the current state in *v. void SaveTo(Version* v) { + BySmallestKey cmp; + cmp.internal_comparator = &vset_->icmp_; for (int level = 0; level < config::kNumLevels; level++) { - const FileMap& fmap = files_[level]; - for (FileMap::const_iterator iter = fmap.begin(); - iter != fmap.end(); - ++iter) { - FileMetaData* f = iter->second; - f->refs++; - v->files_[level].push_back(f); + // Merge the set of added files with the set of pre-existing files. + // Drop any deleted files. Store the result in *v. 
+ const std::vector<FileMetaData*>& base_files = base_->files_[level]; + std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin(); + std::vector<FileMetaData*>::const_iterator base_end = base_files.end(); + const FileSet* added = levels_[level].added_files; + v->files_[level].reserve(base_files.size() + added->size()); + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); + ++added_iter) { + // Add all smaller files listed in base_ + for (std::vector<FileMetaData*>::const_iterator bpos + = std::upper_bound(base_iter, base_end, *added_iter, cmp); + base_iter != bpos; + ++base_iter) { + MaybeAddFile(v, level, *base_iter); + } + + MaybeAddFile(v, level, *added_iter); + } + + // Add remaining base files + for (; base_iter != base_end; ++base_iter) { + MaybeAddFile(v, level, *base_iter); } + +#ifndef NDEBUG + // Make sure there is no overlap in levels > 0 + if (level > 0) { + for (int i = 1; i < v->files_[level].size(); i++) { + const InternalKey& prev_end = v->files_[level][i-1]->largest; + const InternalKey& this_begin = v->files_[level][i]->smallest; + if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) { + fprintf(stderr, "overlapping ranges in same level %s vs. 
%s\n", + EscapeString(prev_end.Encode()).c_str(), + EscapeString(this_begin.Encode()).c_str()); + abort(); + } + } + } +#endif + } + } + + void MaybeAddFile(Version* v, int level, FileMetaData* f) { + if (levels_[level].deleted_files.count(f->number) > 0) { + // File is deleted: do nothing + } else { + f->refs++; + v->files_[level].push_back(f); } } }; @@ -324,22 +385,36 @@ VersionSet::VersionSet(const std::string& dbname, prev_log_number_(0), descriptor_file_(NULL), descriptor_log_(NULL), - current_(new Version(this)), - oldest_(current_) { + dummy_versions_(this), + current_(NULL) { + AppendVersion(new Version(this)); } VersionSet::~VersionSet() { - for (Version* v = oldest_; v != NULL; ) { - Version* next = v->next_; - assert(v->refs_ == 0); - delete v; - v = next; - } + current_->Unref(); + assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty delete descriptor_log_; delete descriptor_file_; } -Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { +void VersionSet::AppendVersion(Version* v) { + // Make "v" current + assert(v->refs_ == 0); + assert(v != current_); + if (current_ != NULL) { + current_->Unref(); + } + current_ = v; + v->Ref(); + + // Append to linked list + v->prev_ = dummy_versions_.prev_; + v->next_ = &dummy_versions_; + v->prev_->next_ = v; + v->next_->prev_ = v; +} + +Status VersionSet::LogAndApply(VersionEdit* edit) { if (edit->has_log_number_) { assert(edit->log_number_ >= log_number_); assert(edit->log_number_ < next_file_number_); @@ -360,22 +435,20 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { builder.Apply(edit); builder.SaveTo(v); } - - std::string new_manifest_file; - Status s = Finalize(v); + Finalize(v); // Initialize new descriptor log file if necessary by creating // a temporary file that contains a snapshot of the current version. 
- if (s.ok()) { - if (descriptor_log_ == NULL) { - assert(descriptor_file_ == NULL); - new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); - edit->SetNextFile(next_file_number_); - s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); - if (s.ok()) { - descriptor_log_ = new log::Writer(descriptor_file_); - s = WriteSnapshot(descriptor_log_); - } + std::string new_manifest_file; + Status s; + if (descriptor_log_ == NULL) { + assert(descriptor_file_ == NULL); + new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); + edit->SetNextFile(next_file_number_); + s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); + if (s.ok()) { + descriptor_log_ = new log::Writer(descriptor_file_); + s = WriteSnapshot(descriptor_log_); } } @@ -397,12 +470,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) { // Install the new version if (s.ok()) { - assert(current_->next_ == NULL); - assert(current_->cleanup_mem_ == NULL); - current_->cleanup_mem_ = cleanup_mem; - v->next_ = NULL; - current_->next_ = v; - current_ = v; + AppendVersion(v); log_number_ = edit->log_number_; prev_log_number_ = edit->prev_log_number_; } else { @@ -458,7 +526,7 @@ Status VersionSet::Recover() { { LogReporter reporter; reporter.status = &s; - log::Reader reader(file, &reporter, true/*checksum*/); + log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -518,20 +586,14 @@ Status VersionSet::Recover() { if (s.ok()) { Version* v = new Version(this); builder.SaveTo(v); - s = Finalize(v); - if (!s.ok()) { - delete v; - } else { - // Install recovered version - v->next_ = NULL; - current_->next_ = v; - current_ = v; - manifest_file_number_ = next_file; - next_file_number_ = next_file + 1; - last_sequence_ = last_sequence; - log_number_ = log_number; - prev_log_number_ = prev_log_number; - } + // Install recovered 
version + Finalize(v); + AppendVersion(v); + manifest_file_number_ = next_file; + next_file_number_ = next_file + 1; + last_sequence_ = last_sequence; + log_number_ = log_number; + prev_log_number_ = prev_log_number; } return s; @@ -545,15 +607,12 @@ static int64_t TotalFileSize(const std::vector& files) { return sum; } -Status VersionSet::Finalize(Version* v) { +void VersionSet::Finalize(Version* v) { // Precomputed best level for next compaction int best_level = -1; double best_score = -1; - Status s; - for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) { - s = SortLevel(v, level); - + for (int level = 0; level < config::kNumLevels-1; level++) { double score; if (level == 0) { // We treat level-0 specially by bounding the number of files @@ -567,7 +626,8 @@ Status VersionSet::Finalize(Version* v) { // file size is small (perhaps because of a small write-buffer // setting, or very high compression ratios, or lots of // overwrites/deletions). - score = v->files_[level].size() / 4.0; + score = v->files_[level].size() / + static_cast(config::kL0_CompactionTrigger); } else { // Compute the ratio of current size to size limit. 
const uint64_t level_bytes = TotalFileSize(v->files_[level]); @@ -582,7 +642,6 @@ Status VersionSet::Finalize(Version* v) { v->compaction_level_ = best_level; v->compaction_score_ = best_score; - return s; } Status VersionSet::WriteSnapshot(log::Writer* log) { @@ -615,44 +674,27 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { return log->AddRecord(record); } -// Helper to sort by tables_[file_number].smallest -struct VersionSet::BySmallestKey { - const InternalKeyComparator* internal_comparator; - - bool operator()(FileMetaData* f1, FileMetaData* f2) const { - return internal_comparator->Compare(f1->smallest, f2->smallest) < 0; - } -}; - -Status VersionSet::SortLevel(Version* v, uint64_t level) { - Status result; - BySmallestKey cmp; - cmp.internal_comparator = &icmp_; - std::sort(v->files_[level].begin(), v->files_[level].end(), cmp); - - if (result.ok() && level > 0) { - // There should be no overlap - for (size_t i = 1; i < v->files_[level].size(); i++) { - const InternalKey& prev_end = v->files_[level][i-1]->largest; - const InternalKey& this_begin = v->files_[level][i]->smallest; - if (icmp_.Compare(prev_end, this_begin) >= 0) { - result = Status::Corruption( - "overlapping ranges in same level", - (EscapeString(prev_end.Encode()) + " vs. 
" + - EscapeString(this_begin.Encode()))); - break; - } - } - } - return result; -} - int VersionSet::NumLevelFiles(int level) const { assert(level >= 0); assert(level < config::kNumLevels); return current_->files_[level].size(); } +const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { + // Update code if kNumLevels changes + assert(config::kNumLevels == 7); + snprintf(scratch->buffer, sizeof(scratch->buffer), + "files[ %d %d %d %d %d %d %d ]", + int(current_->files_[0].size()), + int(current_->files_[1].size()), + int(current_->files_[2].size()), + int(current_->files_[3].size()), + int(current_->files_[4].size()), + int(current_->files_[5].size()), + int(current_->files_[6].size())); + return scratch->buffer; +} + uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; for (int level = 0; level < config::kNumLevels; level++) { @@ -685,19 +727,10 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { return result; } -void VersionSet::MaybeDeleteOldVersions() { - // Note: it is important to delete versions in order since a newer - // version with zero refs may be holding a pointer to a memtable - // that is used by somebody who has a ref on an older version. 
- while (oldest_ != current_ && oldest_->refs_ == 0) { - Version* next = oldest_->next_; - delete oldest_; - oldest_ = next; - } -} - void VersionSet::AddLiveFiles(std::set* live) { - for (Version* v = oldest_; v != NULL; v = v->next_) { + for (Version* v = dummy_versions_.next_; + v != &dummy_versions_; + v = v->next_) { for (int level = 0; level < config::kNumLevels; level++) { const std::vector& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { @@ -809,8 +842,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } else { // Create concatenating iterator for the files from this level list[num++] = NewTwoLevelIterator( - new Version::LevelFileNumIterator( - c->input_version_, &c->inputs_[which]), + new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]), &GetFileIterator, table_cache_, options); } } @@ -996,11 +1028,12 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) { return true; } -bool Compaction::ShouldStopBefore(const InternalKey& key) { +bool Compaction::ShouldStopBefore(const Slice& internal_key) { // Scan to find earliest grandparent file that contains key. 
const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; while (grandparent_index_ < grandparents_.size() && - icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) { + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { if (seen_key_) { overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; } diff --git a/db/version_set.h b/db/version_set.h index e37751300..2bac5e207 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -59,8 +59,8 @@ class Version { VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list + Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version - MemTable* cleanup_mem_; // NULL, or table to delete when version dropped // List of files per level std::vector files_[config::kNumLevels]; @@ -72,8 +72,7 @@ class Version { int compaction_level_; explicit Version(VersionSet* vset) - : vset_(vset), next_(NULL), refs_(0), - cleanup_mem_(NULL), + : vset_(vset), next_(this), prev_(this), refs_(0), compaction_score_(-1), compaction_level_(-1) { } @@ -95,10 +94,8 @@ class VersionSet { // Apply *edit to the current version to form a new descriptor that // is both saved to persistent state and installed as the new - // current version. Iff Apply() returns OK, arrange to delete - // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed - // by older versions. - Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem); + // current version. + Status LogAndApply(VersionEdit* edit); // Recover the last saved descriptor from persistent storage. Status Recover(); @@ -171,19 +168,20 @@ class VersionSet { // "key" as of version "v". uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. 
+ struct LevelSummaryStorage { + char buffer[100]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + private: class Builder; friend class Compaction; friend class Version; - Status Finalize(Version* v); - - // Delete any old versions that are no longer needed. - void MaybeDeleteOldVersions(); - - struct BySmallestKey; - Status SortLevel(Version* v, uint64_t level); + void Finalize(Version* v); void GetOverlappingInputs( int level, @@ -202,6 +200,8 @@ class VersionSet { void SetupOtherInputs(Compaction* c); + void AppendVersion(Version* v); + Env* const env_; const std::string dbname_; const Options* const options_; @@ -216,10 +216,8 @@ class VersionSet { // Opened lazily WritableFile* descriptor_file_; log::Writer* descriptor_log_; - - // Versions are kept in a singly linked list that is never empty - Version* current_; // Pointer to the last (newest) list entry - Version* oldest_; // Pointer to the first (oldest) list entry + Version dummy_versions_; // Head of circular doubly-linked list of versions. + Version* current_; // == dummy_versions_.prev_ // Per-level key at which the next compaction at that level should start. // Either an empty string, or a valid InternalKey. @@ -265,8 +263,8 @@ class Compaction { bool IsBaseLevelForKey(const Slice& user_key); // Returns true iff we should stop building the current output - // before processing "key". - bool ShouldStopBefore(const InternalKey& key); + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); // Release the input version for the compaction, once the compaction // is successful. 
diff --git a/db/write_batch.cc b/db/write_batch.cc index d5615289a..4e1e899d4 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -29,11 +29,53 @@ WriteBatch::WriteBatch() { WriteBatch::~WriteBatch() { } +WriteBatch::Handler::~Handler() { } + void WriteBatch::Clear() { rep_.clear(); rep_.resize(12); } +Status WriteBatch::Iterate(Handler* handler) const { + Slice input(rep_); + if (input.size() < 12) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + input.remove_prefix(12); + Slice key, value; + int found = 0; + while (!input.empty()) { + found++; + char tag = input[0]; + input.remove_prefix(1); + switch (tag) { + case kTypeValue: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + handler->Put(key, value); + } else { + return Status::Corruption("bad WriteBatch Put"); + } + break; + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input, &key)) { + handler->Delete(key); + } else { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (found != WriteBatchInternal::Count(this)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return Status::OK(); + } +} + int WriteBatchInternal::Count(const WriteBatch* b) { return DecodeFixed32(b->rep_.data() + 8); } @@ -63,28 +105,29 @@ void WriteBatch::Delete(const Slice& key) { PutLengthPrefixedSlice(&rep_, key); } -Status WriteBatchInternal::InsertInto(const WriteBatch* b, - MemTable* memtable) { - const int count = WriteBatchInternal::Count(b); - int found = 0; - Iterator it(*b); - for (; !it.Done(); it.Next()) { - switch (it.op()) { - case kTypeDeletion: - memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice()); - break; - case kTypeValue: - memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value()); - break; - } - found++; +namespace { +class MemTableInserter : public WriteBatch::Handler { + public: + SequenceNumber 
sequence_; + MemTable* mem_; + + virtual void Put(const Slice& key, const Slice& value) { + mem_->Add(sequence_, kTypeValue, key, value); + sequence_++; } - if (!it.status().ok()) { - return it.status(); - } else if (found != count) { - return Status::Corruption("wrong count in WriteBatch"); + virtual void Delete(const Slice& key) { + mem_->Add(sequence_, kTypeDeletion, key, Slice()); + sequence_++; } - return Status::OK(); +}; +} + +Status WriteBatchInternal::InsertInto(const WriteBatch* b, + MemTable* memtable) { + MemTableInserter inserter; + inserter.sequence_ = WriteBatchInternal::Sequence(b); + inserter.mem_ = memtable; + return b->Iterate(&inserter); } void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { @@ -92,57 +135,4 @@ void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { b->rep_.assign(contents.data(), contents.size()); } -WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch) - : input_(WriteBatchInternal::Contents(&batch)), - done_(false) { - if (input_.size() < 12) { - done_ = true; - } else { - seq_ = WriteBatchInternal::Sequence(&batch), - input_.remove_prefix(12); - GetNextEntry(); - } -} - -void WriteBatchInternal::Iterator::Next() { - assert(!done_); - seq_++; - GetNextEntry(); -} - -void WriteBatchInternal::Iterator::GetNextEntry() { - if (input_.empty()) { - done_ = true; - return; - } - char tag = input_[0]; - input_.remove_prefix(1); - switch (tag) { - case kTypeValue: - if (GetLengthPrefixedSlice(&input_, &key_) && - GetLengthPrefixedSlice(&input_, &value_)) { - op_ = static_cast(tag); - } else { - status_ = Status::Corruption("bad WriteBatch Put"); - done_ = true; - input_.clear(); - } - break; - case kTypeDeletion: - if (GetLengthPrefixedSlice(&input_, &key_)) { - op_ = kTypeDeletion; - } else { - status_ = Status::Corruption("bad WriteBatch Delete"); - done_ = true; - input_.clear(); - } - break; - default: - status_ = Status::Corruption("unknown WriteBatch tag"); - done_ = true; - 
input_.clear(); - break; - } -} - } diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index ab0a823a3..d9754448f 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -37,30 +37,6 @@ class WriteBatchInternal { static void SetContents(WriteBatch* batch, const Slice& contents); static Status InsertInto(const WriteBatch* batch, MemTable* memtable); - - // Iterate over the contents of a write batch. - class Iterator { - public: - explicit Iterator(const WriteBatch& batch); - bool Done() const { return done_; } - void Next(); - ValueType op() const { return op_; } - const Slice& key() const { return key_; } - const Slice& value() const { return value_; } - SequenceNumber sequence_number() const { return seq_; } - Status status() const { return status_; } - - private: - void GetNextEntry(); - - Slice input_; - bool done_; - ValueType op_; - Slice key_; - Slice value_; - SequenceNumber seq_; - Status status_; - }; }; } diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 2bf1134a6..73d68fdda 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -14,10 +14,11 @@ namespace leveldb { static std::string PrintContents(WriteBatch* b) { InternalKeyComparator cmp(BytewiseComparator()); - MemTable mem(cmp); + MemTable* mem = new MemTable(cmp); + mem->Ref(); std::string state; - Status s = WriteBatchInternal::InsertInto(b, &mem); - Iterator* iter = mem.NewIterator(); + Status s = WriteBatchInternal::InsertInto(b, mem); + Iterator* iter = mem->NewIterator(); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); @@ -42,6 +43,7 @@ static std::string PrintContents(WriteBatch* b) { if (!s.ok()) { state.append("ParseError()"); } + mem->Unref(); return state; } diff --git a/doc/impl.html b/doc/impl.html index dd09fea17..e870795d2 100644 --- a/doc/impl.html +++ b/doc/impl.html @@ -17,14 +17,14 @@ However the organization of the files that make up 
the representation is somewhat different and is explained below.

-Each database is represented by a set of file stored in a directory. +Each database is represented by a set of files stored in a directory. There are several different types of files as documented below:

Log files

A log file (*.log) stores a sequence of recent updates. Each update is appended to the current log file. When the log file reaches a -pre-determined size (approximately 1MB by default), it is converted +pre-determined size (approximately 4MB by default), it is converted to a sorted table (see below) and a new log file is created for future updates.

@@ -83,19 +83,15 @@ Other files used for miscellaneous purposes may also be present

Level 0

When the log file grows above a certain size (1MB by default):
    -
  • Write the contents of the current memtable to an sstable -
  • Replace the current memtable by a brand new empty memtable -
  • Switch to a new log file +
  • Create a brand new memtable and log file and direct future updates here +
  • In the background: +
      +
    • Write the contents of the previous memtable to an sstable +
    • Discard the memtable
    • Delete the old log file and the old memtable +
    • Add the new sstable to the young (level-0) level. +
-Experimental measurements show that generating an sstable from a 1MB -log file takes ~12ms, which seems like an acceptable latency hiccup to -add infrequently to a log write. - -

-The new sstable is added to a special level-0 level. level-0 contains -a set of files (up to 4 by default). However unlike other levels, -these files do not cover disjoint ranges, but may overlap each other.

Compactions

@@ -162,8 +158,8 @@ read.

Solution 1: To reduce this problem, we might want to increase the log switching threshold when the number of level-0 files is large. Though -the downside is that the larger this threshold, the larger the delay -that we will add to write latency when a write triggers a log switch. +the downside is that the larger this threshold, the more memory we will +need to hold the corresponding memtable.

Solution 2: We might want to decrease write rate artificially when the diff --git a/doc/index.html b/doc/index.html index c2312b7fb..58442e898 100644 --- a/doc/index.html +++ b/doc/index.html @@ -141,10 +141,18 @@ the batch.

Concurrency

-A database may only be opened by one process at a time. The leveldb -implementation acquires a lock from the operating system to prevent -misuse. Within a single process, the same leveldb::DB object may -be safely used by multiple concurrent threads. +A database may only be opened by one process at a time. +The leveldb implementation acquires a lock from the +operating system to prevent misuse. Within a single process, the +same leveldb::DB object may be safely shared by multiple +concurrent threads. I.e., different threads may write into or fetch +iterators or call Get on the same database without any +external synchronization (the leveldb implementation will +automatically do the required synchronization). However other objects +(like Iterator and WriteBatch) may require external synchronization. +If two threads share such an object, they must protect access to it +using their own locking protocol. More details are available in +the public header files.

Iteration

diff --git a/include/leveldb/comparator.h b/include/leveldb/comparator.h index 4e00e4dfb..c215facfc 100644 --- a/include/leveldb/comparator.h +++ b/include/leveldb/comparator.h @@ -12,7 +12,9 @@ namespace leveldb { class Slice; // A Comparator object provides a total order across slices that are -// used as keys in an sstable or a database. +// used as keys in an sstable or a database. A Comparator implementation +// must be thread-safe since leveldb may invoke its methods concurrently +// from multiple threads. class Comparator { public: virtual ~Comparator(); diff --git a/include/leveldb/db.h b/include/leveldb/db.h index f18ded347..79bd283d3 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -13,26 +13,32 @@ namespace leveldb { static const int kMajorVersion = 1; -static const int kMinorVersion = 1; +static const int kMinorVersion = 2; struct Options; struct ReadOptions; struct WriteOptions; - -class Snapshot; class WriteBatch; -// Some internal types. Clients should ignore. -class WriteBatchInternal; +// Abstract handle to particular state of a DB. +// A Snapshot is an immutable object and can therefore be safely +// accessed from multiple threads without any external synchronization. +class Snapshot { + protected: + virtual ~Snapshot(); +}; +// A range of keys struct Range { - Slice start; - Slice limit; + Slice start; // Included in the range + Slice limit; // Not included in the range Range(const Slice& s, const Slice& l) : start(s), limit(l) { } }; // A DB is a persistent ordered map from keys to values. +// A DB is safe for concurrent access from multiple threads without +// any external synchronization. class DB { public: // Open the database with the specified "name". diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 4b6e712a7..39f6a1a22 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -6,6 +6,9 @@ // operating system functionality like the filesystem etc. 
Callers // may wish to provide a custom Env object when opening a database to // get fine gain control; e.g., to rate limit file system operations. +// +// All Env implementations are safe for concurrent access from +// multiple threads without any external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ #define STORAGE_LEVELDB_INCLUDE_ENV_H_ @@ -160,6 +163,15 @@ class SequentialFile { // // REQUIRES: External synchronization virtual Status Read(size_t n, Slice* result, char* scratch) = 0; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower than reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + virtual Status Skip(uint64_t n) = 0; }; // A file abstraction for randomly reading the contents of a file. diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h index 1866fb5b8..6821d853f 100644 --- a/include/leveldb/iterator.h +++ b/include/leveldb/iterator.h @@ -6,6 +6,11 @@ // The following class defines the interface. Multiple implementations // are provided by this library. In particular, iterators are provided // to access the contents of a Table or a DB. +// +// Multiple threads can invoke const methods on an Iterator without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Iterator must use +// external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ #define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ diff --git a/include/leveldb/slice.h b/include/leveldb/slice.h index 62cb89499..3c000b814 100644 --- a/include/leveldb/slice.h +++ b/include/leveldb/slice.h @@ -6,6 +6,11 @@ // storage and a size. The user of a Slice must ensure that the slice // is not used after the corresponding external storage has been // deallocated. 
+// +// Multiple threads can invoke const methods on a Slice without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Slice must use +// external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ #define STORAGE_LEVELDB_INCLUDE_SLICE_H_ diff --git a/include/leveldb/status.h b/include/leveldb/status.h index 47e3edf2b..6796fdd43 100644 --- a/include/leveldb/status.h +++ b/include/leveldb/status.h @@ -4,12 +4,16 @@ // // A Status encapsulates the result of an operation. It may indicate success, // or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ #define STORAGE_LEVELDB_INCLUDE_STATUS_H_ #include -#include #include "leveldb/slice.h" namespace leveldb { @@ -18,7 +22,7 @@ class Status { public: // Create a success status. Status() : state_(NULL) { } - ~Status() { delete state_; } + ~Status() { delete[] state_; } // Copy the specified status. Status(const Status& s); @@ -29,7 +33,7 @@ class Status { // Return error status of an appropriate type. static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { - return Status(kNotFound, msg, Slice()); + return Status(kNotFound, msg, msg2); } static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kCorruption, msg, msg2); @@ -55,6 +59,13 @@ class Status { std::string ToString() const; private: + // OK status has a NULL state_. Otherwise, state_ is a new[] array + // of the following form: + // state_[0..3] == length of message + // state_[4] == code + // state_[5..] 
== message + const char* state_; + enum Code { kOk = 0, kNotFound = 1, @@ -63,21 +74,24 @@ class Status { kInvalidArgument = 4, kIOError = 5, }; - Code code() const { return (state_ == NULL) ? kOk : state_->first; } - Status(Code code, const Slice& msg, const Slice& msg2); + Code code() const { + return (state_ == NULL) ? kOk : static_cast(state_[4]); + } - typedef std::pair State; - State* state_; + Status(Code code, const Slice& msg, const Slice& msg2); + static const char* CopyState(const char* s); }; inline Status::Status(const Status& s) { - state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); } inline void Status::operator=(const Status& s) { - if (this != &s) { - delete state_; - state_ = (s.state_ == NULL) ? NULL : new State(*s.state_); + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (state_ != s.state_) { + delete[] state_; + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); } } diff --git a/include/leveldb/table.h b/include/leveldb/table.h index bd9917600..35e5d22f3 100644 --- a/include/leveldb/table.h +++ b/include/leveldb/table.h @@ -17,7 +17,8 @@ class RandomAccessFile; struct ReadOptions; // A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. +// immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. class Table { public: // Attempt to open the table that is stored in bytes [0..file_size) diff --git a/include/leveldb/table_builder.h b/include/leveldb/table_builder.h index 49d2d514b..23851de43 100644 --- a/include/leveldb/table_builder.h +++ b/include/leveldb/table_builder.h @@ -4,6 +4,11 @@ // // TableBuilder provides the interface used to build a Table // (an immutable and sorted map from keys to values). 
+// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ #define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_ diff --git a/include/leveldb/write_batch.h b/include/leveldb/write_batch.h index 34119525f..b4446c220 100644 --- a/include/leveldb/write_batch.h +++ b/include/leveldb/write_batch.h @@ -12,11 +12,17 @@ // batch.Delete("key"); // batch.Put("key", "v2"); // batch.Put("key", "v3"); +// +// Multiple threads can invoke const methods on a WriteBatch without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same WriteBatch must use +// external synchronization. #ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ #define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ #include +#include "leveldb/status.h" namespace leveldb { @@ -36,6 +42,15 @@ class WriteBatch { // Clear all updates buffered in this batch. void Clear(); + // Support for iterating over the contents of a batch. 
+ class Handler { + public: + virtual ~Handler(); + virtual void Put(const Slice& key, const Slice& value) = 0; + virtual void Delete(const Slice& key) = 0; + }; + Status Iterate(Handler* handler) const; + private: friend class WriteBatchInternal; diff --git a/table/block_builder.cc b/table/block_builder.cc index dc958c80c..d2ffa21be 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -80,7 +80,7 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { if (counter_ < options_->block_restart_interval) { // See how much sharing to do with previous string const size_t min_length = std::min(last_key_piece.size(), key.size()); - while ((shared < min_length) && (last_key_[shared] == key[shared])) { + while ((shared < min_length) && (last_key_piece[shared] == key[shared])) { shared++; } } else { diff --git a/table/table_test.cc b/table/table_test.cc index 4b3e85e22..cf2bae03b 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -319,13 +319,15 @@ class MemTableConstructor: public Constructor { : Constructor(cmp), internal_comparator_(cmp) { memtable_ = new MemTable(internal_comparator_); + memtable_->Ref(); } ~MemTableConstructor() { - delete memtable_; + memtable_->Unref(); } virtual Status FinishImpl(const Options& options, const KVMap& data) { - delete memtable_; + memtable_->Unref(); memtable_ = new MemTable(internal_comparator_); + memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -736,16 +738,17 @@ class MemTableTest { }; TEST(MemTableTest, Simple) { InternalKeyComparator cmp(BytewiseComparator()); - MemTable memtable(cmp); + MemTable* memtable = new MemTable(cmp); + memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); batch.Put(std::string("k1"), std::string("v1")); batch.Put(std::string("k2"), std::string("v2")); batch.Put(std::string("k3"), std::string("v3")); batch.Put(std::string("largekey"), std::string("vlarge")); - 
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok()); + ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable).ok()); - Iterator* iter = memtable.NewIterator(); + Iterator* iter = memtable->NewIterator(); iter->SeekToFirst(); while (iter->Valid()) { fprintf(stderr, "key: '%s' -> '%s'\n", @@ -755,6 +758,7 @@ TEST(MemTableTest, Simple) { } delete iter; + memtable->Unref(); } static bool Between(uint64_t val, uint64_t low, uint64_t high) { diff --git a/util/env_chromium.cc b/util/env_chromium.cc index fd3a4c76d..1af525a5f 100644 --- a/util/env_chromium.cc +++ b/util/env_chromium.cc @@ -141,6 +141,13 @@ class ChromiumSequentialFile: public SequentialFile { } return s; } + + virtual Status Skip(uint64_t n) { + if (fseek(file_, n, SEEK_CUR)) { + return Status::IOError(filename_, strerror(errno)); + } + return Status::OK(); + } }; class ChromiumRandomAccessFile: public RandomAccessFile { diff --git a/util/env_posix.cc b/util/env_posix.cc index 5cddb0c75..fec1599dd 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -52,6 +52,13 @@ class PosixSequentialFile: public SequentialFile { } return s; } + + virtual Status Skip(uint64_t n) { + if (fseek(file_, n, SEEK_CUR)) { + return Status::IOError(filename_, strerror(errno)); + } + return Status::OK(); + } }; class PosixRandomAccessFile: public RandomAccessFile { diff --git a/util/status.cc b/util/status.cc index d9b7195a8..02051a9da 100644 --- a/util/status.cc +++ b/util/status.cc @@ -8,13 +8,29 @@ namespace leveldb { +const char* Status::CopyState(const char* state) { + uint32_t size; + memcpy(&size, state, sizeof(size)); + char* result = new char[size + 5]; + memcpy(result, state, size + 5); + return result; +} + Status::Status(Code code, const Slice& msg, const Slice& msg2) { assert(code != kOk); - state_ = new State(make_pair(code, std::string(msg.data(), msg.size()))); - if (!msg2.empty()) { - state_->second.append(": "); - state_->second.append(msg2.data(), msg2.size()); + const uint32_t len1 = 
msg.size(); + const uint32_t len2 = msg2.size(); + const uint32_t size = len1 + (len2 ? (2 + len2) : 0); + char* result = new char[size + 5]; + memcpy(result, &size, sizeof(size)); + result[4] = static_cast(code); + memcpy(result + 5, msg.data(), len1); + if (len2) { + result[5 + len1] = ':'; + result[6 + len1] = ' '; + memcpy(result + 7 + len1, msg2.data(), len2); } + state_ = result; } std::string Status::ToString() const { @@ -23,12 +39,12 @@ std::string Status::ToString() const { } else { char tmp[30]; const char* type; - switch (state_->first) { + switch (code()) { case kOk: type = "OK"; break; case kNotFound: - type = "NotFound"; + type = "NotFound: "; break; case kCorruption: type = "Corruption: "; @@ -44,14 +60,14 @@ std::string Status::ToString() const { break; default: snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", - static_cast(state_->first)); + static_cast(code())); type = tmp; break; } std::string result(type); - if (!state_->second.empty()) { - result.append(state_->second); - } + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(state_ + 5, length); return result; } }