diff --git a/db/db_bench.cc b/db/db_bench.cc
index b39cc40de..6341ac733 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -71,6 +71,7 @@ DEFINE_string(benchmarks,
               "newiteratorwhilewriting,"
               "seekrandom,"
               "seekrandomwhilewriting,"
+              "seekrandomwhilemerging,"
               "readseq,"
               "readreverse,"
               "compact,"
@@ -130,8 +131,12 @@ DEFINE_string(benchmarks,
               "\treadrandommergerandom -- perform N random read-or-merge "
               "operations. Must be used with merge_operator\n"
               "\tnewiterator -- repeated iterator creation\n"
-              "\tseekrandom -- N random seeks\n"
-              "\tseekrandom -- 1 writer, N threads doing random seeks\n"
+              "\tseekrandom -- N random seeks, call Next seek_nexts times "
+              "per seek\n"
+              "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
+              "overwrite\n"
+              "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
+              "merge\n"
               "\tcrc32c -- repeated crc32c of 4K of data\n"
               "\txxhash -- repeated xxHash of 4K of data\n"
               "\tacquireload -- load N*1000 times\n"
@@ -182,7 +187,8 @@ DEFINE_int32(value_size, 100, "Size of each value");
 
 DEFINE_int32(seek_nexts, 0,
              "How many times to call Next() after Seek() in "
-             "fillseekseq and seekrandom");
+             "fillseekseq, seekrandom, seekrandomwhilewriting and "
+             "seekrandomwhilemerging");
 
 DEFINE_bool(reverse_iterator, false,
             "When true use Prev rather than Next for iterators that do "
@@ -1590,6 +1596,9 @@ class Benchmark {
       } else if (name == Slice("seekrandomwhilewriting")) {
         num_threads++;  // Add extra thread for writing
         method = &Benchmark::SeekRandomWhileWriting;
+      } else if (name == Slice("seekrandomwhilemerging")) {
+        num_threads++;  // Add extra thread for merging
+        method = &Benchmark::SeekRandomWhileMerging;
       } else if (name == Slice("readrandomsmall")) {
         reads_ /= 1000;
         method = &Benchmark::ReadRandom;
@@ -2535,6 +2544,7 @@ class Benchmark {
   void ReadRandom(ThreadState* thread) {
     int64_t read = 0;
     int64_t found = 0;
+    int64_t bytes = 0;
     ReadOptions options(FLAGS_verify_checksum, true);
     std::unique_ptr<const char[]> key_guard;
     Slice key = AllocateKey(&key_guard);
@@ -2558,6 +2568,7 @@ class Benchmark {
       }
       if (s.ok()) {
         found++;
+        bytes += key.size() + value.size();
       } else if (!s.IsNotFound()) {
         fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
         abort();
@@ -2569,6 +2580,7 @@ class Benchmark {
     snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
              found, read);
 
+    thread->stats.AddBytes(bytes);
     thread->stats.AddMessage(msg);
 
     if (FLAGS_perf_level > 0) {
@@ -2640,6 +2652,7 @@ class Benchmark {
   void SeekRandom(ThreadState* thread) {
     int64_t read = 0;
     int64_t found = 0;
+    int64_t bytes = 0;
     ReadOptions options(FLAGS_verify_checksum, true);
     options.tailing = FLAGS_use_tailing_iterator;
 
@@ -2696,6 +2709,7 @@ class Benchmark {
         Slice value = iter_to_use->value();
         memcpy(value_buffer, value.data(),
                std::min(value.size(), sizeof(value_buffer)));
+        bytes += iter_to_use->key().size() + iter_to_use->value().size();
 
         if (!FLAGS_reverse_iterator) {
           iter_to_use->Next();
@@ -2715,6 +2729,7 @@ class Benchmark {
     char msg[100];
     snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
              found, read);
+    thread->stats.AddBytes(bytes);
     thread->stats.AddMessage(msg);
     if (FLAGS_perf_level > 0) {
       thread->stats.AddMessage(perf_context.ToString());
@@ -2729,6 +2744,14 @@ class Benchmark {
     }
   }
 
+  void SeekRandomWhileMerging(ThreadState* thread) {
+    if (thread->tid > 0) {
+      SeekRandom(thread);
+    } else {
+      BGWriter(thread, kMerge);
+    }
+  }
+
   void DoDelete(ThreadState* thread, bool seq) {
     WriteBatch batch;
     Duration duration(seq ? 0 : FLAGS_duration, num_);
@@ -2784,6 +2807,7 @@ class Benchmark {
     double last = FLAGS_env->NowMicros();
     int writes_per_second_by_10 = 0;
     int num_writes = 0;
+    int64_t bytes = 0;
 
     // --writes_per_second rate limit is enforced per 100 milliseconds
     // intervals to avoid a burst of writes at the start of each second.
@@ -2820,6 +2844,7 @@ class Benchmark {
         fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
         exit(1);
       }
+      bytes += key.size() + value_size_;
      thread->stats.FinishedOps(&db_, db_.db, 1);
       ++num_writes;
 
@@ -2836,6 +2861,7 @@ class Benchmark {
         }
       }
     }
+    thread->stats.AddBytes(bytes);
   }
 
   // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
@@ -3054,6 +3080,7 @@ class Benchmark {
     RandomGenerator gen;
     std::string value;
     int64_t found = 0;
+    int64_t bytes = 0;
     Duration duration(FLAGS_duration, readwrites_);
 
     std::unique_ptr<const char[]> key_guard;
@@ -3066,6 +3093,7 @@ class Benchmark {
       auto status = db->Get(options, key, &value);
       if (status.ok()) {
         ++found;
+        bytes += key.size() + value.size();
       } else if (!status.IsNotFound()) {
         fprintf(stderr, "Get returned an error: %s\n",
                 status.ToString().c_str());
@@ -3077,11 +3105,13 @@ class Benchmark {
         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
         exit(1);
       }
+      bytes += key.size() + value_size_;
       thread->stats.FinishedOps(nullptr, db, 1);
     }
     char msg[100];
     snprintf(msg, sizeof(msg),
              "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
+    thread->stats.AddBytes(bytes);
     thread->stats.AddMessage(msg);
   }
 
@@ -3093,6 +3123,7 @@ class Benchmark {
     RandomGenerator gen;
     std::string value;
     int64_t found = 0;
+    int64_t bytes = 0;
 
     std::unique_ptr<const char[]> key_guard;
     Slice key = AllocateKey(&key_guard);
@@ -3105,6 +3136,7 @@ class Benchmark {
       auto status = db->Get(options, key, &value);
       if (status.ok()) {
         ++found;
+        bytes += key.size() + value.size();
       } else if (!status.IsNotFound()) {
         fprintf(stderr, "Get returned an error: %s\n",
                 status.ToString().c_str());
@@ -3128,12 +3160,14 @@ class Benchmark {
         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
         exit(1);
       }
+      bytes += key.size() + value.size();
       thread->stats.FinishedOps(nullptr, db, 1);
     }
 
     char msg[100];
     snprintf(msg, sizeof(msg),
              "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
+    thread->stats.AddBytes(bytes);
     thread->stats.AddMessage(msg);
   }
 
@@ -3149,7 +3183,7 @@ class Benchmark {
   // FLAGS_merge_keys.
   void MergeRandom(ThreadState* thread) {
     RandomGenerator gen;
-
+    int64_t bytes = 0;
     std::unique_ptr<const char[]> key_guard;
     Slice key = AllocateKey(&key_guard);
     // The number of iterations is the larger of read_ or write_
@@ -3164,12 +3198,14 @@ class Benchmark {
        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
         exit(1);
       }
+      bytes += key.size() + value_size_;
       thread->stats.FinishedOps(nullptr, db, 1);
     }
 
     // Print some statistics
     char msg[100];
     snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
+    thread->stats.AddBytes(bytes);
     thread->stats.AddMessage(msg);
   }
 
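The new seekrandomwhilemerging benchmark reuses the pattern of the other *whilewriting/*whilemerging tests: thread 0 becomes the single background writer (here BGWriter with kMerge) while the remaining threads run SeekRandom, so --threads=N yields N seek threads plus one merge thread. A rough invocation sketch (flag values are illustrative, not recommendations):

  ./db_bench --benchmarks=seekrandomwhilemerging --merge_operator=put \
      --threads=16 --seek_nexts=10 --writes_per_second=10000
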
diff --git a/db/internal_stats.cc b/db/internal_stats.cc
index 98d0e0f5c..868c988e9 100644
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@@ -13,6 +13,7 @@
 #endif
 
 #include <inttypes.h>
+#include <algorithm>
 #include <vector>
 
 #include "db/column_family.h"
@@ -408,22 +409,31 @@ void InternalStats::DumpDBStats(std::string* value) {
   //   writes/batches is the average group commit size.
   //
   // The format is the same for interval stats.
-  AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen);
   snprintf(buf, sizeof(buf),
-           "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64
-           " batches, %.1f writes per batch, %.2f GB user ingest, "
-           "stall time: %s\n",
-           write_other + write_self, num_keys_written, write_self,
+           "Cumulative writes: %s writes, %s keys, %s batches, "
+           "%.1f writes per batch, ingest: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_other + write_self).c_str(),
+           NumberToHumanString(num_keys_written).c_str(),
+           NumberToHumanString(write_self).c_str(),
            (write_other + write_self) / static_cast<double>(write_self + 1),
-           user_bytes_written / kGB, human_micros);
+           user_bytes_written / kGB, user_bytes_written / kMB / seconds_up);
   value->append(buf);
   // WAL
   snprintf(buf, sizeof(buf),
-           "Cumulative WAL: %" PRIu64 " writes, %" PRIu64 " syncs, "
-           "%.2f writes per sync, %.2f GB written\n",
-           write_with_wal, wal_synced,
+           "Cumulative WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_with_wal).c_str(),
+           NumberToHumanString(wal_synced).c_str(),
            write_with_wal / static_cast<double>(wal_synced + 1),
-           wal_bytes / kGB);
+           wal_bytes / kGB, wal_bytes / kMB / seconds_up);
+  value->append(buf);
+  // Stall
+  AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf),
+           "Cumulative stall: %s, %.1f percent\n",
+           human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
   value->append(buf);
 
   // Interval
@@ -431,19 +441,18 @@ void InternalStats::DumpDBStats(std::string* value) {
   uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
   uint64_t interval_num_keys_written =
       num_keys_written - db_stats_snapshot_.num_keys_written;
-  AppendHumanMicros(
-      write_stall_micros - db_stats_snapshot_.write_stall_micros,
-      human_micros, kHumanMicrosLen);
   snprintf(buf, sizeof(buf),
-           "Interval writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64
-           " batches, %.1f writes per batch, %.1f MB user ingest, "
-           "stall time: %s\n",
-           interval_write_other + interval_write_self,
-           interval_num_keys_written, interval_write_self,
+           "Interval writes: %s writes, %s keys, %s batches, "
+           "%.1f writes per batch, ingest: %.2f MB, %.2f MB/s\n",
+           NumberToHumanString(
+               interval_write_other + interval_write_self).c_str(),
+           NumberToHumanString(interval_num_keys_written).c_str(),
+           NumberToHumanString(interval_write_self).c_str(),
            static_cast<double>(interval_write_other + interval_write_self) /
                (interval_write_self + 1),
            (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
-           human_micros);
+           (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
+               std::max(interval_seconds_up, 0.001));
   value->append(buf);
 
   uint64_t interval_write_with_wal =
@@ -452,13 +461,26 @@ void InternalStats::DumpDBStats(std::string* value) {
   uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced;
   uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
 
   snprintf(buf, sizeof(buf),
-           "Interval WAL: %" PRIu64 " writes, %" PRIu64 " syncs, "
-           "%.2f writes per sync, %.2f MB written\n",
-           interval_write_with_wal,
-           interval_wal_synced,
+           "Interval WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f MB, %.2f MB/s\n",
+           NumberToHumanString(interval_write_with_wal).c_str(),
+           NumberToHumanString(interval_wal_synced).c_str(),
            interval_write_with_wal /
               static_cast<double>(interval_wal_synced + 1),
-           interval_wal_bytes / kGB);
+           interval_wal_bytes / kMB,
+           interval_wal_bytes / kMB /
+               std::max(interval_seconds_up, 0.001));
   value->append(buf);
+
+  // Stall
+  AppendHumanMicros(
+      write_stall_micros - db_stats_snapshot_.write_stall_micros,
+      human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf),
+           "Interval stall: %s, %.1f percent\n",
+           human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           (write_stall_micros - db_stats_snapshot_.write_stall_micros) /
+               10000.0 / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
 
   db_stats_snapshot_.seconds_up = seconds_up;
@@ -557,7 +579,7 @@ void InternalStats::DumpCFStats(std::string* value) {
   value->append(buf);
 
   snprintf(buf, sizeof(buf),
-           "Flush(GB): accumulative %.3f, interval %.3f\n",
+           "Flush(GB): cumulative %.3f, interval %.3f\n",
            curr_ingest / kGB, interval_ingest / kGB);
   value->append(buf);
 
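The stall-percent arithmetic above folds two conversions into the single 10000.0 divisor: dividing micros by 1000000 yields seconds, and multiplying the resulting ratio by 100 yields a percentage. For example, 3000000 stall micros over seconds_up = 60 gives 3000000 / 10000.0 / 60 = 5.0 percent. The std::max(..., 0.001) guards (supplied by the new <algorithm> include) prevent division by zero when stats are dumped immediately after startup.
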
diff --git a/tools/benchmark.sh b/tools/benchmark.sh
index 07850069c..2055ec898 100755
--- a/tools/benchmark.sh
+++ b/tools/benchmark.sh
@@ -35,21 +35,24 @@ if [ ! -z $DB_BENCH_NO_SYNC ]; then
   syncval="0";
 fi
 
-num_read_threads=${NUM_READ_THREADS:-16}
-# Only for readwhilewriting, readwhilemerging
-writes_per_second=${WRITES_PER_SEC:-$((80 * K))}
-num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10} # (only for rangescanwhilewriting)
-cache_size=$((1 * G))
+num_threads=${NUM_THREADS:-16}
+# Only for *whilewriting, *whilemerging
+writes_per_second=${WRITES_PER_SECOND:-$((10 * K))}
+# Only for tests that do range scans
+num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10}
+cache_size=${CACHE_SIZE:-$((1 * G))}
 duration=${DURATION:-0}
 num_keys=${NUM_KEYS:-$((1 * G))}
 key_size=20
-value_size=800
+value_size=${VALUE_SIZE:-400}
 
 const_params="
   --db=$DB_DIR \
   --wal_dir=$WAL_DIR \
+  --disable_data_sync=0 \
   \
+  --num=$num_keys \
   --num_levels=6 \
   --key_size=$key_size \
   --value_size=$value_size \
@@ -57,10 +60,10 @@ const_params="
   --cache_size=$cache_size \
   --cache_numshardbits=6 \
   --compression_type=zlib \
-  --min_level_to_compress=2 \
+  --min_level_to_compress=3 \
   --compression_ratio=0.5 \
   \
-  --hard_rate_limit=2 \
+  --hard_rate_limit=3 \
   --rate_limit_delay_max_milliseconds=1000000 \
   --write_buffer_size=$((128 * M)) \
   --max_write_buffer_number=3 \
@@ -69,7 +72,8 @@ const_params="
   \
   --verify_checksum=1 \
   --delete_obsolete_files_period_micros=$((60 * M)) \
-  --max_grandparent_overlap_factor=10 \
+  --max_grandparent_overlap_factor=8 \
+  --max_bytes_for_level_multiplier=8 \
   \
   --statistics=1 \
   --stats_per_interval=1 \
@@ -82,158 +86,173 @@ const_params="
 
 l0_config="
   --level0_file_num_compaction_trigger=4 \
-  --level0_slowdown_writes_trigger=8 \
-  --level0_stop_writes_trigger=12"
+  --level0_slowdown_writes_trigger=12 \
+  --level0_stop_writes_trigger=20"
 
 if [ $duration -gt 0 ]; then
   const_params="$const_params --duration=$duration"
 fi
 
-params_r="$const_params $l0_config --max_background_compactions=4 --max_background_flushes=1"
 params_w="$const_params $l0_config --max_background_compactions=16 --max_background_flushes=16"
 params_bulkload="$const_params --max_background_compactions=16 --max_background_flushes=16 \
    --level0_file_num_compaction_trigger=$((10 * M)) \
    --level0_slowdown_writes_trigger=$((10 * M)) \
    --level0_stop_writes_trigger=$((10 * M))"
 
+function summarize_result {
+  test_out=$1
+  test_name=$2
+  bench_name=$3
+
+  uptime=$( grep ^Uptime\(secs $test_out | tail -1 | awk '{ printf "%.0f", $2 }' )
+  stall_time=$( grep "^Cumulative stall" $test_out | tail -1 | awk '{ print $3 }' )
+  stall_pct=$( grep "^Cumulative stall" $test_out| tail -1  | awk '{ print $5 }' )
+  ops_sec=$( grep ^${bench_name} $test_out | awk '{ print $5 }' )
+  mb_sec=$( grep ^${bench_name} $test_out | awk '{ print $7 }' )
+  lo_wgb=$( grep "^ L0" $test_out | tail -1 | awk '{ print $8 }' )
+  sum_wgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ print $8 }' )
+  wamp=$( echo "scale=1; $sum_wgb / $lo_wgb" | bc )
+  wmb_ps=$( echo "scale=1; ( $sum_wgb * 1024.0 ) / $uptime" | bc )
+  usecs_op=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $3 }' )
+  p50=$( grep "^Percentiles:" $test_out | awk '{ printf "%.1f", $3 }' )
+  p75=$( grep "^Percentiles:" $test_out | awk '{ printf "%.1f", $5 }' )
+  p99=$( grep "^Percentiles:" $test_out | awk '{ printf "%.1f", $7 }' )
+  echo -e "$ops_sec\t$mb_sec\t$lo_wgb\t$sum_wgb\t$wamp\t$wmb_ps\t$usecs_op\t$p50\t$p75\t$p99\t$uptime\t$stall_time\t$stall_pct\t$test_name" \
+    >> $output_dir/report.txt
+}
+
 function run_bulkload {
-  echo "Bulk loading $num_keys random keys into database..."
-  cmd="./db_bench $params_bulkload --benchmarks=fillrandom \
+  echo "Bulk loading $num_keys random keys"
+  cmd="./db_bench --benchmarks=fillrandom \
        --use_existing_db=0 \
-       --num=$num_keys \
        --disable_auto_compactions=1 \
        --sync=0 \
-       --disable_data_sync=0 \
-       --threads=1 2>&1 | tee -a $output_dir/benchmark_bulkload_fillrandom.log"
+       $params_bulkload \
+       --threads=1 \
+       2>&1 | tee -a $output_dir/benchmark_bulkload_fillrandom.log"
   echo $cmd | tee $output_dir/benchmark_bulkload_fillrandom.log
   eval $cmd
+  summarize_result $output_dir/benchmark_bulkload_fillrandom.log bulkload fillrandom
   echo "Compacting..."
-  cmd="./db_bench $params_w --benchmarks=compact \
+  cmd="./db_bench --benchmarks=compact \
        --use_existing_db=1 \
-       --num=$num_keys \
        --disable_auto_compactions=1 \
        --sync=0 \
-       --disable_data_sync=0 \
-       --threads=1 2>&1 | tee -a $output_dir/benchmark_bulkload_compact.log"
+       $params_w \
+       --threads=1 \
+       2>&1 | tee -a $output_dir/benchmark_bulkload_compact.log"
   echo $cmd | tee $output_dir/benchmark_bulkload_compact.log
   eval $cmd
 }
 
 function run_fillseq {
-  echo "Loading $num_keys keys sequentially into database..."
-  cmd="./db_bench $params_w --benchmarks=fillseq \
+  echo "Loading $num_keys keys sequentially"
+  cmd="./db_bench --benchmarks=fillseq \
        --use_existing_db=0 \
-       --num=$num_keys \
        --sync=0 \
-       --disable_data_sync=0 \
-       --threads=1 2>&1 | tee -a $output_dir/benchmark_fillseq.log"
+       $params_w \
+       --threads=1 \
+       2>&1 | tee -a $output_dir/benchmark_fillseq.log"
   echo $cmd | tee $output_dir/benchmark_fillseq.log
   eval $cmd
+  summarize_result $output_dir/benchmark_fillseq.log fillseq fillseq
 }
 
-function run_overwrite {
-  echo "Loading $num_keys keys sequentially into database..."
-  cmd="./db_bench $params_w --benchmarks=overwrite \
+function run_change {
+  operation=$1
+  echo "Do $num_keys random $operation"
+  out_name="benchmark_${operation}.t${num_threads}.s${syncval}.log"
+  cmd="./db_bench --benchmarks=$operation \
        --use_existing_db=1 \
-       --num=$num_keys \
-       --sync=0 \
-       --disable_data_sync=0 \
-       --threads=1 2>&1 | tee -a $output_dir/benchmark_overwrite.log"
-  echo $cmd | tee $output_dir/benchmark_overwrite.log
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --merge_operator=\"put\" \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
   eval $cmd
+  summarize_result $output_dir/${out_name} ${operation}.t${num_threads}.s${syncval} $operation
 }
 
 function run_filluniquerandom {
- cmd="./db_bench $params_w --benchmarks=filluniquerandom \ + echo "Loading $num_keys unique keys randomly" + cmd="./db_bench --benchmarks=filluniquerandom \ --use_existing_db=0 \ - --num=$num_keys \ --sync=0 \ - --disable_data_sync=0 \ - --threads=1 2>&1 | tee -a $output_dir/benchmark_filluniquerandom.log" + $params_w \ + --threads=1 \ + 2>&1 | tee -a $output_dir/benchmark_filluniquerandom.log" echo $cmd | tee $output_dir/benchmark_filluniquerandom.log eval $cmd + summarize_result $output_dir/benchmark_filluniquerandom.log filluniquerandom filluniquerandom } function run_readrandom { - echo "Reading $num_keys random keys from database..." - cmd="./db_bench $params_r --benchmarks=readrandom \ - --use_existing_db=1 \ - --num=$num_keys \ - --threads=$num_read_threads \ - --disable_auto_compactions=1 \ - 2>&1 | tee -a $output_dir/benchmark_readrandom.log" - echo $cmd | tee $output_dir/benchmark_readrandom.log - eval $cmd -} - -function run_readwhilewriting { - echo "Reading $num_keys random keys from database while writing.." - cmd="./db_bench $params_r --benchmarks=readwhilewriting \ + echo "Reading $num_keys random keys" + out_name="benchmark_readrandom.t${num_threads}.log" + cmd="./db_bench --benchmarks=readrandom \ --use_existing_db=1 \ - --num=$num_keys \ - --sync=$syncval \ - --disable_data_sync=0 \ - --threads=$num_read_threads \ - --writes_per_second=$writes_per_second \ - 2>&1 | tee -a $output_dir/benchmark_readwhilewriting.log" - echo $cmd | tee $output_dir/benchmark_readwhilewriting.log + $params_w \ + --threads=$num_threads \ + 2>&1 | tee -a $output_dir/${out_name}" + echo $cmd | tee $output_dir/${out_name} eval $cmd + summarize_result $output_dir/${out_name} readrandom.t${num_threads} readrandom } -function run_readwhilemerging { - echo "Reading $num_keys random keys from database while merging.." - cmd="./db_bench $params_r --benchmarks=readwhilemerging \ +function run_readwhile { + operation=$1 + echo "Reading $num_keys random keys while $operation" + out_name="benchmark_readwhile${operation}.t${num_threads}.log" + cmd="./db_bench --benchmarks=readwhile${operation} \ --use_existing_db=1 \ - --num=$num_keys \ --sync=$syncval \ - --disable_data_sync=0 \ - --threads=$num_read_threads \ + $params_w \ + --threads=$num_threads \ --writes_per_second=$writes_per_second \ --merge_operator=\"put\" \ - 2>&1 | tee -a $output_dir/benchmark_readwhilemerging.log" - echo $cmd | tee $output_dir/benchmark_readwhilemerging.log + 2>&1 | tee -a $output_dir/${out_name}" + echo $cmd | tee $output_dir/${out_name} eval $cmd + summarize_result $output_dir/${out_name} readwhile${operation}.t${num_threads} readwhile${operation} } -function run_rangescanwhilewriting { - echo "Range scan $num_keys random keys from database while writing.." 
- cmd="./db_bench $params_r --benchmarks=seekrandomwhilewriting \ +function run_rangewhile { + operation=$1 + full_name=$2 + reverse_arg=$3 + out_name="benchmark_${full_name}.t${num_threads}.log" + echo "Range scan $num_keys random keys while ${operation} for reverse_iter=${reverse_arg}" + cmd="./db_bench --benchmarks=seekrandomwhile${operation} \ --use_existing_db=1 \ - --num=$num_keys \ --sync=$syncval \ - --disable_data_sync=0 \ - --threads=$num_read_threads \ + $params_w \ + --threads=$num_threads \ --writes_per_second=$writes_per_second \ + --merge_operator=\"put\" \ --seek_nexts=$num_nexts_per_seek \ - 2>&1 | tee -a $output_dir/benchmark_rangescanwhilewriting.log" - echo $cmd | tee $output_dir/benchmark_rangescanwhilewriting.log + --reverse_iterator=$reverse_arg \ + 2>&1 | tee -a $output_dir/${out_name}" + echo $cmd | tee $output_dir/${out_name} eval $cmd + summarize_result $output_dir/${out_name} ${full_name}.t${num_threads} seekrandomwhile${operation} } -function run_updaterandom { - echo "Read/Modify/Write $num_keys random keys (not using merge).." - cmd="./db_bench $params_w --benchmarks=updaterandom \ +function run_range { + full_name=$1 + reverse_arg=$2 + out_name="benchmark_${full_name}.t${num_threads}.log" + echo "Range scan $num_keys random keys for reverse_iter=${reverse_arg}" + cmd="./db_bench --benchmarks=seekrandom \ --use_existing_db=1 \ - --num=$num_keys \ - --sync=$syncval \ - --disable_data_sync=0 \ - --threads=$num_read_threads 2>&1 | tee -a $output_dir/benchmark_updaterandom.log" - echo $cmd | tee $output_dir/benchmark_updaterandom.log - eval $cmd -} - -function run_mergerandom { - echo "Read/Modify/Write $num_keys random keys (using merge operator).." - cmd="./db_bench $params_w --benchmarks=mergerandom \ - --use_existing_db=1 \ - --num=$num_keys \ - --sync=$syncval \ - --disable_data_sync=0 \ - --merge_operator=\"put\" \ - --threads=$num_read_threads 2>&1 | tee -a $output_dir/benchmark_mergerandom.log" - echo $cmd | tee $output_dir/benchmark_mergerandom.log + $params_w \ + --threads=$num_threads \ + --seek_nexts=$num_nexts_per_seek \ + --reverse_iterator=$reverse_arg \ + 2>&1 | tee -a $output_dir/${out_name}" + echo $cmd | tee $output_dir/${out_name} eval $cmd + summarize_result $output_dir/${out_name} ${full_name}.t${num_threads} seekrandom } function now() { @@ -241,6 +260,7 @@ function now() { } report="$output_dir/report.txt" +schedule="$output_dir/schedule.txt" echo "===== Benchmark =====" @@ -249,7 +269,7 @@ IFS=',' read -a jobs <<< $1 for job in ${jobs[@]}; do if [ $job != debug ]; then - echo "Start $job at `date`" | tee -a $report + echo "Start $job at `date`" | tee -a $schedule fi start=$(now) @@ -258,21 +278,31 @@ for job in ${jobs[@]}; do elif [ $job = fillseq ]; then run_fillseq elif [ $job = overwrite ]; then - run_overwrite + run_change overwrite + elif [ $job = updaterandom ]; then + run_change updaterandom + elif [ $job = mergerandom ]; then + run_change mergerandom elif [ $job = filluniquerandom ]; then run_filluniquerandom elif [ $job = readrandom ]; then run_readrandom + elif [ $job = fwdrange ]; then + run_range $job false + elif [ $job = revrange ]; then + run_range $job true elif [ $job = readwhilewriting ]; then - run_readwhilewriting + run_readwhile writing elif [ $job = readwhilemerging ]; then - run_readwhilemerging - elif [ $job = rangescanwhilewriting ]; then - run_rangescanwhilewriting - elif [ $job = updaterandom ]; then - run_updaterandom - elif [ $job = mergerandom ]; then - run_mergerandom + run_readwhile merging + elif [ $job = 
diff --git a/tools/run_flash_bench.sh b/tools/run_flash_bench.sh
index 2d2fd2ade..224e1f6f0 100755
--- a/tools/run_flash_bench.sh
+++ b/tools/run_flash_bench.sh
@@ -3,50 +3,220 @@
 # After execution of this script, log files are generated in $output_dir.
 # report.txt provides high-level statistics
 
+# This should be run from the parent of the tools directory. The command line is:
+#   [$env_vars] tools/run_flash_bench.sh [list-of-threads]
+#
+# This runs a sequence of tests in the following sequence:
+#   step 1) load - bulkload, compact, fillseq, overwrite
+#   step 2) read-only for each number of threads
+#   step 3) read-write for each number of threads
+#   step 4) merge for each number of threads
+#
+# The list of threads is optional and when not set is equivalent to "24".
+# Were list-of-threads specified as "1 2 4" then the tests in steps 2, 3 and
+# 4 above would be repeated for 1, 2 and 4 threads. The tests in step 1 are
+# only run for 1 thread.
+
+# Test output is written to $OUTPUT_DIR, currently /tmp/output. The performance
+# summary is in $OUTPUT_DIR/report.txt. There is one file in $OUTPUT_DIR per
+# test and the tests are listed below.
+#
+# The environment variables are also optional. The variables are:
+#   NKEYS - number of key/value pairs to load
+#   NWRITESPERSEC - the writes/second rate limit for the *whilewriting* tests.
+#       If this is too large then the non-writer threads can get starved.
+#   NSECONDS - number of seconds for which to run each test in steps 2,
+#       3 and 4. There are currently 15 tests in those steps and they are
+#       repeated for each entry in list-of-threads so this variable lets
+#       you control the total duration to finish the benchmark.
+#   RANGE_LIMIT - the number of rows to read per range query for tests that
+#       do range queries.
+#   VAL_SIZE - the length of the value in the key/value pairs loaded.
+#       You can estimate the size of the test database from this, NKEYS
+#       and the compression rate (--compression_ratio) set in
+#       tools/benchmark.sh
+#   CACHE_BYTES - the size of the RocksDB block cache in bytes
+#   DATA_DIR - directory in which to create database files
+#   LOG_DIR - directory in which to create WAL files, may be the same as
+#       DATA_DIR
+#   DO_SETUP - when set to 0 then a backup of the database is copied from
+#       $DATA_DIR.bak to $DATA_DIR and the load tests from step 1 are
+#       skipped. The WAL directory is also copied from a backup if
+#       DATA_DIR != LOG_DIR. This allows tests from steps 2, 3, 4 to be
+#       repeated faster.
+#   SAVE_SETUP - saves a copy of the database at the end of step 1 to
+#       $DATA_DIR.bak. When LOG_DIR != DATA_DIR then it is copied to
+#       $LOG_DIR.bak.
+
 # Size constants
 K=1024
 M=$((1024 * K))
 G=$((1024 * M))
 
-n=$((1 * G))
-wps=$((80 * K))
-duration=$((12 * 60 * 60))
-num_read_threads=24
+num_keys=${NKEYS:-$((1 * G))}
+wps=${NWRITESPERSEC:-$((10 * K))}
+duration=${NSECONDS:-$((60 * 60))}
+nps=${RANGE_LIMIT:-10}
+vs=${VAL_SIZE:-400}
+cs=${CACHE_BYTES:-$(( 1 * G ))}
+
+# If no command line arguments then run for 24 threads.
+if [[ $# -eq 0 ]]; then
+  nthreads=( 24 )
+else
+  nthreads=( "$@" )
+fi
+
+for num_thr in "${nthreads[@]}" ; do
+  echo Will run for $num_thr threads
+done
 
 # Update these parameters before execution !!!
-db_dir="/tmp/rocksdb/"
-wal_dir="/tmp/rocksdb/"
+db_dir=${DATA_DIR:-"/tmp/rocksdb/"}
+wal_dir=${LOG_DIR:-"/tmp/rocksdb/"}
+
+do_setup=${DO_SETUP:-1}
+save_setup=${SAVE_SETUP:-0}
+
 output_dir="/tmp/output"
 
+ARGS="\
+OUTPUT_DIR=$output_dir \
+NUM_KEYS=$num_keys \
+DB_DIR=$db_dir \
+WAL_DIR=$wal_dir \
+VALUE_SIZE=$vs \
+CACHE_SIZE=$cs"
+
+mkdir -p $output_dir
+echo -e "ops/sec\tmb/sec\tL0_GB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tUptime\tStall-time\tStall%\tTest" \
+    > $output_dir/report.txt
+
+# Notes on test sequence:
+#   step 1) Setup database via sequential fill followed by overwrite to fragment it.
+#       Done without setting DURATION to make sure that overwrite does $num_keys writes
+#   step 2) read-only tests for all levels of concurrency requested
+#   step 3) non read-only tests for all levels of concurrency requested
+#   step 4) merge tests for all levels of concurrency requested. These must come last.
+
+###### Setup the database
+
+if [[ $do_setup != 0 ]]; then
+  echo Doing setup
+
+  # Test 1: bulk load
+  env $ARGS ./tools/benchmark.sh bulkload
+
+  # Test 2: sequential fill
+  env $ARGS ./tools/benchmark.sh fillseq
+
+  # Test 3: single-threaded overwrite
+  env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite
+
+else
+  echo Restoring from backup
+
+  rm -rf $db_dir
+
+  if [ ! -d ${db_dir}.bak ]; then
+    echo Database backup does not exist at ${db_dir}.bak
+    exit -1
+  fi
+
+  echo Restore database from ${db_dir}.bak
+  cp -p -r ${db_dir}.bak $db_dir
+
+  if [[ $db_dir != $wal_dir ]]; then
+    rm -rf $wal_dir
+
+    if [ ! -d ${wal_dir}.bak ]; then
+      echo WAL backup does not exist at ${wal_dir}.bak
+      exit -1
+    fi
+
+    echo Restore WAL from ${wal_dir}.bak
+    cp -p -r ${wal_dir}.bak $wal_dir
+  fi
+fi
+
+if [[ $save_setup != 0 ]]; then
+  echo Save database to ${db_dir}.bak
+  cp -p -r $db_dir ${db_dir}.bak
+
+  if [[ $db_dir != $wal_dir ]]; then
+    echo Save WAL to ${wal_dir}.bak
+    cp -p -r $wal_dir ${wal_dir}.bak
+  fi
+fi
+
+###### Read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 4: random read
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh readrandom
+
+  # Test 5: random range scans
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
+    ./tools/benchmark.sh fwdrange
+
+  # Test 6: random reverse range scans
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
+    ./tools/benchmark.sh revrange
+done
+
+###### Non read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 7: overwrite with sync=0
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+    ./tools/benchmark.sh overwrite
+
+  # Test 8: overwrite with sync=1
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh overwrite
+
+  # Test 9: random update with sync=0
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+    ./tools/benchmark.sh updaterandom
+
+  # Test 10: random update with sync=1
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh updaterandom
+
+  # Test 11: random read while writing
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    ./tools/benchmark.sh readwhilewriting
+
+  # Test 12: range scan while writing
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilewriting
+
+  # Test 13: reverse range scan while writing
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilewriting
+done
 
-# Test 1: bulk load
-OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
-  ./benchmark.sh bulkload
+###### Merge tests
 
-# Test 2: sequential fill
-OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
-  ./benchmark.sh fillseq
+for num_thr in "${nthreads[@]}" ; do
+  # Test 14: random merge with sync=0
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+    ./tools/benchmark.sh mergerandom
 
-# Test 3: overwrite
-OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
-  ./benchmark.sh overwrite
+  # Test 15: random merge with sync=1
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh mergerandom
 
-# Prepare: populate DB with random data
-OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
-  ./benchmark.sh filluniquerandom
+  # Test 16: random read while merging
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    ./tools/benchmark.sh readwhilemerging
 
-# Test 4: random read
-OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
-  DURATION=$duration NUM_READ_THREADS=$num_read_threads \
-  ./benchmark.sh readrandom
+  # Test 17: range scan while merging
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilemerging
 
-# Test 5: random read while writing
-OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
-  DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \
-  ./benchmark.sh readwhilewriting
+  # Test 18: reverse range scan while merging
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilemerging
+done
 
-# Test 6: random seek + next()'s while writing
-OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
-  DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \
-  NUM_NEXTS_PER_SEEK=10 \
-  ./benchmark.sh rangescanwhilewriting
+cat $output_dir/report.txt
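For illustration, a shorter run than the defaults (1G keys, one hour per test, 24 threads) could use the environment variables documented in the header:

  NKEYS=$((100 * 1024 * 1024)) NSECONDS=300 DATA_DIR=/data/rocksdb \
      LOG_DIR=/data/rocksdb tools/run_flash_bench.sh 1 8 16

This repeats the read-only, read-write and merge steps for 1, 8 and 16 threads, runs the load step once single-threaded, and leaves the combined summary in /tmp/output/report.txt.
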
diff --git a/util/logging.cc b/util/logging.cc
index c8cd1217e..8917d099a 100644
--- a/util/logging.cc
+++ b/util/logging.cc
@@ -29,16 +29,17 @@ namespace rocksdb {
 // for micros >= 10 sec, print "XX sec".
 // for micros <= 1 hour, print Y:X M:S".
 // for micros > 1 hour, print Z:Y:X H:M:S".
-int AppendHumanMicros(uint64_t micros, char* output, int len) {
-  if (micros < 10000) {
+int AppendHumanMicros(uint64_t micros, char* output, int len,
+                      bool fixed_format) {
+  if (micros < 10000 && !fixed_format) {
     return snprintf(output, len, "%" PRIu64 " us", micros);
-  } else if (micros < 10000000) {
+  } else if (micros < 10000000 && !fixed_format) {
     return snprintf(output, len, "%.3lf ms",
                     static_cast<double>(micros) / 1000);
-  } else if (micros < 1000000l * 60) {
+  } else if (micros < 1000000l * 60 && !fixed_format) {
     return snprintf(output, len, "%.3lf sec",
                     static_cast<double>(micros) / 1000000);
-  } else if (micros < 1000000ll * 60 * 60) {
+  } else if (micros < 1000000ll * 60 * 60 && !fixed_format) {
     return snprintf(output, len, "%02" PRIu64 ":%05.3f M:S",
                     micros / 1000000 / 60,
                     static_cast<double>(micros % 60000000) / 1000000);
diff --git a/util/logging.h b/util/logging.h
index 63c88debf..10801bb88 100644
--- a/util/logging.h
+++ b/util/logging.h
@@ -21,7 +21,8 @@ namespace rocksdb {
 class Slice;
 
 // Append a human-readable time in micros.
-int AppendHumanMicros(uint64_t micros, char* output, int len);
+int AppendHumanMicros(uint64_t micros, char* output, int len,
+                      bool fixed_format);
 
 // Append a human-readable size in bytes
 int AppendHumanBytes(uint64_t bytes, char* output, int len);
diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc
index 6d0778b8e..4725cbe49 100644
--- a/util/thread_status_impl.cc
+++ b/util/thread_status_impl.cc
@@ -40,7 +40,7 @@ const std::string ThreadStatus::MicrosToString(uint64_t micros) {
   }
   const int kBufferLen = 100;
   char buffer[kBufferLen];
-  AppendHumanMicros(micros, buffer, kBufferLen);
+  AppendHumanMicros(micros, buffer, kBufferLen, false);
   return std::string(buffer);
 }
 
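Passing fixed_format=true makes AppendHumanMicros skip the adaptive us/ms/sec branches, so the stall lines in the DB stats always use the same fixed-width layout and stay easy to grep/awk in summarize_result, while ThreadStatus::MicrosToString passes false to keep its human-friendly output. A minimal sketch of the two modes (buffer size arbitrary; the fixed-width branch itself is outside this hunk):

  char buf[100];
  AppendHumanMicros(1500, buf, sizeof(buf), false);  // adaptive: "1500 us"
  AppendHumanMicros(1500, buf, sizeof(buf), true);   // falls through to the
                                                     // final fixed-width branch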