Make the benchmark scripts configurable and add tests

Summary:
This makes run_flash_bench.sh configurable. Previously it was hardwired to load 1B keys and to
run each test for 12 hours, which kept me from using it. This change makes the key count and
per-test duration configurable, adds more tests, and refactors the test scripts.
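For example, a shorter run might look like this (a sketch; NKEYS and NSECONDS are read by
run_flash_bench.sh, while NUM_KEYS, DURATION and NUM_THREADS are read by benchmark.sh):

  # Full sequence for 8 and 16 threads, 1 hour per test, 100M keys
  NKEYS=$((100 * 1024 * 1024)) NSECONDS=3600 tools/run_flash_bench.sh 8 16

  # One test run directly via benchmark.sh
  NUM_KEYS=$((100 * 1024 * 1024)) DURATION=600 NUM_THREADS=8 \
    DB_DIR=/tmp/rocksdb WAL_DIR=/tmp/rocksdb OUTPUT_DIR=/tmp/output \
    tools/benchmark.sh readrandom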

Adds the seekrandomwhilemerging test to db_bench. It is the same as seekrandomwhilewriting
except the writer thread does Merge rather than Put.
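A direct db_bench invocation for the new test might look like this (a sketch, with
illustrative flag values):

  ./db_bench --benchmarks=seekrandomwhilemerging \
      --merge_operator="put" \
      --use_existing_db=1 --num=$((1024 * 1024)) \
      --threads=4 --duration=600 \
      --seek_nexts=10 --writes_per_second=$((10 * 1024))

Note that db_bench adds one extra thread for the merger, so --threads only counts the seek
threads.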

Forces the stall-time column in compaction IO stats to use a fixed format (H:M:S), which makes
it easier to scrape and parse. Also adds an option to AppendHumanMicros to force a fixed
format, since automation and humans sometimes want different formats.
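benchmark.sh depends on the fixed format when it summarizes results; the summarize_result
function in this diff scrapes the column with awk:

  stall_time=$( grep "^Cumulative stall" $test_out | tail -1 | awk '{ print $3 }' )
  stall_pct=$( grep "^Cumulative stall" $test_out | tail -1 | awk '{ print $5 }' )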

Calls thread->stats.AddBytes(bytes) in db_bench in more tests so that the MB/sec summary
stats appear in the output at test end.
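The per-test summary line then reports a MB/s column, which summarize_result parses as field
7 (an illustrative line, numbers made up):

  readrandom   :       5.123 micros/op 195198 ops/sec;   78.2 MB/s (1000000 of 1000000 found)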

Adds the average ingest rate to compaction IO stats. Output now looks like:
https://gist.github.com/mdcallag/2bd64d18be1b93adc494
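The rate is cumulative user ingest divided by uptime. For example, 100 GB of ingest over an
uptime of 3600 seconds is reported as 28.4 MB/s (102400 MB / 3600 s).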

More information on the benchmark output is at https://gist.github.com/mdcallag/db43a58bd5ac624f01e1

For benchmark.sh this changes the default RocksDB configuration to reduce stalls (the
resulting flags are shown after this list):
* min_level_to_compress from 2 to 3
* hard_rate_limit from 2 to 3
* max_grandparent_overlap_factor and max_bytes_for_level_multiplier from 10 to 8
* L0 file count triggers from 4,8,12 to 4,12,20 for (start,stall,stop)
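The resulting flags in benchmark.sh are:

  --min_level_to_compress=3 \
  --hard_rate_limit=3 \
  --max_grandparent_overlap_factor=8 \
  --max_bytes_for_level_multiplier=8 \
  --level0_file_num_compaction_trigger=4 \
  --level0_slowdown_writes_trigger=12 \
  --level0_stop_writes_trigger=20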

Task ID: #6596829

Blame Rev:

Test Plan:
run tools/run_flash_bench.sh

Revert Plan:

Database Impact:

Memcache Impact:

Other Notes:

Important:

- begin *PUBLIC* platform impact section -
Bugzilla: #
- end platform impact -

Reviewers: igor

Reviewed By: igor

Subscribers: dhruba

Differential Revision: https://reviews.facebook.net/D36075
main
Mark Callaghan
parent 2158e0f831
commit 99ec2412e5
Changed files:
1. db/db_bench.cc (44)
2. db/internal_stats.cc (72)
3. tools/benchmark.sh (266)
4. tools/run_flash_bench.sh (232)
5. util/logging.cc (11)
6. util/logging.h (3)
7. util/thread_status_impl.cc (2)

@@ -71,6 +71,7 @@ DEFINE_string(benchmarks,
"newiteratorwhilewriting,"
"seekrandom,"
"seekrandomwhilewriting,"
"seekrandomwhilemerging,"
"readseq,"
"readreverse,"
"compact,"
@@ -130,8 +131,12 @@ DEFINE_string(benchmarks,
"\treadrandommergerandom -- perform N random read-or-merge "
"operations. Must be used with merge_operator\n"
"\tnewiterator -- repeated iterator creation\n"
"\tseekrandom -- N random seeks\n"
"\tseekrandom -- 1 writer, N threads doing random seeks\n"
"\tseekrandom -- N random seeks, call Next seek_nexts times "
"per seek\n"
"\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
"overwrite\n"
"\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
"merge\n"
"\tcrc32c -- repeated crc32c of 4K of data\n"
"\txxhash -- repeated xxHash of 4K of data\n"
"\tacquireload -- load N*1000 times\n"
@@ -182,7 +187,8 @@ DEFINE_int32(value_size, 100, "Size of each value");
DEFINE_int32(seek_nexts, 0,
"How many times to call Next() after Seek() in "
"fillseekseq and seekrandom");
"fillseekseq, seekrandom, seekrandomwhilewriting and "
"seekrandomwhilemerging");
DEFINE_bool(reverse_iterator, false,
"When true use Prev rather than Next for iterators that do "
@@ -1590,6 +1596,9 @@ class Benchmark {
} else if (name == Slice("seekrandomwhilewriting")) {
num_threads++; // Add extra thread for writing
method = &Benchmark::SeekRandomWhileWriting;
} else if (name == Slice("seekrandomwhilemerging")) {
num_threads++; // Add extra thread for merging
method = &Benchmark::SeekRandomWhileMerging;
} else if (name == Slice("readrandomsmall")) {
reads_ /= 1000;
method = &Benchmark::ReadRandom;
@@ -2535,6 +2544,7 @@ class Benchmark {
void ReadRandom(ThreadState* thread) {
int64_t read = 0;
int64_t found = 0;
int64_t bytes = 0;
ReadOptions options(FLAGS_verify_checksum, true);
std::unique_ptr<const char[]> key_guard;
Slice key = AllocateKey(&key_guard);
@@ -2558,6 +2568,7 @@ class Benchmark {
}
if (s.ok()) {
found++;
bytes += key.size() + value.size();
} else if (!s.IsNotFound()) {
fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
abort();
@@ -2569,6 +2580,7 @@ class Benchmark {
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
found, read);
thread->stats.AddBytes(bytes);
thread->stats.AddMessage(msg);
if (FLAGS_perf_level > 0) {
@@ -2640,6 +2652,7 @@ class Benchmark {
void SeekRandom(ThreadState* thread) {
int64_t read = 0;
int64_t found = 0;
int64_t bytes = 0;
ReadOptions options(FLAGS_verify_checksum, true);
options.tailing = FLAGS_use_tailing_iterator;
@@ -2696,6 +2709,7 @@ class Benchmark {
Slice value = iter_to_use->value();
memcpy(value_buffer, value.data(),
std::min(value.size(), sizeof(value_buffer)));
bytes += iter_to_use->key().size() + iter_to_use->value().size();
if (!FLAGS_reverse_iterator) {
iter_to_use->Next();
@@ -2715,6 +2729,7 @@ class Benchmark {
char msg[100];
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
found, read);
thread->stats.AddBytes(bytes);
thread->stats.AddMessage(msg);
if (FLAGS_perf_level > 0) {
thread->stats.AddMessage(perf_context.ToString());
@@ -2729,6 +2744,14 @@ class Benchmark {
}
}
void SeekRandomWhileMerging(ThreadState* thread) {
if (thread->tid > 0) {
SeekRandom(thread);
} else {
BGWriter(thread, kMerge);
}
}
void DoDelete(ThreadState* thread, bool seq) {
WriteBatch batch;
Duration duration(seq ? 0 : FLAGS_duration, num_);
@@ -2784,6 +2807,7 @@ class Benchmark {
double last = FLAGS_env->NowMicros();
int writes_per_second_by_10 = 0;
int num_writes = 0;
int64_t bytes = 0;
// --writes_per_second rate limit is enforced per 100 milliseconds
// intervals to avoid a burst of writes at the start of each second.
@@ -2820,6 +2844,7 @@ class Benchmark {
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
exit(1);
}
bytes += key.size() + value_size_;
thread->stats.FinishedOps(&db_, db_.db, 1);
++num_writes;
@@ -2836,6 +2861,7 @@ class Benchmark {
}
}
}
thread->stats.AddBytes(bytes);
}
// Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
@@ -3054,6 +3080,7 @@ class Benchmark {
RandomGenerator gen;
std::string value;
int64_t found = 0;
int64_t bytes = 0;
Duration duration(FLAGS_duration, readwrites_);
std::unique_ptr<const char[]> key_guard;
@@ -3066,6 +3093,7 @@ class Benchmark {
auto status = db->Get(options, key, &value);
if (status.ok()) {
++found;
bytes += key.size() + value.size();
} else if (!status.IsNotFound()) {
fprintf(stderr, "Get returned an error: %s\n",
status.ToString().c_str());
@@ -3077,11 +3105,13 @@ class Benchmark {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
bytes += key.size() + value_size_;
thread->stats.FinishedOps(nullptr, db, 1);
}
char msg[100];
snprintf(msg, sizeof(msg),
"( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
thread->stats.AddBytes(bytes);
thread->stats.AddMessage(msg);
}
@@ -3093,6 +3123,7 @@ class Benchmark {
RandomGenerator gen;
std::string value;
int64_t found = 0;
int64_t bytes = 0;
std::unique_ptr<const char[]> key_guard;
Slice key = AllocateKey(&key_guard);
@@ -3105,6 +3136,7 @@ class Benchmark {
auto status = db->Get(options, key, &value);
if (status.ok()) {
++found;
bytes += key.size() + value.size();
} else if (!status.IsNotFound()) {
fprintf(stderr, "Get returned an error: %s\n",
status.ToString().c_str());
@@ -3128,12 +3160,14 @@ class Benchmark {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
bytes += key.size() + value.size();
thread->stats.FinishedOps(nullptr, db, 1);
}
char msg[100];
snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
readwrites_, found);
thread->stats.AddBytes(bytes);
thread->stats.AddMessage(msg);
}
@@ -3149,7 +3183,7 @@ class Benchmark {
// FLAGS_merge_keys.
void MergeRandom(ThreadState* thread) {
RandomGenerator gen;
int64_t bytes = 0;
std::unique_ptr<const char[]> key_guard;
Slice key = AllocateKey(&key_guard);
// The number of iterations is the larger of read_ or write_
@@ -3164,12 +3198,14 @@ class Benchmark {
fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
exit(1);
}
bytes += key.size() + value_size_;
thread->stats.FinishedOps(nullptr, db, 1);
}
// Print some statistics
char msg[100];
snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
thread->stats.AddBytes(bytes);
thread->stats.AddMessage(msg);
}

@@ -13,6 +13,7 @@
#endif
#include <inttypes.h>
#include <algorithm>
#include <vector>
#include "db/column_family.h"
@@ -408,22 +409,31 @@ void InternalStats::DumpDBStats(std::string* value) {
// writes/batches is the average group commit size.
//
// The format is the same for interval stats.
AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen);
snprintf(buf, sizeof(buf),
"Cumulative writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64
" batches, %.1f writes per batch, %.2f GB user ingest, "
"stall time: %s\n",
write_other + write_self, num_keys_written, write_self,
"Cumulative writes: %s writes, %s keys, %s batches, "
"%.1f writes per batch, ingest: %.2f GB, %.2f MB/s\n",
NumberToHumanString(write_other + write_self).c_str(),
NumberToHumanString(num_keys_written).c_str(),
NumberToHumanString(write_self).c_str(),
(write_other + write_self) / static_cast<double>(write_self + 1),
user_bytes_written / kGB, human_micros);
user_bytes_written / kGB, user_bytes_written / kMB / seconds_up);
value->append(buf);
// WAL
snprintf(buf, sizeof(buf),
"Cumulative WAL: %" PRIu64 " writes, %" PRIu64 " syncs, "
"%.2f writes per sync, %.2f GB written\n",
write_with_wal, wal_synced,
"Cumulative WAL: %s writes, %s syncs, "
"%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
NumberToHumanString(write_with_wal).c_str(),
NumberToHumanString(wal_synced).c_str(),
write_with_wal / static_cast<double>(wal_synced + 1),
wal_bytes / kGB);
wal_bytes / kGB, wal_bytes / kMB / seconds_up);
value->append(buf);
// Stall
AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
snprintf(buf, sizeof(buf),
"Cumulative stall: %s, %.1f percent\n",
human_micros,
// 10000 = divide by 1M to get secs, then multiply by 100 for pct
write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
value->append(buf);
// Interval
@@ -431,19 +441,18 @@ void InternalStats::DumpDBStats(std::string* value) {
uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
uint64_t interval_num_keys_written =
num_keys_written - db_stats_snapshot_.num_keys_written;
AppendHumanMicros(
write_stall_micros - db_stats_snapshot_.write_stall_micros,
human_micros, kHumanMicrosLen);
snprintf(buf, sizeof(buf),
"Interval writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64
" batches, %.1f writes per batch, %.1f MB user ingest, "
"stall time: %s\n",
interval_write_other + interval_write_self,
interval_num_keys_written, interval_write_self,
"Interval writes: %s writes, %s keys, %s batches, "
"%.1f writes per batch, ingest: %.2f MB, %.2f MB/s\n",
NumberToHumanString(
interval_write_other + interval_write_self).c_str(),
NumberToHumanString(interval_num_keys_written).c_str(),
NumberToHumanString(interval_write_self).c_str(),
static_cast<double>(interval_write_other + interval_write_self) /
(interval_write_self + 1),
(user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
human_micros);
(user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
std::max(interval_seconds_up, 0.001));
value->append(buf);
uint64_t interval_write_with_wal =
@@ -452,13 +461,26 @@ void InternalStats::DumpDBStats(std::string* value) {
uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
snprintf(buf, sizeof(buf),
"Interval WAL: %" PRIu64 " writes, %" PRIu64 " syncs, "
"%.2f writes per sync, %.2f MB written\n",
interval_write_with_wal,
interval_wal_synced,
"Interval WAL: %s writes, %s syncs, "
"%.2f writes per sync, written: %.2f MB, %.2f MB/s\n",
NumberToHumanString(interval_write_with_wal).c_str(),
NumberToHumanString(interval_wal_synced).c_str(),
interval_write_with_wal /
static_cast<double>(interval_wal_synced + 1),
interval_wal_bytes / kGB);
interval_wal_bytes / kGB,
interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001));
value->append(buf);
// Stall
AppendHumanMicros(
write_stall_micros - db_stats_snapshot_.write_stall_micros,
human_micros, kHumanMicrosLen, true);
snprintf(buf, sizeof(buf),
"Interval stall: %s, %.1f percent\n",
human_micros,
// 10000 = divide by 1M to get secs, then multiply by 100 for pct
(write_stall_micros - db_stats_snapshot_.write_stall_micros) /
10000.0 / std::max(interval_seconds_up, 0.001));
value->append(buf);
db_stats_snapshot_.seconds_up = seconds_up;
@@ -557,7 +579,7 @@ void InternalStats::DumpCFStats(std::string* value) {
value->append(buf);
snprintf(buf, sizeof(buf),
"Flush(GB): accumulative %.3f, interval %.3f\n",
"Flush(GB): cumulative %.3f, interval %.3f\n",
curr_ingest / kGB, interval_ingest / kGB);
value->append(buf);

@@ -35,21 +35,24 @@ if [ ! -z $DB_BENCH_NO_SYNC ]; then
syncval="0";
fi
num_read_threads=${NUM_READ_THREADS:-16}
# Only for readwhilewriting, readwhilemerging
writes_per_second=${WRITES_PER_SEC:-$((80 * K))}
num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10} # (only for rangescanwhilewriting)
cache_size=$((1 * G))
num_threads=${NUM_THREADS:-16}
# Only for *whilewriting, *whilemerging
writes_per_second=${WRITES_PER_SECOND:-$((10 * K))}
# Only for tests that do range scans
num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10}
cache_size=${CACHE_SIZE:-$((1 * G))}
duration=${DURATION:-0}
num_keys=${NUM_KEYS:-$((1 * G))}
key_size=20
value_size=800
value_size=${VALUE_SIZE:-400}
const_params="
--db=$DB_DIR \
--wal_dir=$WAL_DIR \
--disable_data_sync=0 \
\
--num=$num_keys \
--num_levels=6 \
--key_size=$key_size \
--value_size=$value_size \
@@ -57,10 +60,10 @@ const_params="
--cache_size=$cache_size \
--cache_numshardbits=6 \
--compression_type=zlib \
--min_level_to_compress=2 \
--min_level_to_compress=3 \
--compression_ratio=0.5 \
\
--hard_rate_limit=2 \
--hard_rate_limit=3 \
--rate_limit_delay_max_milliseconds=1000000 \
--write_buffer_size=$((128 * M)) \
--max_write_buffer_number=3 \
@@ -69,7 +72,8 @@ const_params="
\
--verify_checksum=1 \
--delete_obsolete_files_period_micros=$((60 * M)) \
--max_grandparent_overlap_factor=10 \
--max_grandparent_overlap_factor=8 \
--max_bytes_for_level_multiplier=8 \
\
--statistics=1 \
--stats_per_interval=1 \
@@ -82,158 +86,173 @@ const_params="
l0_config="
--level0_file_num_compaction_trigger=4 \
--level0_slowdown_writes_trigger=8 \
--level0_stop_writes_trigger=12"
--level0_slowdown_writes_trigger=12 \
--level0_stop_writes_trigger=20"
if [ $duration -gt 0 ]; then
const_params="$const_params --duration=$duration"
fi
params_r="$const_params $l0_config --max_background_compactions=4 --max_background_flushes=1"
params_w="$const_params $l0_config --max_background_compactions=16 --max_background_flushes=16"
params_bulkload="$const_params --max_background_compactions=16 --max_background_flushes=16 \
--level0_file_num_compaction_trigger=$((10 * M)) \
--level0_slowdown_writes_trigger=$((10 * M)) \
--level0_stop_writes_trigger=$((10 * M))"
function summarize_result {
test_out=$1
test_name=$2
bench_name=$3
uptime=$( grep ^Uptime\(secs $test_out | tail -1 | awk '{ printf "%.0f", $2 }' )
stall_time=$( grep "^Cumulative stall" $test_out | tail -1 | awk '{ print $3 }' )
stall_pct=$( grep "^Cumulative stall" $test_out| tail -1 | awk '{ print $5 }' )
ops_sec=$( grep ^${bench_name} $test_out | awk '{ print $5 }' )
mb_sec=$( grep ^${bench_name} $test_out | awk '{ print $7 }' )
lo_wgb=$( grep "^ L0" $test_out | tail -1 | awk '{ print $8 }' )
sum_wgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ print $8 }' )
wamp=$( echo "scale=1; $sum_wgb / $lo_wgb" | bc )
wmb_ps=$( echo "scale=1; ( $sum_wgb * 1024.0 ) / $uptime" | bc )
usecs_op=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $3 }' )
p50=$( grep "^Percentiles:" $test_out | awk '{ printf "%.1f", $3 }' )
p75=$( grep "^Percentiles:" $test_out | awk '{ printf "%.1f", $5 }' )
p99=$( grep "^Percentiles:" $test_out | awk '{ printf "%.1f", $7 }' )
echo -e "$ops_sec\t$mb_sec\t$lo_wgb\t$sum_wgb\t$wamp\t$wmb_ps\t$usecs_op\t$p50\t$p75\t$p99\t$uptime\t$stall_time\t$stall_pct\t$test_name" \
>> $output_dir/report.txt
}
function run_bulkload {
echo "Bulk loading $num_keys random keys into database..."
cmd="./db_bench $params_bulkload --benchmarks=fillrandom \
echo "Bulk loading $num_keys random keys"
cmd="./db_bench --benchmarks=fillrandom \
--use_existing_db=0 \
--num=$num_keys \
--disable_auto_compactions=1 \
--sync=0 \
--disable_data_sync=0 \
--threads=1 2>&1 | tee -a $output_dir/benchmark_bulkload_fillrandom.log"
$params_bulkload \
--threads=1 \
2>&1 | tee -a $output_dir/benchmark_bulkload_fillrandom.log"
echo $cmd | tee $output_dir/benchmark_bulkload_fillrandom.log
eval $cmd
summarize_result $output_dir/benchmark_bulkload_fillrandom.log bulkload fillrandom
echo "Compacting..."
cmd="./db_bench $params_w --benchmarks=compact \
cmd="./db_bench --benchmarks=compact \
--use_existing_db=1 \
--num=$num_keys \
--disable_auto_compactions=1 \
--sync=0 \
--disable_data_sync=0 \
--threads=1 2>&1 | tee -a $output_dir/benchmark_bulkload_compact.log"
$params_w \
--threads=1 \
2>&1 | tee -a $output_dir/benchmark_bulkload_compact.log"
echo $cmd | tee $output_dir/benchmark_bulkload_compact.log
eval $cmd
}
function run_fillseq {
echo "Loading $num_keys keys sequentially into database..."
cmd="./db_bench $params_w --benchmarks=fillseq \
echo "Loading $num_keys keys sequentially"
cmd="./db_bench --benchmarks=fillseq \
--use_existing_db=0 \
--num=$num_keys \
--sync=0 \
--disable_data_sync=0 \
--threads=1 2>&1 | tee -a $output_dir/benchmark_fillseq.log"
$params_w \
--threads=1 \
2>&1 | tee -a $output_dir/benchmark_fillseq.log"
echo $cmd | tee $output_dir/benchmark_fillseq.log
eval $cmd
summarize_result $output_dir/benchmark_fillseq.log fillseq fillseq
}
function run_overwrite {
echo "Loading $num_keys keys sequentially into database..."
cmd="./db_bench $params_w --benchmarks=overwrite \
function run_change {
operation=$1
echo "Do $num_keys random $operation"
out_name="benchmark_${operation}.t${num_threads}.s${syncval}.log"
cmd="./db_bench --benchmarks=$operation \
--use_existing_db=1 \
--num=$num_keys \
--sync=0 \
--disable_data_sync=0 \
--threads=1 2>&1 | tee -a $output_dir/benchmark_overwrite.log"
echo $cmd | tee $output_dir/benchmark_overwrite.log
--sync=$syncval \
$params_w \
--threads=$num_threads \
--merge_operator=\"put\" \
2>&1 | tee -a $output_dir/${out_name}"
echo $cmd | tee $output_dir/${out_name}
eval $cmd
summarize_result $output_dir/${out_name} ${operation}.t${num_threads}.s${syncval} $operation
}
function run_filluniquerandom {
echo "Loading $num_keys unique keys randomly into database..."
cmd="./db_bench $params_w --benchmarks=filluniquerandom \
echo "Loading $num_keys unique keys randomly"
cmd="./db_bench --benchmarks=filluniquerandom \
--use_existing_db=0 \
--num=$num_keys \
--sync=0 \
--disable_data_sync=0 \
--threads=1 2>&1 | tee -a $output_dir/benchmark_filluniquerandom.log"
$params_w \
--threads=1 \
2>&1 | tee -a $output_dir/benchmark_filluniquerandom.log"
echo $cmd | tee $output_dir/benchmark_filluniquerandom.log
eval $cmd
summarize_result $output_dir/benchmark_filluniquerandom.log filluniquerandom filluniquerandom
}
function run_readrandom {
echo "Reading $num_keys random keys from database..."
cmd="./db_bench $params_r --benchmarks=readrandom \
--use_existing_db=1 \
--num=$num_keys \
--threads=$num_read_threads \
--disable_auto_compactions=1 \
2>&1 | tee -a $output_dir/benchmark_readrandom.log"
echo $cmd | tee $output_dir/benchmark_readrandom.log
eval $cmd
}
function run_readwhilewriting {
echo "Reading $num_keys random keys from database while writing.."
cmd="./db_bench $params_r --benchmarks=readwhilewriting \
echo "Reading $num_keys random keys"
out_name="benchmark_readrandom.t${num_threads}.log"
cmd="./db_bench --benchmarks=readrandom \
--use_existing_db=1 \
--num=$num_keys \
--sync=$syncval \
--disable_data_sync=0 \
--threads=$num_read_threads \
--writes_per_second=$writes_per_second \
2>&1 | tee -a $output_dir/benchmark_readwhilewriting.log"
echo $cmd | tee $output_dir/benchmark_readwhilewriting.log
$params_w \
--threads=$num_threads \
2>&1 | tee -a $output_dir/${out_name}"
echo $cmd | tee $output_dir/${out_name}
eval $cmd
summarize_result $output_dir/${out_name} readrandom.t${num_threads} readrandom
}
function run_readwhilemerging {
echo "Reading $num_keys random keys from database while merging.."
cmd="./db_bench $params_r --benchmarks=readwhilemerging \
function run_readwhile {
operation=$1
echo "Reading $num_keys random keys while $operation"
out_name="benchmark_readwhile${operation}.t${num_threads}.log"
cmd="./db_bench --benchmarks=readwhile${operation} \
--use_existing_db=1 \
--num=$num_keys \
--sync=$syncval \
--disable_data_sync=0 \
--threads=$num_read_threads \
$params_w \
--threads=$num_threads \
--writes_per_second=$writes_per_second \
--merge_operator=\"put\" \
2>&1 | tee -a $output_dir/benchmark_readwhilemerging.log"
echo $cmd | tee $output_dir/benchmark_readwhilemerging.log
2>&1 | tee -a $output_dir/${out_name}"
echo $cmd | tee $output_dir/${out_name}
eval $cmd
summarize_result $output_dir/${out_name} readwhile${operation}.t${num_threads} readwhile${operation}
}
function run_rangescanwhilewriting {
echo "Range scan $num_keys random keys from database while writing.."
cmd="./db_bench $params_r --benchmarks=seekrandomwhilewriting \
function run_rangewhile {
operation=$1
full_name=$2
reverse_arg=$3
out_name="benchmark_${full_name}.t${num_threads}.log"
echo "Range scan $num_keys random keys while ${operation} for reverse_iter=${reverse_arg}"
cmd="./db_bench --benchmarks=seekrandomwhile${operation} \
--use_existing_db=1 \
--num=$num_keys \
--sync=$syncval \
--disable_data_sync=0 \
--threads=$num_read_threads \
$params_w \
--threads=$num_threads \
--writes_per_second=$writes_per_second \
--merge_operator=\"put\" \
--seek_nexts=$num_nexts_per_seek \
2>&1 | tee -a $output_dir/benchmark_rangescanwhilewriting.log"
echo $cmd | tee $output_dir/benchmark_rangescanwhilewriting.log
--reverse_iterator=$reverse_arg \
2>&1 | tee -a $output_dir/${out_name}"
echo $cmd | tee $output_dir/${out_name}
eval $cmd
summarize_result $output_dir/${out_name} ${full_name}.t${num_threads} seekrandomwhile${operation}
}
function run_updaterandom {
echo "Read/Modify/Write $num_keys random keys (not using merge).."
cmd="./db_bench $params_w --benchmarks=updaterandom \
function run_range {
full_name=$1
reverse_arg=$2
out_name="benchmark_${full_name}.t${num_threads}.log"
echo "Range scan $num_keys random keys for reverse_iter=${reverse_arg}"
cmd="./db_bench --benchmarks=seekrandom \
--use_existing_db=1 \
--num=$num_keys \
--sync=$syncval \
--disable_data_sync=0 \
--threads=$num_read_threads 2>&1 | tee -a $output_dir/benchmark_updaterandom.log"
echo $cmd | tee $output_dir/benchmark_updaterandom.log
eval $cmd
}
function run_mergerandom {
echo "Read/Modify/Write $num_keys random keys (using merge operator).."
cmd="./db_bench $params_w --benchmarks=mergerandom \
--use_existing_db=1 \
--num=$num_keys \
--sync=$syncval \
--disable_data_sync=0 \
--merge_operator=\"put\" \
--threads=$num_read_threads 2>&1 | tee -a $output_dir/benchmark_mergerandom.log"
echo $cmd | tee $output_dir/benchmark_mergerandom.log
$params_w \
--threads=$num_threads \
--seek_nexts=$num_nexts_per_seek \
--reverse_iterator=$reverse_arg \
2>&1 | tee -a $output_dir/${out_name}"
echo $cmd | tee $output_dir/${out_name}
eval $cmd
summarize_result $output_dir/${out_name} ${full_name}.t${num_threads} seekrandom
}
function now() {
@@ -241,6 +260,7 @@ function now() {
}
report="$output_dir/report.txt"
schedule="$output_dir/schedule.txt"
echo "===== Benchmark ====="
@@ -249,7 +269,7 @@ IFS=',' read -a jobs <<< $1
for job in ${jobs[@]}; do
if [ $job != debug ]; then
echo "Start $job at `date`" | tee -a $report
echo "Start $job at `date`" | tee -a $schedule
fi
start=$(now)
@@ -258,21 +278,31 @@ for job in ${jobs[@]}; do
elif [ $job = fillseq ]; then
run_fillseq
elif [ $job = overwrite ]; then
run_overwrite
run_change overwrite
elif [ $job = updaterandom ]; then
run_change updaterandom
elif [ $job = mergerandom ]; then
run_change mergerandom
elif [ $job = filluniquerandom ]; then
run_filluniquerandom
elif [ $job = readrandom ]; then
run_readrandom
elif [ $job = fwdrange ]; then
run_range $job false
elif [ $job = revrange ]; then
run_range $job true
elif [ $job = readwhilewriting ]; then
run_readwhilewriting
run_readwhile writing
elif [ $job = readwhilemerging ]; then
run_readwhilemerging
elif [ $job = rangescanwhilewriting ]; then
run_rangescanwhilewriting
elif [ $job = updaterandom ]; then
run_updaterandom
elif [ $job = mergerandom ]; then
run_mergerandom
run_readwhile merging
elif [ $job = fwdrangewhilewriting ]; then
run_rangewhile writing $job false
elif [ $job = revrangewhilewriting ]; then
run_rangewhile writing $job true
elif [ $job = fwdrangewhilemerging ]; then
run_rangewhile merging $job false
elif [ $job = revrangewhilemerging ]; then
run_rangewhile merging $job true
elif [ $job = debug ]; then
num_keys=1000; # debug
echo "Setting num_keys to $num_keys"
@@ -283,24 +313,10 @@ for job in ${jobs[@]}; do
end=$(now)
if [ $job != debug ]; then
echo "Complete $job in $((end-start)) seconds" | tee -a $report
echo "Complete $job in $((end-start)) seconds" | tee -a $schedule
fi
if [[ $job == readrandom || $job == readwhilewriting \
|| $job == rangescanwhilewriting || $job == updaterandom \
|| $job == mergerandom || $job == readwhilemerging ]]; then
lat=$(grep "micros\/op" "$output_dir/benchmark_$job.log" \
| grep "ops\/sec" | awk '{print $3}')
qps=$(grep "micros\/op" "$output_dir/benchmark_$job.log" \
| grep "ops\/sec" | awk '{print $5}')
line=$(grep "rocksdb.db.get.micros" "$output_dir/benchmark_$job.log")
p50=$(echo $line | awk '{print $7}')
p99=$(echo $line | awk '{print $13}')
print_percentile=$(echo "$p50 != 0 || $p99 != 0" | bc);
if [ "$print_percentile" == "1" ]; then
echo "Read latency p50 = $p50 us, p99 = $p99 us" | tee -a $report
fi
echo "QPS = $qps ops/sec" | tee -a $report
echo "Avg Latency = $lat micros/op " | tee -a $report
fi
echo -e "ops/sec\tmb/sec\tL0_MB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tUptime\tStall-time\tStall%\tTest"
tail -1 $output_dir/report.txt
done

@@ -3,50 +3,220 @@
# After execution of this script, log files are generated in $output_dir.
# report.txt provides a high level statistics
# This should be run from the parent of the tools directory. The command line is:
# [$env_vars] tools/run_flash_bench.sh [list-of-threads]
#
# This runs a sequence of tests in the following sequence:
# step 1) load - bulkload, compact, fillseq, overwrite
# step 2) read-only for each number of threads
# step 3) read-write for each number of threads
# step 4) merge for each number of threads
#
# The list of threads is optional and when not set is equivalent to "24".
# Were list-of-threads specified as "1 2 4" then the tests in steps 2, 3 and
# 4 above would be repeated for 1, 2 and 4 threads. The tests in step 1 are
# only run for 1 thread.
# Test output is written to $OUTPUT_DIR, currently /tmp/output. The performance
# summary is in $OUTPUT_DIR/report.txt. There is one file in $OUTPUT_DIR per
# test and the tests are listed below.
#
# The environment variables are also optional. The variables are:
# NKEYS - number of key/value pairs to load
# NWRITESPERSEC - the writes/second rate limit for the *whilewriting* tests.
# If this is too large then the non-writer threads can get
# starved.
# NSECONDS - number of seconds for which to run each test in steps 2,
# 3 and 4. There are currently 15 tests in those steps and
# they are repeated for each entry in list-of-threads so
# this variable lets you control the total duration to
# finish the benchmark.
# RANGE_LIMIT - the number of rows to read per range query for tests that
# do range queries.
# VAL_SIZE - the length of the value in the key/value pairs loaded.
# You can estimate the size of the test database from this,
# NKEYS and the compression rate (--compression_ratio) set
# in tools/benchmark.sh
# CACHE_BYTES - the size of the RocksDB block cache in bytes
# DATA_DIR - directory in which to create database files
# LOG_DIR - directory in which to create WAL files, may be the same
# as DATA_DIR
# DO_SETUP - when set to 0 then a backup of the database is copied from
# $DATA_DIR.bak to $DATA_DIR and the load tests from step 1 are not run.
# The WAL directory is also copied from a backup if
# DATA_DIR != LOG_DIR. This allows tests from steps 2, 3, 4
# to be repeated faster.
# SAVE_SETUP - saves a copy of the database at the end of step 1 to
# $DATA_DIR.bak. When LOG_DIR != DATA_DIR then it is copied
# to $LOG_DIR.bak.
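#
# An example invocation (a sketch): run steps 2, 3 and 4 for 16 and 32 threads,
# 30 minutes per test, with 100M keys loaded in step 1:
#   NKEYS=$((100 * 1024 * 1024)) NSECONDS=1800 tools/run_flash_bench.sh 16 32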
# Size constants
K=1024
M=$((1024 * K))
G=$((1024 * M))
n=$((1 * G))
wps=$((80 * K))
duration=$((12 * 60 * 60))
num_read_threads=24
num_keys=${NKEYS:-$((1 * G))}
wps=${NWRITESPERSEC:-$((10 * K))}
duration=${NSECONDS:-$((60 * 60))}
nps=${RANGE_LIMIT:-10}
vs=${VAL_SIZE:-400}
cs=${CACHE_BYTES:-$(( 1 * G ))}
# If no command line arguments then run for 24 threads.
if [[ $# -eq 0 ]]; then
nthreads=( 24 )
else
nthreads=( "$@" )
fi
for num_thr in "${nthreads[@]}" ; do
echo Will run for $num_thr threads
done
# Update these parameters before execution !!!
db_dir="/tmp/rocksdb/"
wal_dir="/tmp/rocksdb/"
db_dir=${DATA_DIR:-"/tmp/rocksdb/"}
wal_dir=${LOG_DIR:-"/tmp/rocksdb/"}
do_setup=${DO_SETUP:-1}
save_setup=${SAVE_SETUP:-0}
output_dir="/tmp/output"
ARGS="\
OUTPUT_DIR=$output_dir \
NUM_KEYS=$num_keys \
DB_DIR=$db_dir \
WAL_DIR=$wal_dir \
VALUE_SIZE=$vs \
CACHE_SIZE=$cs"
mkdir -p $output_dir
echo -e "ops/sec\tmb/sec\tL0_GB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tUptime\tStall-time\tStall%\tTest" \
> $output_dir/report.txt
# Notes on test sequence:
# step 1) Setup database via sequential fill followed by overwrite to fragment it.
# Done without setting DURATION to make sure that overwrite does $num_keys writes
# step 2) read-only tests for all levels of concurrency requested
# step 3) non read-only tests for all levels of concurrency requested
# step 4) merge tests for all levels of concurrency requested. These must come last.
###### Setup the database
if [[ $do_setup != 0 ]]; then
echo Doing setup
# Test 1: bulk load
env $ARGS ./tools/benchmark.sh bulkload
# Test 2: sequential fill
env $ARGS ./tools/benchmark.sh fillseq
# Test 3: single-threaded overwrite
env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite
else
echo Restoring from backup
rm -rf $db_dir
if [ ! -d ${db_dir}.bak ]; then
echo Database backup does not exist at ${db_dir}.bak
exit -1
fi
echo Restore database from ${db_dir}.bak
cp -p -r ${db_dir}.bak $db_dir
if [[ $db_dir != $wal_dir ]]; then
rm -rf $wal_dir
if [ ! -d ${wal_dir}.bak ]; then
echo WAL backup does not exist at ${wal_dir}.bak
exit -1
fi
echo Restore WAL from ${wal_dir}.bak
cp -p -r ${wal_dir}.bak $wal_dir
fi
fi
if [[ $save_setup != 0 ]]; then
echo Save database to ${db_dir}.bak
cp -p -r $db_dir ${db_dir}.bak
if [[ $db_dir != $wal_dir ]]; then
echo Save WAL to ${wal_dir}.bak
cp -p -r $wal_dir ${wal_dir}.bak
fi
fi
###### Read-only tests
for num_thr in "${nthreads[@]}" ; do
# Test 4: random read
env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh readrandom
# Test 5: random range scans
env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
./tools/benchmark.sh fwdrange
# Test 6: random reverse range scans
env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
./tools/benchmark.sh revrange
done
###### Non read-only tests
for num_thr in "${nthreads[@]}" ; do
# Test 7: overwrite with sync=0
env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
./tools/benchmark.sh overwrite
# Test 8: overwrite with sync=1
env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh overwrite
# Test 9: random update with sync=0
env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
./tools/benchmark.sh updaterandom
# Test 10: random update with sync=1
env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh updaterandom
# Test 11: random read while writing
env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
./tools/benchmark.sh readwhilewriting
# Test 12: range scan while writing
env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilewriting
# Test 13: reverse range scan while writing
env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilewriting
done
# Test 1: bulk load
OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
./benchmark.sh bulkload
###### Merge tests
# Test 2: sequential fill
OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
./benchmark.sh fillseq
for num_thr in "${nthreads[@]}" ; do
# Test 14: random merge with sync=0
env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
./tools/benchmark.sh mergerandom
# Test 3: overwrite
OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
./benchmark.sh overwrite
# Test 15: random merge with sync=1
env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh mergerandom
# Prepare: populate DB with random data
OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
./benchmark.sh filluniquerandom
# Test 16: random read while merging
env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
./tools/benchmark.sh readwhilemerging
# Test 4: random read
OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
DURATION=$duration NUM_READ_THREADS=$num_read_threads \
./benchmark.sh readrandom
# Test 17: range scan while merging
env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilemerging
# Test 5: random read while writing
OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \
./benchmark.sh readwhilewriting
# Test 18: reverse range scan while merging
env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilemerging
done
# Test 6: random seek + next()'s while writing
OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \
DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \
NUM_NEXTS_PER_SEEK=10 \
./benchmark.sh rangescanwhilewriting
cat $output_dir/report.txt

@@ -29,16 +29,17 @@ namespace rocksdb {
// for micros >= 10 sec, print "XX sec".
// for micros <= 1 hour, print Y:X M:S".
// for micros > 1 hour, print Z:Y:X H:M:S".
int AppendHumanMicros(uint64_t micros, char* output, int len) {
if (micros < 10000) {
int AppendHumanMicros(uint64_t micros, char* output, int len,
bool fixed_format) {
if (micros < 10000 && !fixed_format) {
return snprintf(output, len, "%" PRIu64 " us", micros);
} else if (micros < 10000000) {
} else if (micros < 10000000 && !fixed_format) {
return snprintf(output, len, "%.3lf ms",
static_cast<double>(micros) / 1000);
} else if (micros < 1000000l * 60) {
} else if (micros < 1000000l * 60 && !fixed_format) {
return snprintf(output, len, "%.3lf sec",
static_cast<double>(micros) / 1000000);
} else if (micros < 1000000ll * 60 * 60) {
} else if (micros < 1000000ll * 60 * 60 && !fixed_format) {
return snprintf(output, len, "%02" PRIu64 ":%05.3f M:S",
micros / 1000000 / 60,
static_cast<double>(micros % 60000000) / 1000000);

@@ -21,7 +21,8 @@ namespace rocksdb {
class Slice;
// Append a human-readable time in micros.
int AppendHumanMicros(uint64_t micros, char* output, int len);
int AppendHumanMicros(uint64_t micros, char* output, int len,
bool fixed_format);
// Append a human-readable size in bytes
int AppendHumanBytes(uint64_t bytes, char* output, int len);

@@ -40,7 +40,7 @@ const std::string ThreadStatus::MicrosToString(uint64_t micros) {
}
const int kBufferLen = 100;
char buffer[kBufferLen];
AppendHumanMicros(micros, buffer, kBufferLen);
AppendHumanMicros(micros, buffer, kBufferLen, false);
return std::string(buffer);
}
