Enhance to support more tuning options, and universal and integrated BlobDB for all tests (#9704)

Summary: … BlobDB for all tests This does two big things: * provides more tuning options * supports universal and integrated BlobDB for all of the benchmarks that are leveled-only It does several smaller things, and I will list a few * sets l0_slowdown_writes_trigger which wasn't set before this diff. * improves readability in report.tsv by using smaller field names in the header * adds more columns to report.tsv report.tsv before this diff: ``` ops_sec mb_sec total_size_gb level0_size_gb sum_gb write_amplification write_mbps usec_op percentile_50 percentile_75 percentile_99 percentile_99.9 percentile_99.99 uptime stall_time stall_percent test_name test_date rocksdb_version job_id 823294 329.8 0.0 21.5 21.5 1.0 183.4 1.2 1.0 1.0 3 6 14 120 00:00:0.000 0.0 fillseq.wal_disabled.v400 2022-03-16T15:46:45.000-07:00 7.0 326520 130.8 0.0 0.0 0.0 0.0 0 12.2 139.8 155.1 170 234 250 60 00:00:0.000 0.0 multireadrandom.t4 2022-03-16T15:48:47.000-07:00 7.0 86313 345.7 0.0 0.0 0.0 0.0 0 46.3 44.8 50.6 75 84 108 60 00:00:0.000 0.0 revrangewhilewriting.t4 2022-03-16T15:50:48.000-07:00 7.0 101294 405.7 0.0 0.1 0.1 1.0 1.6 39.5 40.4 45.9 64 75 103 62 00:00:0.000 0.0 fwdrangewhilewriting.t4 2022-03-16T15:52:50.000-07:00 7.0 258141 103.4 0.0 0.1 1.2 18.2 19.8 15.5 14.3 18.1 28 34 48 62 00:00:0.000 0.0 readwhilewriting.t4 2022-03-16T15:54:51.000-07:00 7.0 334690 134.1 0.0 7.6 18.7 4.2 308.8 12.0 11.8 13.7 21 30 62 62 00:00:0.000 0.0 overwrite.t4.s0 2022-03-16T15:56:53.000-07:00 7.0 ``` report.tsv with this diff: ``` ops_sec mb_sec lsm_sz blob_sz c_wgb w_amp c_mbps c_wsecs c_csecs b_rgb b_wgb usec_op p50 p99 p99.9 p99.99 pmax uptime stall% Nstall u_cpu s_cpu rss test date version job_id 831144 332.9 22GB 0.0GB, 21.7 1.0 185.1 264 262 0 0 1.2 1.0 3 6 14 9198 120 0.0 0 0.4 0.0 0.7 fillseq.wal_disabled.v400 2022-03-16T16:21:23 7.0 325229 130.3 22GB 0.0GB, 0.0 0.0 0 0 0 0 12.3 139.8 170 237 249 572 60 0.0 0 0.4 0.1 1.2 multireadrandom.t4 2022-03-16T16:23:25 7.0 312920 125.3 26GB 0.0GB, 11.1 
2.6 189.3 115 113 0 0 12.8 11.8 21 34 1255 6442 60 0.2 1 0.7 0.1 0.6 overwritesome.t4.s0 2022-03-16T16:25:27 7.0 81698 327.2 25GB 0.0GB, 0.0 0.0 0 0 0 0 48.9 46.2 79 246 369 9445 60 0.0 0 0.4 0.1 1.4 revrangewhilewriting.t4 2022-03-16T16:30:21 7.0 92484 370.4 25GB 0.0GB, 0.1 1.5 1.1 1 0 0 0 43.2 42.3 75 103 110 9512 62 0.0 0 0.4 0.1 1.4 fwdrangewhilewriting.t4 2022-03-16T16:32:24 7.0 241661 96.8 25GB 0.0GB, 0.1 1.5 1.1 1 0 0 0 16.5 17.1 30 34 49 9092 62 0.0 0 0.4 0.1 1.4 readwhilewriting.t4 2022-03-16T16:34:27 7.0 305234 122.3 30GB 0.0GB, 12.1 2.7 201.7 127 124 0 0 13.1 11.8 21 128 1934 6339 62 0.0 0 0.7 0.1 0.7 overwrite.t4.s0 2022-03-16T16:36:30 7.0 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/9704 Test Plan: run it Reviewed By: jay-zhuang Differential Revision: D36864627 Pulled By: mdcallag fbshipit-source-id: d5af1cfc258a16865210163fa6fd1b803ab1a7d3
3 years ago · 5506954b1f
parent 7b2c0140ba
commit 5506954b1f
1 changed files with 435 additions and 69 deletions
--- a/tools/benchmark.sh
+++ b/tools/benchmark.sh
@ -14,7 +14,7 @@ G=$((1024 * M))
 T=$((1024 * G))

 function display_usage() {
+  # Print the usage text: the list of benchmark tests followed by the
+  # environment variables that tune them, grouped by compaction style.
-  echo "useage: benchmark.sh [--help] <test>"
+  echo "usage: benchmark.sh [--help] <test>"
  echo ""
  echo "These are the available benchmark tests:"
  echo -e "\tbulkload"
@ -38,8 +38,8 @@ function display_usage() {
  echo -e "\tuniversal_compaction"
  echo -e "\tdebug"
  echo ""
-  echo "Enviroment Variables:"
-  echo -e "\tJOB_ID\t\tAn identifier for the benchmark job, will appear in the results"
+  echo "Generic environment variables:"
+  echo -e "\tJOB_ID\t\t\t\tAn identifier for the benchmark job, will appear in the results"
  echo -e "\tDB_DIR\t\t\t\tPath to write the database data directory"
  echo -e "\tWAL_DIR\t\t\t\tPath to write the database WAL directory"
  echo -e "\tOUTPUT_DIR\t\t\tPath to write the benchmark results to (default: /tmp)"
@ -48,13 +48,47 @@ function display_usage() {
  echo -e "\tVALUE_SIZE\t\t\tThe size of the values to use in the benchmark (default: 400 bytes)"
  echo -e "\tBLOCK_SIZE\t\t\tThe size of the database blocks in the benchmark (default: 8 KB)"
  echo -e "\tDB_BENCH_NO_SYNC\t\tDisable fsync on the WAL"
+  echo -e "\tNUMACTL\t\t\t\tWhen defined use numactl --interleave=all"
  echo -e "\tNUM_THREADS\t\t\tThe number of threads to use (default: 64)"
-  echo -e "\tMB_WRITE_PER_SEC"
+  echo -e "\tMB_WRITE_PER_SEC\t\tRate limit for background writer"
  echo -e "\tNUM_NEXTS_PER_SEEK\t\t(default: 10)"
-  echo -e "\tCACHE_SIZE\t\t\t(default: 16GB)"
+  echo -e "\tCACHE_SIZE\t\t\tSize of the block cache (default: 16GB)"
  echo -e "\tCOMPRESSION_MAX_DICT_BYTES"
-  echo -e "\tCOMPRESSION_TYPE\t\t(default: zstd)"
-  echo -e "\tDURATION"
+  echo -e "\tCOMPRESSION_TYPE\t\tDefault compression (default: zstd)"
+  echo -e "\tBOTTOMMOST_COMPRESSION\t\t(default: none)"
+  echo -e "\tMIN_LEVEL_TO_COMPRESS\t\tValue for min_level_to_compress for Leveled"
+  echo -e "\tCOMPRESSION_SIZE_PERCENT\tValue for compression_size_percent for Universal"
+  echo -e "\tDURATION\t\t\tNumber of seconds for which the test runs"
+  echo -e "\tWRITES\t\t\t\tNumber of writes for which the test runs"
+  echo -e "\tWRITE_BUFFER_SIZE_MB\t\tThe size of the write buffer in MB (default: 128)"
+  echo -e "\tTARGET_FILE_SIZE_BASE_MB\tThe value for target_file_size_base in MB (default: 128)"
+  echo -e "\tMAX_BYTES_FOR_LEVEL_BASE_MB\tThe value for max_bytes_for_level_base in MB (default: 1024)"
+  echo -e "\tMAX_BACKGROUND_JOBS\t\tThe value for max_background_jobs (default: 16)"
+  echo -e "\tCACHE_INDEX_AND_FILTER_BLOCKS\tThe value for cache_index_and_filter_blocks (default: 0)"
+  echo -e "\tUSE_O_DIRECT\t\t\tUse O_DIRECT for user reads and compaction"
+  echo -e "\tSTATS_INTERVAL_SECONDS\t\tValue for stats_interval_seconds"
+  echo -e "\tREPORT_INTERVAL_SECONDS\t\tValue for report_interval_seconds"
+  echo -e "\tSUBCOMPACTIONS\t\t\tValue for subcompactions"
+  echo -e "\tCOMPACTION_STYLE\t\tOne of leveled, universal, blob. Default is leveled."
+  echo -e "\nEnvironment variables (mostly) for leveled compaction:"
+  echo -e "\tLEVEL0_FILE_NUM_COMPACTION_TRIGGER\t\tValue for level0_file_num_compaction_trigger"
+  echo -e "\tLEVEL0_SLOWDOWN_WRITES_TRIGGER\t\t\tValue for level0_slowdown_writes_trigger"
+  echo -e "\tLEVEL0_STOP_WRITES_TRIGGER\t\t\tValue for level0_stop_writes_trigger"
+  echo -e "\tPER_LEVEL_FANOUT\t\t\t\tValue for max_bytes_for_level_multiplier"
+  echo -e "\tSOFT_PENDING_COMPACTION_BYTES_LIMIT_IN_GB\tThe value for soft_pending_compaction_bytes_limit in GB"
+  echo -e "\tHARD_PENDING_COMPACTION_BYTES_LIMIT_IN_GB\tThe value for hard_pending_compaction_bytes_limit in GB"
+  echo -e "\nEnvironment variables for universal compaction:"
+  echo -e "\tUNIVERSAL_MIN_MERGE_WIDTH\tValue of min_merge_width option for universal"
+  echo -e "\tUNIVERSAL_MAX_MERGE_WIDTH\tValue of max_merge_width option for universal"
+  echo -e "\tUNIVERSAL_SIZE_RATIO\t\tValue of size_ratio option for universal"
+  echo -e "\tUNIVERSAL_MAX_SIZE_AMP\t\tmax_size_amplification_percent for universal"
+  echo -e "\tUNIVERSAL_ALLOW_TRIVIAL_MOVE\tSet allow_trivial_move to true for universal, default is false"
+  echo -e "\nOptions for integrated BlobDB"
+  echo -e "\tMIN_BLOB_SIZE\tValue for min_blob_size"
+  echo -e "\tBLOB_FILE_SIZE\tValue for blob_file_size"
+  echo -e "\tBLOB_COMPRESSION_TYPE\tValue for blob_compression_type"
+  echo -e "\tBLOB_GC_AGE_CUTOFF\tValue for blob_garbage_collection_age_cutoff"
+  echo -e "\tBLOB_GC_FORCE_THRESHOLD\tValue for blob_garbage_collection_force_threshold"
 }

 if [ $# -lt 1 ]; then
@ -106,6 +140,18 @@ if [ ! -z $DB_BENCH_NO_SYNC ]; then
  syncval="0";
 fi

+compaction_style=${COMPACTION_STYLE:-leveled}
+if [ $compaction_style = "leveled" ]; then
+  echo Use leveled compaction
+elif [ $compaction_style = "universal" ]; then
+  echo Use universal compaction
+elif [ $compaction_style = "blob" ]; then
+  echo Use blob compaction
+else
+  echo COMPACTION_STYLE is :: $COMPACTION_STYLE :: and must be one of leveled, universal, blob
+  exit $EXIT_INVALID_ARGS
+fi
+
 num_threads=${NUM_THREADS:-64}
 mb_written_per_sec=${MB_WRITE_PER_SEC:-0}
 # Only for tests that do range scans
@ -113,19 +159,81 @@ num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10}
 cache_size=${CACHE_SIZE:-$((17179869184))}
 compression_max_dict_bytes=${COMPRESSION_MAX_DICT_BYTES:-0}
 compression_type=${COMPRESSION_TYPE:-zstd}
+min_level_to_compress=${MIN_LEVEL_TO_COMPRESS:-"-1"}
+compression_size_percent=${COMPRESSION_SIZE_PERCENT:-"-1"}
+
 duration=${DURATION:-0}
+writes=${WRITES:-0}

 num_keys=${NUM_KEYS:-8000000000}
 key_size=${KEY_SIZE:-20}
 value_size=${VALUE_SIZE:-400}
 block_size=${BLOCK_SIZE:-8192}
+write_buffer_mb=${WRITE_BUFFER_SIZE_MB:-128}
+target_file_mb=${TARGET_FILE_SIZE_BASE_MB:-128}
+l1_mb=${MAX_BYTES_FOR_LEVEL_BASE_MB:-1024}
+max_background_jobs=${MAX_BACKGROUND_JOBS:-16}
+stats_interval_seconds=${STATS_INTERVAL_SECONDS:-60}
+report_interval_seconds=${REPORT_INTERVAL_SECONDS:-5}
+subcompactions=${SUBCOMPACTIONS:-1}
+per_level_fanout=${PER_LEVEL_FANOUT:-8}
+
+cache_index_and_filter=${CACHE_INDEX_AND_FILTER_BLOCKS:-0}
+if [[ $cache_index_and_filter -eq 0 ]]; then
+  cache_meta_flags=""
+elif [[ $cache_index_and_filter -eq 1 ]]; then
+  cache_meta_flags="\
+  --cache_index_and_filter_blocks=$cache_index_and_filter \
+  --cache_high_pri_pool_ratio=0.5"
+else
+  echo CACHE_INDEX_AND_FILTER_BLOCKS was $CACHE_INDEX_AND_FILTER_BLOCKS but must be 0 or 1
+  exit $EXIT_INVALID_ARGS
+fi
+
+soft_pending_arg=""
+if [ ! -z $SOFT_PENDING_COMPACTION_BYTES_LIMIT_IN_GB ]; then
+  soft_pending_bytes=$( echo $SOFT_PENDING_COMPACTION_BYTES_LIMIT_IN_GB | \
+    awk '{ printf "%.0f", $1 * GB }' GB=$G )
+  soft_pending_arg="--soft_pending_compaction_bytes_limit=$soft_pending_bytes"
+fi
+
+hard_pending_arg=""
+if [ ! -z $HARD_PENDING_COMPACTION_BYTES_LIMIT_IN_GB ]; then
+  hard_pending_bytes=$( echo $HARD_PENDING_COMPACTION_BYTES_LIMIT_IN_GB | \
+    awk '{ printf "%.0f", $1 * GB }' GB=$G )
+  hard_pending_arg="--hard_pending_compaction_bytes_limit=$hard_pending_bytes"
+fi
+
+o_direct_flags=""
+if [ ! -z $USE_O_DIRECT ]; then
+  # TODO: deal with flags only supported in new versions, like prepopulate_block_cache
+  #o_direct_flags="--use_direct_reads --use_direct_io_for_flush_and_compaction --prepopulate_block_cache=1"
+  o_direct_flags="--use_direct_reads --use_direct_io_for_flush_and_compaction"
+fi

-const_params="
+univ_min_merge_width=${UNIVERSAL_MIN_MERGE_WIDTH:-2}
+univ_max_merge_width=${UNIVERSAL_MAX_MERGE_WIDTH:-20}
+univ_size_ratio=${UNIVERSAL_SIZE_RATIO:-1}
+univ_max_size_amp=${UNIVERSAL_MAX_SIZE_AMP:-200}
+
+if [ ! -z $UNIVERSAL_ALLOW_TRIVIAL_MOVE ]; then
+  univ_allow_trivial_move=1
+else
+  univ_allow_trivial_move=0
+fi
+
+min_blob_size=${MIN_BLOB_SIZE:-0}
+blob_file_size=${BLOB_FILE_SIZE:-$(( 256 * $M ))}
+blob_compression_type=${BLOB_COMPRESSION_TYPE:-${compression_type}}
+blob_gc_age_cutoff=${BLOB_GC_AGE_CUTOFF:-"0.25"}
+blob_gc_force_threshold=${BLOB_GC_FORCE_THRESHOLD:-1}
+
+const_params_base="
  --db=$DB_DIR \
  --wal_dir=$WAL_DIR \
  \
  --num=$num_keys \
-  --num_levels=6 \
+  --num_levels=8 \
  --key_size=$key_size \
  --value_size=$value_size \
  --block_size=$block_size \
@ -134,49 +242,107 @@ const_params="
  --compression_max_dict_bytes=$compression_max_dict_bytes \
  --compression_ratio=0.5 \
  --compression_type=$compression_type \
-  --level_compaction_dynamic_level_bytes=true \
  --bytes_per_sync=$((8 * M)) \
-  --cache_index_and_filter_blocks=0 \
-  --pin_l0_filter_and_index_blocks_in_cache=1 \
+  $cache_meta_flags \
+  $o_direct_flags \
  --benchmark_write_rate_limit=$(( 1024 * 1024 * $mb_written_per_sec )) \
  \
-  --write_buffer_size=$((128 * M)) \
-  --target_file_size_base=$((128 * M)) \
-  --max_bytes_for_level_base=$((1 * G)) \
+  --write_buffer_size=$(( $write_buffer_mb * M)) \
+  --target_file_size_base=$(( $target_file_mb * M)) \
+  --max_bytes_for_level_base=$(( $l1_mb * M)) \
  \
  --verify_checksum=1 \
  --delete_obsolete_files_period_micros=$((60 * M)) \
-  --max_bytes_for_level_multiplier=8 \
+  --max_bytes_for_level_multiplier=$per_level_fanout \
  \
  --statistics=0 \
  --stats_per_interval=1 \
-  --stats_interval_seconds=60 \
+  --stats_interval_seconds=$stats_interval_seconds \
+  --report_interval_seconds=$report_interval_seconds \
  --histogram=1 \
  \
  --memtablerep=skip_list \
  --bloom_bits=10 \
  --open_files=-1 \
+  --subcompactions=$subcompactions \
  \
  $bench_args"

+level_const_params="
+  $const_params_base \
+  --compaction_style=0 \
+  --min_level_to_compress=$min_level_to_compress \
+  --level_compaction_dynamic_level_bytes=true \
+  --pin_l0_filter_and_index_blocks_in_cache=1 \
+  $soft_pending_arg \
+  $hard_pending_arg \
+"
+
+# TODO: these inherit level_const_params because the non-blob LSM tree uses leveled compaction
+blob_const_params="
+  $level_const_params \
+  --enable_blob_files=true \
+  --min_blob_size=$min_blob_size \
+  --blob_file_size=$blob_file_size \
+  --blob_compression_type=$blob_compression_type \
+  --enable_blob_garbage_collection=true \
+  --blob_garbage_collection_age_cutoff=$blob_gc_age_cutoff \
+  --blob_garbage_collection_force_threshold=$blob_gc_force_threshold \
+"
+
+# TODO:
+#   pin_l0_filter_and..., is this OK?
+univ_const_params="
+  $const_params_base \
+  --compaction_style=1 \
+  --universal_compression_size_percent=$compression_size_percent \
+  --pin_l0_filter_and_index_blocks_in_cache=1 \
+  --universal_min_merge_width=$univ_min_merge_width \
+  --universal_max_merge_width=$univ_max_merge_width \
+  --universal_size_ratio=$univ_size_ratio \
+  --universal_max_size_amplification_percent=$univ_max_size_amp \
+  --universal_allow_trivial_move=$univ_allow_trivial_move \
+"
+
+if [ $compaction_style == "leveled" ]; then
+  const_params="$level_const_params"
+  l0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-4}
+  l0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20}
+  l0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30}
+elif [ $compaction_style == "universal" ]; then
+  const_params="$univ_const_params"
+  l0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-8}
+  l0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20}
+  l0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30}
+else
+  # compaction_style == "blob"
+  const_params="$blob_const_params"
+  l0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-4}
+  l0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20}
+  l0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30}
+fi
+
 l0_config="
-  --level0_file_num_compaction_trigger=4 \
-  --level0_stop_writes_trigger=20"
+  --level0_file_num_compaction_trigger=$l0_file_num_compaction_trigger \
+  --level0_slowdown_writes_trigger=$l0_slowdown_writes_trigger \
+  --level0_stop_writes_trigger=$l0_stop_writes_trigger"

+# You probably don't want to set both --writes and --duration
 if [ $duration -gt 0 ]; then
  const_params="$const_params --duration=$duration"
 fi
+if [ $writes -gt 0 ]; then
+  const_params="$const_params --writes=$writes"
+fi

 params_w="$l0_config \
-          --max_background_compactions=16 \
+          --max_background_jobs=$max_background_jobs \
          --max_write_buffer_number=8 \
-          --max_background_flushes=7 \
          $const_params"

-params_bulkload="--max_background_compactions=16 \
+params_bulkload="--max_background_jobs=$max_background_jobs \
                 --max_write_buffer_number=8 \
                 --allow_concurrent_memtable_write=false \
-                 --max_background_flushes=7 \
                 --level0_file_num_compaction_trigger=$((10 * M)) \
                 --level0_slowdown_writes_trigger=$((10 * M)) \
                 --level0_stop_writes_trigger=$((10 * M)) \
@ -204,6 +370,28 @@ params_univ_compact="$const_params \
                --level0_slowdown_writes_trigger=16 \
                --level0_stop_writes_trigger=20"

+tsv_header="ops_sec\tmb_sec\tlsm_sz\tblob_sz\tc_wgb\tw_amp\tc_mbps\tc_wsecs\tc_csecs\tb_rgb\tb_wgb\tusec_op\tp50\tp99\tp99.9\tp99.99\tpmax\tuptime\tstall%\tNstall\tu_cpu\ts_cpu\trss\ttest\tdate\tversion\tjob_id"
+
+# Build the command prefix that is put in front of each db_bench invocation.
+#
+# Arguments: $1 - file that /usr/bin/time writes its '%e %U %S' line to
+# Globals:   NUMACTL (read), duration (read)
+# Outputs:   the prefix on stdout: /usr/bin/time, optionally followed by
+#            numactl and timeout
+function get_cmd() {
+  local output=$1
+
+  # Quote NUMACTL so that a value containing whitespace still counts as set.
+  local numa=""
+  if [ -n "$NUMACTL" ]; then
+    numa="numactl --interleave=all "
+  fi
+
+  # Try to use timeout when duration is set because some tests (revrange*) hang
+  # for some versions (v6.10, v6.11).
+  local timeout_cmd=""
+  if [ "$duration" -gt 0 ]; then
+    if hash timeout ; then
+      # Allow 600 seconds beyond the requested duration before giving up.
+      timeout_cmd="timeout $(( duration + 600 ))"
+    fi
+  fi
+
+  echo "/usr/bin/time -f '%e %U %S' -o $output $numa $timeout_cmd"
+}
+
 function month_to_num() {
    local date_str=$1
    date_str="${date_str/Jan/01}"
@ -221,6 +409,45 @@ function month_to_num() {
    echo $date_str
 }

+function start_stats {
+  output=$1
+  iostat -y -mx 1  >& $output.io &
+  vmstat 1 >& $output.vm &
+  # tail -1 because "ps | grep db_bench" returns 2 entries and we want the second
+  while :; do ps aux | grep db_bench | grep -v grep | tail -1; sleep 10; done >& $output.ps &
+  # This sets a global value
+  pspid=$!
+
+  while :; do
+    b_gb=$( ls -l $DB_DIR 2> /dev/null | grep blob | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' )
+    s_gb=$( ls -l $DB_DIR 2> /dev/null | grep sst | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' )
+    l_gb=$( ls -l $WAL_DIR 2> /dev/null | grep log | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' )
+    a_gb=$( ls -l $DB_DIR 2> /dev/null | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' )
+    ts=$( date +%H%M%S )
+    echo -e "${a_gb}\t${s_gb}\t${l_gb}\t${b_gb}\t${ts}"
+    sleep 10
+  done >& $output.sizes &
+  # This sets a global value
+  szpid=$!
+}
+
+function stop_stats {
+  output=$1
+  kill $pspid
+  kill $szpid
+  killall iostat
+  killall vmstat
+  sleep 1
+  gzip $output.io
+  gzip $output.vm
+
+  am=$( sort -nk 1,1 $output.sizes | tail -1 | awk '{ print $1 }' )
+  sm=$( sort -nk 2,2 $output.sizes | tail -1 | awk '{ print $2 }' )
+  lm=$( sort -nk 3,3 $output.sizes | tail -1 | awk '{ print $3 }' )
+  bm=$( sort -nk 4,4 $output.sizes | tail -1 | awk '{ print $4 }' )
+  echo -e "max sizes (GB): $am all, $sm sst, $lm log, $bm blob" >> $output.sizes
+}
+
 function summarize_result {
  test_out=$1
  test_name=$2
@ -231,39 +458,77 @@ function summarize_result {
  # happen then empty output from grep when searching for "Sum" will cause
  # syntax errors.
  version=$( grep ^RocksDB: $test_out | awk '{ print $3 }' )
-  date=$( grep ^Date: $test_out | awk '{ print $6 "-" $3 "-" $4 "T" $5 ".000" }' )
-  iso_date=$( month_to_num $date )
-  tz=$( date "+%z" )
-  iso_tz="${tz:0:3}:${tz:3:2}"
-  iso_date="$iso_date$iso_tz"
+  date=$( grep ^Date: $test_out | awk '{ print $6 "-" $3 "-" $4 "T" $5 }' )
+  my_date=$( month_to_num $date )
  uptime=$( grep ^Uptime\(secs $test_out | tail -1 | awk '{ printf "%.0f", $2 }' )
-  stall_time=$( grep "^Cumulative stall" $test_out | tail -1  | awk '{  print $3 }' )
  stall_pct=$( grep "^Cumulative stall" $test_out| tail -1  | awk '{  print $5 }' )
+  nstall=$( grep ^Stalls\(count\):  $test_out | tail -1 | awk '{ print $2 + $6 + $10 + $14 + $18 + $20 }' )
  ops_sec=$( grep ^${bench_name} $test_out | awk '{ print $5 }' )
  mb_sec=$( grep ^${bench_name} $test_out | awk '{ print $7 }' )
-  l0_wgb=$( grep "^  L0" $test_out | tail -1 | awk '{ print $9 }' )
-  sum_wgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ print $9 }' )
-  sum_size=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.1f", $3 / 1024.0 }' )
-  wamp=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.1f", $12 }' )
-  if [[ "$sum_wgb" == "" ]]; then
-      wmb_ps=""
+
+  flush_wgb=$( grep "^Flush(GB)" $test_out | tail -1 | awk '{ print $3 }' | tr ',' ' ' | awk '{ print $1 }' )
+  sum_wgb=$( grep "^Cumulative compaction" $test_out | tail -1 | awk '{ printf "%.1f", $3 }' )
+  cmb_ps=$( grep "^Cumulative compaction" $test_out | tail -1 | awk '{ printf "%.1f", $6 }' )
+  if [[ "$sum_wgb" == "" || "$flush_wgb" == "" || "$flush_wgb" == "0.000" ]]; then
+    wamp=""
  else
-      wmb_ps=$( echo "scale=1; ( $sum_wgb * 1024.0 ) / $uptime" | bc )
+    wamp=$( echo "$sum_wgb / $flush_wgb" | bc -l | awk '{ printf "%.1f", $1 }' )
  fi
+  c_wsecs=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $15 }' )
+  c_csecs=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $16 }' )
+
+  lsm_size=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f%s", $3, $4 }' )
+  blob_size=$( grep "^Blob file count:" $test_out | tail -1 | awk '{ printf "%s%s", $7, $8 }' )
+
+  b_rgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $21 }' )
+  b_wgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $22 }' )
+
  usecs_op=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $3 }' )
  p50=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.1f", $3 }' )
-  p75=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.1f", $5 }' )
  p99=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $7 }' )
  p999=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $9 }' )
  p9999=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $11 }' )
+  pmax=$( grep "^Min: " $test_out | grep Median: | grep Max: | awk '{ printf "%.0f", $6 }' )
+
+  time_out=$test_out.time
+  u_cpu=$( awk '{ printf "%.1f", $2 / 1000.0 }' $time_out )
+  s_cpu=$( awk '{ printf "%.1f", $3 / 1000.0  }' $time_out )
+
+  rss="na"
+  if [ -f $test_out.stats.ps ]; then
+    rss=$(  tail -1 $test_out.stats.ps | awk '{ printf "%.1f\n", $6 / (1024 * 1024) }' )
+  fi

  # if the report TSV (Tab Separate Values) file does not yet exist, create it and write the header row to it
  if [ ! -f "$report" ]; then
-    echo -e "ops_sec\tmb_sec\ttotal_size_gb\tlevel0_size_gb\tsum_gb\twrite_amplification\twrite_mbps\tusec_op\tpercentile_50\tpercentile_75\tpercentile_99\tpercentile_99.9\tpercentile_99.99\tuptime\tstall_time\tstall_percent\ttest_name\ttest_date\trocksdb_version\tjob_id" \
-      >> $report
+    echo -e "# ops_sec - operations per second" >> $report
+    echo -e "# mb_sec - ops_sec * size-of-operation-in-MB" >> $report
+    echo -e "# lsm_sz - size of LSM tree" >> $report
+    echo -e "# blob_sz - size of BlobDB logs" >> $report
+    echo -e "# c_wgb - GB written by compaction" >> $report
+    echo -e "# w_amp - Write-amplification as (bytes written by compaction / bytes written by memtable flush)" >> $report
+    echo -e "# c_mbps - Average write rate for compaction" >> $report
+    echo -e "# c_wsecs - Wall clock seconds doing compaction" >> $report
+    echo -e "# c_csecs - CPU seconds doing compaction" >> $report
+    echo -e "# b_rgb - Blob compaction read GB" >> $report
+    echo -e "# b_wgb - Blob compaction write GB" >> $report
+    echo -e "# usec_op - Microseconds per operation" >> $report
+    echo -e "# p50, p99, p99.9, p99.99 - 50th, 99th, 99.9th, 99.99th percentile response time in usecs" >> $report
+    echo -e "# pmax - max response time in usecs" >> $report
+    echo -e "# uptime - RocksDB uptime in seconds" >> $report
+    echo -e "# stall% - Percentage of time writes are stalled" >> $report
+    echo -e "# Nstall - Number of stalls" >> $report
+    echo -e "# u_cpu - #seconds/1000 of user CPU" >> $report
+    echo -e "# s_cpu - #seconds/1000 of system CPU" >> $report
+    echo -e "# rss - max RSS in GB for db_bench process" >> $report
+    echo -e "# test - Name of test" >> $report
+    echo -e "# date - Date/time of test" >> $report
+    echo -e "# version - RocksDB version" >> $report
+    echo -e "# job_id - User-provided job ID" >> $report
+    echo -e $tsv_header >> $report
  fi

-  echo -e "$ops_sec\t$mb_sec\t$sum_size\t$l0_wgb\t$sum_wgb\t$wamp\t$wmb_ps\t$usecs_op\t$p50\t$p75\t$p99\t$p999\t$p9999\t$uptime\t$stall_time\t$stall_pct\t$test_name\t$iso_date\t$version\t$job_id" \
+  echo -e "$ops_sec\t$mb_sec\t$lsm_size\t$blob_size\t$sum_wgb\t$wamp\t$cmb_ps\t$c_wsecs\t$c_csecs\t$b_rgb\t$b_wgb\t$usecs_op\t$p50\t$p99\t$p999\t$p9999\t$pmax\t$uptime\t$stall_pct\t$nstall\t$u_cpu\t$s_cpu\t$rss\t$test_name\t$my_date\t$version\t$job_id" \
    >> $report
 }

@ -272,7 +537,8 @@ function run_bulkload {
  # client can discover where to restart a load after a crash. I think this is a good way to load.
  echo "Bulk loading $num_keys random keys"
  log_file_name=$output_dir/benchmark_bulkload_fillrandom.log
-  cmd="./db_bench --benchmarks=fillrandom \
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=fillrandom \
       --use_existing_db=0 \
       --disable_auto_compactions=1 \
       --sync=0 \
@ -282,6 +548,7 @@ function run_bulkload {
       --allow_concurrent_memtable_write=false \
       --disable_wal=1 \
       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
       2>&1 | tee -a $log_file_name"
  if [[ "$job_id" != "" ]]; then
    echo "Job ID: ${job_id}" > $log_file_name
@ -294,7 +561,8 @@ function run_bulkload {

  echo "Compacting..."
  log_file_name=$output_dir/benchmark_bulkload_compact.log
-  cmd="./db_bench --benchmarks=compact \
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=compact \
       --use_existing_db=1 \
       --disable_auto_compactions=1 \
       --sync=0 \
@ -333,7 +601,8 @@ function run_manual_compaction_worker {
  fi

  # Make sure that fillrandom uses the same compaction options as compact.
-  cmd="./db_bench --benchmarks=fillrandom \
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=fillrandom \
       --use_existing_db=0 \
       --disable_auto_compactions=0 \
       --sync=0 \
@ -432,42 +701,106 @@ function run_fillseq {
    test_name=fillseq.wal_enabled.v${value_size}
  fi

+  # For Leveled compaction hardwire this to 0 so that data that is trivial-moved
+  # to larger levels (3, 4, etc) will be compressed.
+  if [ $compaction_style == "leveled" ]; then
+    comp_arg="--min_level_to_compress=0"
+  elif [ $compaction_style == "universal" ]; then
+    if [ ! -z $UNIVERSAL_ALLOW_TRIVIAL_MOVE ]; then
+      # See GetCompressionFlush where compression_size_percent < 1 means use the default
+      # compression which is needed because trivial moves are enabled
+      comp_arg="--universal_compression_size_percent=-1"
+    else
+      # See GetCompressionFlush where compression_size_percent > 0 means no compression.
+      # Don't set anything here because compression_size_percent is set in univ_const_params
+      comp_arg=""
+    fi
+  else
+    # compaction_style == "blob"
+    comp_arg="--min_level_to_compress=0"
+  fi
+
  echo "Loading $num_keys keys sequentially"
-  cmd="./db_bench --benchmarks=fillseq \
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=fillseq \
+       $params_fillseq \
+       $comp_arg \
       --use_existing_db=0 \
       --sync=0 \
-       $params_fillseq \
-       --min_level_to_compress=0 \
       --threads=1 \
       --memtablerep=vector \
       --allow_concurrent_memtable_write=false \
       --disable_wal=$1 \
       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
       2>&1 | tee -a $log_file_name"
-
  if [[ "$job_id" != "" ]]; then
    echo "Job ID: ${job_id}" > $log_file_name
    echo $cmd | tee -a $log_file_name
  else
    echo $cmd | tee $log_file_name
  fi
+  start_stats $log_file_name.stats
  eval $cmd
+  stop_stats $log_file_name.stats

  # The constant "fillseq" which we pass to db_bench is the benchmark name.
  summarize_result $log_file_name $test_name fillseq
 }

+function run_lsm {
+  # This flushes the memtable and L0 to get the LSM tree into a deterministic
+  # state for read-only tests that will follow.
+  echo "Flush memtable, wait, compact L0, wait"
+  job=$1
+
+  if [ $job = flush_mt_l0 ]; then
+    benchmarks=levelstats,flush,waitforcompaction,compact0,waitforcompaction,memstats,levelstats
+  elif [ $job = waitforcompaction ]; then
+    benchmarks=levelstats,waitforcompaction,memstats,levelstats
+  else
+    echo Job unknown: $job
+    exit $EXIT_NOT_COMPACTION_TEST
+  fi
+
+  log_file_name=$output_dir/benchmark_${job}.log
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=$benchmarks \
+       --use_existing_db=1 \
+       --sync=0 \
+       $params_w \
+       --threads=1 \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  start_stats $log_file_name.stats
+  # waitforcompaction can hang with universal (compaction_style=1)
+  # see bug https://github.com/facebook/rocksdb/issues/9275
+  eval $cmd
+  stop_stats $log_file_name.stats
+  # Don't summarize, the log doesn't have the output needed for it
+}
+
 function run_change {
-  operation=$1
-  echo "Do $num_keys random $operation"
-  log_file_name="$output_dir/benchmark_${operation}.t${num_threads}.s${syncval}.log"
-  cmd="./db_bench --benchmarks=$operation \
+  output_name=$1
+  grep_name=$2
+  benchmarks=$3
+  echo "Do $num_keys random $output_name"
+  log_file_name="$output_dir/benchmark_${output_name}.t${num_threads}.s${syncval}.log"
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=$benchmarks \
       --use_existing_db=1 \
       --sync=$syncval \
       $params_w \
       --threads=$num_threads \
       --merge_operator=\"put\" \
       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
       2>&1 | tee -a $log_file_name"
  if [[ "$job_id" != "" ]]; then
    echo "Job ID: ${job_id}" > $log_file_name
@ -475,19 +808,23 @@ function run_change {
  else
    echo $cmd | tee $log_file_name
  fi
+  start_stats $log_file_name.stats
  eval $cmd
-  summarize_result $log_file_name ${operation}.t${num_threads}.s${syncval} $operation
+  stop_stats $log_file_name.stats
+  summarize_result $log_file_name ${output_name}.t${num_threads}.s${syncval} $grep_name
 }

 function run_filluniquerandom {
  echo "Loading $num_keys unique keys randomly"
  log_file_name=$output_dir/benchmark_filluniquerandom.log
-  cmd="./db_bench --benchmarks=filluniquerandom \
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=filluniquerandom \
       --use_existing_db=0 \
       --sync=0 \
       $params_w \
       --threads=1 \
       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
       2>&1 | tee -a $log_file_name"
  if [[ "$job_id" != "" ]]; then
    echo "Job ID: ${job_id}" > $log_file_name
@ -495,18 +832,22 @@ function run_filluniquerandom {
  else
    echo $cmd | tee $log_file_name
  fi
+  start_stats $log_file_name.stats
  eval $cmd
+  stop_stats $log_file_name.stats
  summarize_result $log_file_name filluniquerandom filluniquerandom
 }

 function run_readrandom {
  echo "Reading $num_keys random keys"
  log_file_name="${output_dir}/benchmark_readrandom.t${num_threads}.log"
-  cmd="./db_bench --benchmarks=readrandom \
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=readrandom \
       --use_existing_db=1 \
       $params_w \
       --threads=$num_threads \
       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
       2>&1 | tee -a $log_file_name"
  if [[ "$job_id" != "" ]]; then
    echo "Job ID: ${job_id}" > $log_file_name
@ -514,19 +855,23 @@ function run_readrandom {
  else
    echo $cmd | tee $log_file_name
  fi
+  start_stats $log_file_name.stats
  eval $cmd
+  stop_stats $log_file_name.stats
  summarize_result $log_file_name readrandom.t${num_threads} readrandom
 }

 function run_multireadrandom {
  echo "Multi-Reading $num_keys random keys"
  log_file_name="${output_dir}/benchmark_multireadrandom.t${num_threads}.log"
-  cmd="./db_bench --benchmarks=multireadrandom \
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=multireadrandom \
       --use_existing_db=1 \
       --threads=$num_threads \
       --batch_size=10 \
       $params_w \
       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
       2>&1 | tee -a $log_file_name"
  if [[ "$job_id" != "" ]]; then
    echo "Job ID: ${job_id}" > $log_file_name
@ -534,7 +879,9 @@ function run_multireadrandom {
  else
    echo $cmd | tee $log_file_name
  fi
+  start_stats $log_file_name.stats
  eval $cmd
+  stop_stats $log_file_name.stats
  summarize_result $log_file_name multireadrandom.t${num_threads} multireadrandom
 }

@ -542,13 +889,15 @@ function run_readwhile {
  operation=$1
  echo "Reading $num_keys random keys while $operation"
  log_file_name="${output_dir}/benchmark_readwhile${operation}.t${num_threads}.log"
-  cmd="./db_bench --benchmarks=readwhile${operation} \
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=readwhile${operation} \
       --use_existing_db=1 \
       --sync=$syncval \
       $params_w \
       --threads=$num_threads \
       --merge_operator=\"put\" \
       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
       2>&1 | tee -a $log_file_name"
  if [[ "$job_id" != "" ]]; then
    echo "Job ID: ${job_id}" > $log_file_name
@ -556,7 +905,9 @@ function run_readwhile {
  else
    echo $cmd | tee $log_file_name
  fi
+  start_stats $log_file_name.stats
  eval $cmd
+  stop_stats $log_file_name.stats
  summarize_result $log_file_name readwhile${operation}.t${num_threads} readwhile${operation}
 }

@ -565,8 +916,9 @@ function run_rangewhile {
  full_name=$2
  reverse_arg=$3
  log_file_name="${output_dir}/benchmark_${full_name}.t${num_threads}.log"
+  time_cmd=$( get_cmd $log_file_name.time )
  echo "Range scan $num_keys random keys while ${operation} for reverse_iter=${reverse_arg}"
-  cmd="./db_bench --benchmarks=seekrandomwhile${operation} \
+  cmd="$time_cmd ./db_bench --benchmarks=seekrandomwhile${operation} \
       --use_existing_db=1 \
       --sync=$syncval \
       $params_w \
@ -575,9 +927,12 @@ function run_rangewhile {
       --seek_nexts=$num_nexts_per_seek \
       --reverse_iterator=$reverse_arg \
       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
       2>&1 | tee -a $log_file_name"
  echo $cmd | tee $log_file_name
+  start_stats $log_file_name.stats
  eval $cmd
+  stop_stats $log_file_name.stats
  summarize_result $log_file_name ${full_name}.t${num_threads} seekrandomwhile${operation}
 }

@ -585,14 +940,16 @@ function run_range {
  full_name=$1
  reverse_arg=$2
  log_file_name="${output_dir}/benchmark_${full_name}.t${num_threads}.log"
+  time_cmd=$( get_cmd $log_file_name.time )
  echo "Range scan $num_keys random keys for reverse_iter=${reverse_arg}"
-  cmd="./db_bench --benchmarks=seekrandom \
+  cmd="$time_cmd ./db_bench --benchmarks=seekrandom \
       --use_existing_db=1 \
       $params_w \
       --threads=$num_threads \
       --seek_nexts=$num_nexts_per_seek \
       --reverse_iterator=$reverse_arg \
       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
       2>&1 | tee -a $log_file_name"
  if [[ "$job_id" != "" ]]; then
    echo "Job ID: ${job_id}" > $log_file_name
@ -600,18 +957,22 @@ function run_range {
  else
    echo $cmd | tee $log_file_name
  fi
+  start_stats $log_file_name.stats
  eval $cmd
+  stop_stats $log_file_name.stats
  summarize_result $log_file_name ${full_name}.t${num_threads} seekrandom
 }

 function run_randomtransaction {
  echo "..."
  log_file_name=$output_dir/benchmark_randomtransaction.log
-  cmd="./db_bench $params_r --benchmarks=randomtransaction \
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench $params_w --benchmarks=randomtransaction \
       --num=$num_keys \
       --transaction_db \
       --threads=5 \
       --transaction_sets=5 \
+       --report_file=${log_file_name}.r.csv \
       2>&1 | tee $log_file_name"
  if [[ "$job_id" != "" ]]; then
    echo "Job ID: ${job_id}" > $log_file_name
@ -619,7 +980,9 @@ function run_randomtransaction {
  else
    echo $cmd | tee $log_file_name
  fi
+  start_stats $log_file_name.stats
  eval $cmd
+  stop_stats $log_file_name.stats
 }

 function now() {
@ -641,22 +1004,25 @@ for job in ${jobs[@]}; do
  start=$(now)
  if [ $job = bulkload ]; then
    run_bulkload
+  elif [ $job = flush_mt_l0 ]; then
+    run_lsm flush_mt_l0
+  elif [ $job = waitforcompaction ]; then
+    run_lsm waitforcompaction
  elif [ $job = fillseq_disable_wal ]; then
    run_fillseq 1
  elif [ $job = fillseq_enable_wal ]; then
    run_fillseq 0
  elif [ $job = overwrite ]; then
-    syncval="0"
-    params_w="$params_w \
-        --writes=125000000 \
-        --subcompactions=4 \
-        --soft_pending_compaction_bytes_limit=$((1 * T)) \
-        --hard_pending_compaction_bytes_limit=$((4 * T)) "
-    run_change overwrite
+    run_change overwrite overwrite overwrite
+  elif [ $job = overwritesome ]; then
+    # Runs the overwrite benchmark but reports its results under a different name (overwritesome), so that overwrite can be run twice in one benchmark run.
+    run_change overwritesome overwrite overwrite
+  elif [ $job = overwriteandwait ]; then
+    run_change overwriteandwait overwrite overwrite,waitforcompaction
  elif [ $job = updaterandom ]; then
-    run_change updaterandom
+    run_change updaterandom updaterandom updaterandom
  elif [ $job = mergerandom ]; then
-    run_change mergerandom
+    run_change mergerandom mergerandom mergerandom
  elif [ $job = filluniquerandom ]; then
    run_filluniquerandom
  elif [ $job = readrandom ]; then
@ -696,7 +1062,7 @@ for job in ${jobs[@]}; do
    echo "Completed $job (ID: $job_id) in $((end-start)) seconds" | tee -a $schedule
  fi

-  echo -e "ops/sec\tmb/sec\tSize-GB\tL0_GB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tp99.9\tp99.99\tUptime\tStall-time\tStall%\tTest\tDate\tVersion\tJob-ID"
+  echo -e $tsv_header
  tail -1 $report

 done