regression_test.sh: kill very old db_bench (and more) (#10441)

Summary:
If a db_bench process hangs or runs away on a machine, it can
prevent regression_test.sh from ever making progress. To fix that,
regression_test.sh now kills any db_bench process that is more than
12 hours old. Finding db_bench process IDs is also more reliable now
that it uses pidof instead of string matching (grep) on ps output.

I also had to make some other updates to get local runs working
reliably:
* Fix convoluted quoting and other unnecessary complexity in db_bench_cmd
* Only save a DB for re-use when building it succeeds
* Report the failed command in more cases
* Add safeguards against "rm -rf ."

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10441

Test Plan:
Manual runs (local and remote), with temporary changes such as
lowering the age threshold to a manageable value.

Reviewed By: riversand963

Differential Revision: D38285537

Pulled By: pdillinger

fbshipit-source-id: 4d598876aedc38ac4bd9d8ddf32c5995d8e44db8
main
Peter Dillinger 2 years ago committed by Facebook GitHub Bot
parent cc8ded6152
commit 9da97a3726
  1. 79
      tools/regression_test.sh

@ -46,6 +46,7 @@
# Default: 1 # Default: 1
# TEST_PATH: the root directory of the regression test. # TEST_PATH: the root directory of the regression test.
# Default: "/tmp/rocksdb/regression_test" # Default: "/tmp/rocksdb/regression_test"
# !!! NOTE !!! - a DB will also be saved in $TEST_PATH/../db
# RESULT_PATH: the directory where the regression results will be generated. # RESULT_PATH: the directory where the regression results will be generated.
# Default: "$TEST_PATH/current_time" # Default: "$TEST_PATH/current_time"
# REMOTE_USER_AT_HOST: If set, then test will run on the specified host under # REMOTE_USER_AT_HOST: If set, then test will run on the specified host under
@ -125,15 +126,14 @@ function main {
setup_test_directory setup_test_directory
if [ $TEST_MODE -le 1 ]; then if [ $TEST_MODE -le 1 ]; then
tmp=$DB_PATH test_remote "test -d $ORIGIN_PATH"
DB_PATH=$ORIGIN_PATH
test_remote "test -d $DB_PATH"
if [[ $? -ne 0 ]]; then if [[ $? -ne 0 ]]; then
echo "Building DB..." echo "Building DB..."
# compactall alone will not print ops or threads, which will fail update_report # compactall alone will not print ops or threads, which will fail update_report
run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0 run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0
# only save for future use on success
test_remote "mv $DB_PATH $ORIGIN_PATH"
fi fi
DB_PATH=$tmp
fi fi
if [ $TEST_MODE -ge 1 ]; then if [ $TEST_MODE -ge 1 ]; then
build_checkpoint build_checkpoint
@ -204,9 +204,32 @@ function init_arguments {
# $4 --- use_existing_db. Default: 1 # $4 --- use_existing_db. Default: 1
# $5 --- update_report. Default: 1 # $5 --- update_report. Default: 1
function run_db_bench { function run_db_bench {
# this will terminate all currently-running db_bench # Make sure no other db_bench is running. (Make sure command succeeds if pidof
find_db_bench_cmd="ps aux | grep db_bench | grep -v grep | grep -v aux | awk '{print \$2}'" # command exists but finds nothing.)
pids_cmd='pidof db_bench || pidof --version > /dev/null'
# But first, make best effort to kill any db_bench that have run for more
# than 12 hours, as that indicates a hung or runaway process.
kill_old_cmd='for PID in $(pidof db_bench); do [ "$(($(stat -c %Y /proc/$PID) + 43200))" -lt "$(date +%s)" ] && echo "Killing old db_bench $PID" && kill $PID && sleep 5 && kill -9 $PID && sleep 5; done; pidof --version > /dev/null'
if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
pids_cmd="$SSH $REMOTE_USER_AT_HOST '$pids_cmd'"
kill_old_cmd="$SSH $REMOTE_USER_AT_HOST '$kill_old_cmd'"
fi
eval $kill_old_cmd
exit_on_error $? "$kill_old_cmd"
pids_output="$(eval $pids_cmd)"
exit_on_error $? "$pids_cmd"
if [ "$pids_output" != "" ]; then
echo "Stopped regression_test.sh as there're still recent db_bench "
echo "processes running: $pids_output"
echo "Clean up test directory"
cleanup_test_directory $TEST_ROOT_DIR
exit 2
fi
# Build db_bench command
ops=${2:-$NUM_OPS} ops=${2:-$NUM_OPS}
threads=${3:-$NUM_THREADS} threads=${3:-$NUM_THREADS}
USE_EXISTING_DB=${4:-1} USE_EXISTING_DB=${4:-1}
@ -220,7 +243,7 @@ function run_db_bench {
options_file_arg=$(setup_options_file) options_file_arg=$(setup_options_file)
echo "$options_file_arg" echo "$options_file_arg"
# use `which time` to avoid using bash's internal time command # use `which time` to avoid using bash's internal time command
db_bench_cmd="("'\$(which time)'" -p $DB_BENCH_DIR/db_bench \ db_bench_cmd="\$(which time) -p $DB_BENCH_DIR/db_bench \
--benchmarks=$1 --db=$DB_PATH --wal_dir=$WAL_PATH \ --benchmarks=$1 --db=$DB_PATH --wal_dir=$WAL_PATH \
--use_existing_db=$USE_EXISTING_DB \ --use_existing_db=$USE_EXISTING_DB \
--perf_level=$PERF_LEVEL \ --perf_level=$PERF_LEVEL \
@ -248,38 +271,16 @@ function run_db_bench {
--seed=$SEED \ --seed=$SEED \
--multiread_batched=true \ --multiread_batched=true \
--batch_size=$MULTIREAD_BATCH_SIZE \ --batch_size=$MULTIREAD_BATCH_SIZE \
--multiread_stride=$MULTIREAD_STRIDE) 2>&1" --multiread_stride=$MULTIREAD_STRIDE 2>&1"
ps_cmd="ps aux"
if ! [ -z "$REMOTE_USER_AT_HOST" ]; then if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
echo "Running benchmark remotely on $REMOTE_USER_AT_HOST" echo "Running benchmark remotely on $REMOTE_USER_AT_HOST"
db_bench_cmd="$SSH $REMOTE_USER_AT_HOST \"$db_bench_cmd\"" db_bench_cmd="$SSH $REMOTE_USER_AT_HOST '$db_bench_cmd'"
ps_cmd="$SSH $REMOTE_USER_AT_HOST $ps_cmd"
fi
## make sure no db_bench is running
# The following statement is necessary make sure "eval $ps_cmd" will success.
# Otherwise, if we simply check whether "$(eval $ps_cmd | grep db_bench)" is
# successful or not, then it will always be false since grep will return
# non-zero status when there's no matching output.
ps_output="$(eval $ps_cmd)"
exit_on_error $? "$ps_cmd"
# perform the actual command to check whether db_bench is running
grep_output="$(eval $ps_cmd | grep db_bench | grep -v grep)"
if [ "$grep_output" != "" ]; then
echo "Stopped regression_test.sh as there're still db_bench processes running:"
echo $grep_output
echo "Clean up test directory"
cleanup_test_directory $TEST_ROOT_DIR
exit 2
fi fi
echo db_bench_cmd="$db_bench_cmd"
## run the db_bench # Run the db_bench command
cmd="($db_bench_cmd || db_bench_error=1) | tee -a $RESULT_PATH/$1" eval $db_bench_cmd | tee -a "$RESULT_PATH/$1"
exit_on_error $? exit_on_error ${PIPESTATUS[0]} db_bench
echo $cmd
eval $cmd
exit_on_error $db_bench_error
if [ $UPDATE_REPORT -ne 0 ]; then if [ $UPDATE_REPORT -ne 0 ]; then
update_report "$1" "$RESULT_PATH/$1" $ops $threads update_report "$1" "$RESULT_PATH/$1" $ops $threads
fi fi
@ -397,7 +398,7 @@ function test_remote {
function run_local { function run_local {
eval "$1" eval "$1"
exit_on_error $? exit_on_error $? "$1"
} }
function setup_options_file { function setup_options_file {
@ -416,8 +417,14 @@ function setup_options_file {
function setup_test_directory { function setup_test_directory {
echo "Deleting old regression test directories and creating new ones" echo "Deleting old regression test directories and creating new ones"
run_local 'test "$DB_PATH" != "."'
run_remote "rm -rf $DB_PATH" run_remote "rm -rf $DB_PATH"
if [ "$DB_BENCH_DIR" != "." ]; then
run_remote "rm -rf $DB_BENCH_DIR" run_remote "rm -rf $DB_BENCH_DIR"
fi
run_local 'test "$RESULT_PATH" != "."'
run_local "rm -rf $RESULT_PATH" run_local "rm -rf $RESULT_PATH"
if ! [ -z "$WAL_PATH" ]; then if ! [ -z "$WAL_PATH" ]; then

Loading…
Cancel
Save