Kill whitebox crash test if it is 15 minutes over the limit (#8341)

Summary:
The whitebox crash test can run significantly over its time limit, either because the test is slow or because no kill point is hit. Such an indefinitely running job can create problems when the test is periodically scheduled as a job. Instead, kill the job if it is 15 minutes over the limit.
Refactor the code slightly to consolidate the code for executing commands for white and black box tests.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8341

Test Plan: Run both the black-box and white-box tests with both natural and explicit kill conditions.

Reviewed By: jay-zhuang

Differential Revision: D28756170

fbshipit-source-id: f253149890e62ace78f871be927e093e9b12f49b
main
sdong 3 years ago committed by Facebook GitHub Bot
parent d561af487c
commit ab718b415f
  1. 80
      tools/db_crashtest.py

@ -461,6 +461,25 @@ def inject_inconsistencies_to_db_dir(dir_path):
with open(os.path.join(dir_path, fname), "w") as fd: with open(os.path.join(dir_path, fname), "w") as fd:
fd.write("garbage") fd.write("garbage")
def execute_cmd(cmd, timeout):
    """Run *cmd* as a child process, killing it if it outlives *timeout*.

    Args:
        cmd: Argument list passed to ``subprocess.Popen``.
        timeout: Seconds to wait for the child to exit before killing it.

    Returns:
        A tuple ``(hit_timeout, returncode, stdout_text, stderr_text)``.
        ``hit_timeout`` is True when the child was still running after
        *timeout* seconds and had to be killed.
    """
    child = subprocess.Popen(cmd, stderr=subprocess.PIPE,
                             stdout=subprocess.PIPE)
    print("Running db_stress with pid=%d: %s\n\n"
          % (child.pid, ' '.join(cmd)))

    try:
        outs, errs = child.communicate(timeout=timeout)
        hit_timeout = False
        # Printed whenever the child exits on its own within the timeout.
        print("WARNING: db_stress ended before kill: exitcode=%d\n"
              % child.returncode)
    except subprocess.TimeoutExpired:
        hit_timeout = True
        child.kill()
        print("KILLED %d\n" % child.pid)
        # Reap the killed child and collect whatever it wrote to the pipes.
        outs, errs = child.communicate()

    # The child may emit arbitrary bytes; substitute undecodable ones rather
    # than raising UnicodeDecodeError and losing the captured output.
    return (hit_timeout, child.returncode,
            outs.decode('utf-8', errors='replace'),
            errs.decode('utf-8', errors='replace'))
# This script runs and kills db_stress multiple times. It checks consistency # This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB. # in case of unsafe crashes in RocksDB.
@ -474,47 +493,26 @@ def blackbox_crash_main(args, unknown_args):
+ "total-duration=" + str(cmd_params['duration']) + "\n") + "total-duration=" + str(cmd_params['duration']) + "\n")
while time.time() < exit_time: while time.time() < exit_time:
run_had_errors = False
killtime = time.time() + cmd_params['interval']
cmd = gen_cmd(dict( cmd = gen_cmd(dict(
list(cmd_params.items()) list(cmd_params.items())
+ list({'db': dbname}.items())), unknown_args) + list({'db': dbname}.items())), unknown_args)
child = subprocess.Popen(cmd, stderr=subprocess.PIPE) hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params['interval'])
print("Running db_stress with pid=%d: %s\n\n"
% (child.pid, ' '.join(cmd)))
stop_early = False if not hit_timeout:
while time.time() < killtime: print('Exit Before Killing')
if child.poll() is not None: print('stdout:')
print("WARNING: db_stress ended before kill: exitcode=%d\n" print(outs)
% child.returncode) print('stderr:')
stop_early = True print(errs)
break sys.exit(2)
time.sleep(1)
if not stop_early:
if child.poll() is not None:
print("WARNING: db_stress ended before kill: exitcode=%d\n"
% child.returncode)
else:
child.kill()
print("KILLED %d\n" % child.pid)
time.sleep(1) # time to stabilize after a kill
while True: for line in errs.split('\n'):
line = child.stderr.readline().strip().decode('utf-8') if line != '' and not line.startswith('WARNING'):
if line == '':
break
elif not line.startswith('WARNING'):
run_had_errors = True run_had_errors = True
print('stderr has error message:') print('stderr has error message:')
print('***' + line + '***') print('***' + line + '***')
if run_had_errors:
sys.exit(2)
time.sleep(1) # time to stabilize before the next run time.sleep(1) # time to stabilize before the next run
if args.test_best_efforts_recovery: if args.test_best_efforts_recovery:
@ -614,14 +612,14 @@ def whitebox_crash_main(args, unknown_args):
print("Running:" + ' '.join(cmd) + "\n") # noqa: E999 T25377293 Grandfathered in print("Running:" + ' '.join(cmd) + "\n") # noqa: E999 T25377293 Grandfathered in
popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, # If the running time is 15 minutes over the run time, explicit kill and
stderr=subprocess.PIPE) # exit even if white box kill didn't hit. This is to guarantee run time
stdoutdata, stderrdata = popen.communicate() # limit, as if it runs as a job, running too long will create problems
if stdoutdata: # for job scheduling or execution.
stdoutdata = stdoutdata.decode('utf-8') # TODO detect a hanging condition. The job might run too long as RocksDB
if stderrdata: # hits a hanging bug.
stderrdata = stderrdata.decode('utf-8') hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd(
retncode = popen.returncode cmd, exit_time - time.time() + 900)
msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format( msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
check_mode, additional_opts['kill_random_test'], retncode)) check_mode, additional_opts['kill_random_test'], retncode))
@ -629,6 +627,10 @@ def whitebox_crash_main(args, unknown_args):
print(stdoutdata) print(stdoutdata)
print(stderrdata) print(stderrdata)
if hit_timeout:
print("Killing the run for running too long")
break
expected = False expected = False
if additional_opts['kill_random_test'] is None and (retncode == 0): if additional_opts['kill_random_test'] is None and (retncode == 0):
# we expect zero retncode if no kill option # we expect zero retncode if no kill option

Loading…
Cancel
Save