From ab718b415fc9b2a66a2ed642c18803f764839d7b Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 1 Jun 2021 09:33:50 -0700 Subject: [PATCH] Kill whitebox crash test if it is 15 minutes over the limit (#8341) Summary: Whitebox crash test can run significantly over the time limit for test slowness or no kiling points. This indefinite job can create problem when this test is periodically scheduled as a job. Instead, kill the job if it is 15 minutes over the limit. Refactor the code slightly to consolidate the code for executing commands for white and black box tests. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8341 Test Plan: Run both of black and white box tests with both of natual and explicit kill condition. Reviewed By: jay-zhuang Differential Revision: D28756170 fbshipit-source-id: f253149890e62ace78f871be927e093e9b12f49b --- tools/db_crashtest.py | 84 ++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 6c60ea7f7..263ea807d 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -461,6 +461,25 @@ def inject_inconsistencies_to_db_dir(dir_path): with open(os.path.join(dir_path, fname), "w") as fd: fd.write("garbage") +def execute_cmd(cmd, timeout): + child = subprocess.Popen(cmd, stderr=subprocess.PIPE, + stdout=subprocess.PIPE) + print("Running db_stress with pid=%d: %s\n\n" + % (child.pid, ' '.join(cmd))) + + try: + outs, errs = child.communicate(timeout=timeout) + hit_timeout = False + print("WARNING: db_stress ended before kill: exitcode=%d\n" + % child.returncode) + except subprocess.TimeoutExpired: + hit_timeout = True + child.kill() + print("KILLED %d\n" % child.pid) + outs, errs = child.communicate() + + return hit_timeout, child.returncode, outs.decode('utf-8'), errs.decode('utf-8') + # This script runs and kills db_stress multiple times. It checks consistency # in case of unsafe crashes in RocksDB. @@ -474,47 +493,26 @@ def blackbox_crash_main(args, unknown_args): + "total-duration=" + str(cmd_params['duration']) + "\n") while time.time() < exit_time: - run_had_errors = False - killtime = time.time() + cmd_params['interval'] - cmd = gen_cmd(dict( list(cmd_params.items()) + list({'db': dbname}.items())), unknown_args) - child = subprocess.Popen(cmd, stderr=subprocess.PIPE) - print("Running db_stress with pid=%d: %s\n\n" - % (child.pid, ' '.join(cmd))) - - stop_early = False - while time.time() < killtime: - if child.poll() is not None: - print("WARNING: db_stress ended before kill: exitcode=%d\n" - % child.returncode) - stop_early = True - break - time.sleep(1) - - if not stop_early: - if child.poll() is not None: - print("WARNING: db_stress ended before kill: exitcode=%d\n" - % child.returncode) - else: - child.kill() - print("KILLED %d\n" % child.pid) - time.sleep(1) # time to stabilize after a kill - - while True: - line = child.stderr.readline().strip().decode('utf-8') - if line == '': - break - elif not line.startswith('WARNING'): + hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params['interval']) + + if not hit_timeout: + print('Exit Before Killing') + print('stdout:') + print(outs) + print('stderr:') + print(errs) + sys.exit(2) + + for line in errs.split('\n'): + if line != '' and not line.startswith('WARNING'): run_had_errors = True print('stderr has error message:') print('***' + line + '***') - if run_had_errors: - sys.exit(2) - time.sleep(1) # time to stabilize before the next run if args.test_best_efforts_recovery: @@ -614,14 +612,14 @@ def whitebox_crash_main(args, unknown_args): print("Running:" + ' '.join(cmd) + "\n") # noqa: E999 T25377293 Grandfathered in - popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdoutdata, stderrdata = popen.communicate() - if stdoutdata: - stdoutdata = stdoutdata.decode('utf-8') - if stderrdata: - stderrdata = stderrdata.decode('utf-8') - retncode = popen.returncode + # If the running time is 15 minutes over the run time, explicit kill and + # exit even if white box kill didn't hit. This is to guarantee run time + # limit, as if it runs as a job, running too long will create problems + # for job scheduling or execution. + # TODO detect a hanging condition. The job might run too long as RocksDB + # hits a hanging bug. + hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd( + cmd, exit_time - time.time() + 900) msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format( check_mode, additional_opts['kill_random_test'], retncode)) @@ -629,6 +627,10 @@ def whitebox_crash_main(args, unknown_args): print(stdoutdata) print(stderrdata) + if hit_timeout: + print("Killing the run for running too long") + break + expected = False if additional_opts['kill_random_test'] is None and (retncode == 0): # we expect zero retncode if no kill option