diff --git a/Makefile b/Makefile index 00fa12fd3..fb14eab92 100644 --- a/Makefile +++ b/Makefile @@ -568,12 +568,12 @@ ldb_tests: ldb crash_test: whitebox_crash_test blackbox_crash_test blackbox_crash_test: db_stress - python -u tools/db_crashtest.py -s - python -u tools/db_crashtest.py + python -u tools/db_crashtest.py --simple blackbox + python -u tools/db_crashtest.py blackbox whitebox_crash_test: db_stress - python -u tools/db_crashtest2.py -s - python -u tools/db_crashtest2.py + python -u tools/db_crashtest.py --simple whitebox + python -u tools/db_crashtest.py whitebox asan_check: $(MAKE) clean diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 6e72ade59..6f9a1e867 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -4,161 +4,183 @@ import re import sys import time import random -import getopt import logging import tempfile import subprocess import shutil +import argparse -# This script runs and kills db_stress multiple times. It checks consistency -# in case of unsafe crashes in RocksDB. 
+# params overwrite priority: +# for default: +# default_params < blackbox|whitebox_default_params < args +# for simple: +# simple_default_params < blackbox|whitebox_simple_default_params < args -def main(argv): - try: - opts, args = getopt.getopt(argv, "hsd:t:i:o:b:") - except getopt.GetoptError: - print("db_crashtest.py -d -t <#threads> " - "-i -o " - "-b [-s (simple mode)]\n") - sys.exit(2) - - # default values, will be overridden by cmdline args - interval = 120 # time for one db_stress instance to run - duration = 6000 # total time for this script to test db_stress - threads = 32 - # since we will be killing anyway, use large value for ops_per_thread - ops_per_thread = 100000000 - write_buf_size = 4 * 1024 * 1024 - simple_mode = False - write_buf_size_set = False - for opt, arg in opts: - if opt == '-h': - print("db_crashtest.py -d " - " -t <#threads> -i " - " -o -b " - " [-s (simple mode)]\n") - sys.exit() - elif opt == '-s': - simple_mode = True - if not write_buf_size_set: - write_buf_size = 32 * 1024 * 1024 - elif opt == "-d": - duration = int(arg) - elif opt == "-t": - threads = int(arg) - elif opt == "-i": - interval = int(arg) - elif opt == "-o": - ops_per_thread = int(arg) - elif opt == "-b": - write_buf_size = int(arg) - write_buf_size_set = True - else: - print("db_crashtest.py -d " - " -t <#threads> -i " - " -o -b \n") - sys.exit(2) - - exit_time = time.time() + duration +default_params = { + "block_size": 16384, + "cache_size": 1048576, + "delpercent": 5, + "destroy_db_initially": 0, + "disable_data_sync": 0, + "disable_wal": 0, + "filter_deletes": lambda: random.randint(0, 1), + "iterpercent": 10, + "max_background_compactions": 20, + "max_bytes_for_level_base": 10485760, + "max_key": 100000000, + "max_write_buffer_number": 3, + "memtablerep": "prefix_hash", + "mmap_read": lambda: random.randint(0, 1), + "open_files": 500000, + "prefix_size": 7, + "prefixpercent": 5, + "progress_reports": 0, + "readpercent": 45, + "reopen": 20, + "sync": 0, + 
"target_file_size_base": 2097152, + "target_file_size_multiplier": 2, + "threads": 32, + "verify_checksum": 1, + "write_buffer_size": 4 * 1024 * 1024, + "writepercent": 35, +} - print("Running blackbox-crash-test with \ninterval_between_crash=" - + str(interval) + "\ntotal-duration=" + str(duration) - + "\nthreads=" + str(threads) + "\nops_per_thread=" - + str(ops_per_thread) + "\nwrite_buffer_size=" - + str(write_buf_size) + "\n") +def get_dbname(test_name): test_tmpdir = os.environ.get("TEST_TMPDIR") if test_tmpdir is None or test_tmpdir == "": - dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_') + dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name) else: - dbname = test_tmpdir + "/rocksdb_crashtest" + dbname = test_tmpdir + "/rocksdb_crashtest_" + test_name shutil.rmtree(dbname, True) + return dbname + +blackbox_default_params = { + 'db': lambda: get_dbname('blackbox'), + # total time for this script to test db_stress + "duration": 6000, + # time for one db_stress instance to run + "interval": 120, + # since we will be killing anyway, use large value for ops_per_thread + "ops_per_thread": 100000000, + "set_options_one_in": 10000, + "test_batches_snapshots": 1, +} + +whitebox_default_params = { + 'db': lambda: get_dbname('whitebox'), + "duration": 10000, + "log2_keys_per_lock": 10, + "nooverwritepercent": 1, + "ops_per_thread": 200000, + "test_batches_snapshots": lambda: random.randint(0, 1), + "write_buffer_size": 4 * 1024 * 1024, +} + +simple_default_params = { + "block_size": 16384, + "cache_size": 1048576, + "column_families": 1, + "delpercent": 5, + "destroy_db_initially": 0, + "disable_data_sync": 0, + "disable_wal": 0, + "filter_deletes": lambda: random.randint(0, 1), + "iterpercent": 10, + "max_background_compactions": 1, + "max_bytes_for_level_base": 67108864, + "max_key": 100000000, + "max_write_buffer_number": 3, + "memtablerep": "skip_list", + "mmap_read": lambda: random.randint(0, 1), + "prefix_size": 0, + "prefixpercent": 0, + 
"progress_reports": 0, + "readpercent": 50, + "reopen": 20, + "sync": 0, + "target_file_size_base": 16777216, + "target_file_size_multiplier": 1, + "test_batches_snapshots": 0, + "threads": 32, + "verify_checksum": 1, + "write_buffer_size": 32 * 1024 * 1024, + "writepercent": 35, +} + +blackbox_simple_default_params = { + 'db': lambda: get_dbname('blackbox'), + "duration": 6000, + "interval": 120, + "open_files": -1, + "ops_per_thread": 100000000, + "set_options_one_in": 0, + "test_batches_snapshots": 0, +} + +whitebox_simple_default_params = { + 'db': lambda: get_dbname('whitebox'), + "duration": 10000, + "log2_keys_per_lock": 10, + "nooverwritepercent": 1, + "open_files": 500000, + "ops_per_thread": 200000, + "write_buffer_size": 32 * 1024 * 1024, +} + + +def gen_cmd_params(args): + params = {} + + if args.simple: + params.update(simple_default_params) + if args.test_type == 'blackbox': + params.update(blackbox_simple_default_params) + if args.test_type == 'whitebox': + params.update(whitebox_simple_default_params) + + if not args.simple: + params.update(default_params) + if args.test_type == 'blackbox': + params.update(blackbox_default_params) + if args.test_type == 'whitebox': + params.update(whitebox_default_params) + + for k, v in vars(args).items(): + if v is not None: + params[k] = v + return params + + +def gen_cmd(params): + cmd = './db_stress ' + ' '.join( + '--{0}={1}'.format(k, v() if callable(v) else v) + for k, v in params.items() + if k not in set(['test_type', 'simple', 'duration', 'interval']) + and v is not None) + return cmd + + +# This script runs and kills db_stress multiple times. It checks consistency +# in case of unsafe crashes in RocksDB. 
+def blackbox_crash_main(args): + cmd_params = gen_cmd_params(args) + + exit_time = time.time() + cmd_params['duration'] + + print("Running blackbox-crash-test with \n" + + "interval_between_crash=" + str(cmd_params['interval']) + "\n" + + "total-duration=" + str(cmd_params['duration']) + "\n" + + "threads=" + str(cmd_params['threads']) + "\n" + + "ops_per_thread=" + str(cmd_params['ops_per_thread']) + "\n" + + "write_buffer_size=" + str(cmd_params['write_buffer_size']) + "\n") while time.time() < exit_time: run_had_errors = False - killtime = time.time() + interval - - if simple_mode: - cmd = re.sub('\s+', ' ', """ - ./db_stress - --column_families=1 - --test_batches_snapshots=0 - --ops_per_thread=%s - --threads=%s - --write_buffer_size=%s - --destroy_db_initially=0 - --reopen=20 - --readpercent=50 - --prefixpercent=0 - --writepercent=35 - --delpercent=5 - --iterpercent=10 - --db=%s - --max_key=100000000 - --mmap_read=%s - --block_size=16384 - --cache_size=1048576 - --open_files=-1 - --verify_checksum=1 - --sync=0 - --progress_reports=0 - --disable_wal=0 - --disable_data_sync=0 - --target_file_size_base=16777216 - --target_file_size_multiplier=1 - --max_write_buffer_number=3 - --max_background_compactions=1 - --max_bytes_for_level_base=67108864 - --filter_deletes=%s - --memtablerep=skip_list - --prefix_size=0 - --set_options_one_in=0 - """ % (ops_per_thread, - threads, - write_buf_size, - dbname, - random.randint(0, 1), - random.randint(0, 1))) - else: - cmd = re.sub('\s+', ' ', """ - ./db_stress - --test_batches_snapshots=1 - --ops_per_thread=%s - --threads=%s - --write_buffer_size=%s - --destroy_db_initially=0 - --reopen=20 - --readpercent=45 - --prefixpercent=5 - --writepercent=35 - --delpercent=5 - --iterpercent=10 - --db=%s - --max_key=100000000 - --mmap_read=%s - --block_size=16384 - --cache_size=1048576 - --open_files=500000 - --verify_checksum=1 - --sync=0 - --progress_reports=0 - --disable_wal=0 - --disable_data_sync=0 - --target_file_size_base=2097152 - 
--target_file_size_multiplier=2 - --max_write_buffer_number=3 - --max_background_compactions=20 - --max_bytes_for_level_base=10485760 - --filter_deletes=%s - --memtablerep=prefix_hash - --prefix_size=7 - --set_options_one_in=10000 - """ % (ops_per_thread, - threads, - write_buf_size, - dbname, - random.randint(0, 1), - random.randint(0, 1))) + killtime = time.time() + cmd_params['interval'] + + cmd = gen_cmd(cmd_params) child = subprocess.Popen([cmd], stderr=subprocess.PIPE, shell=True) @@ -199,5 +221,140 @@ def main(argv): # we need to clean up after ourselves -- only do this on test success shutil.rmtree(dbname, True) -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) + +# This python script runs db_stress multiple times. Some runs with +# kill_random_test that causes rocksdb to crash at various points in code. +def whitebox_crash_main(args): + cmd_params = gen_cmd_params(args) + + cur_time = time.time() + exit_time = cur_time + cmd_params['duration'] + half_time = cur_time + cmd_params['duration'] / 2 + + print("Running whitebox-crash-test with \n" + + "total-duration=" + str(cmd_params['duration']) + "\n" + + "threads=" + str(cmd_params['threads']) + "\n" + + "ops_per_thread=" + str(cmd_params['ops_per_thread']) + "\n" + + "write_buffer_size=" + str(cmd_params['write_buffer_size']) + "\n") + + total_check_mode = 4 + check_mode = 0 + kill_random_test = 97 + kill_mode = 0 + + while time.time() < exit_time: + if check_mode == 0: + additional_opts = { + # use large ops per thread since we will kill it anyway + "ops_per_thread": 100 * cmd_params['ops_per_thread'], + } + # run with kill_random_test + if kill_mode == 0: + additional_opts.update({ + "kill_random_test": kill_random_test, + }) + elif kill_mode == 1: + additional_opts.update({ + "kill_random_test": (kill_random_test / 3 + 1), + "kill_prefix_blacklist": "WritableFileWriter::Append," + + "WritableFileWriter::WriteBuffered", + }) + + # Run kill mode 0 and 1 by turn. 
+ kill_mode = (kill_mode + 1) % 2 + elif check_mode == 1: + # normal run with universal compaction mode + additional_opts = { + "kill_random_test": None, + "ops_per_thread": cmd_params['ops_per_thread'], + "compaction_style": 1, + } + elif check_mode == 2: + # normal run with FIFO compaction mode + # ops_per_thread is divided by 5 because FIFO compaction + # style is quite a bit slower on reads with lot of files + additional_opts = { + "kill_random_test": None, + "ops_per_thread": cmd_params['ops_per_thread'] / 5, + "compaction_style": 2, + } + else: + # normal run + additional_opts = { + "kill_random_test": None, + "ops_per_thread": cmd_params['ops_per_thread'], + } + + cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items())) + + print "Running:" + cmd + "\n" + + popen = subprocess.Popen([cmd], stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True) + stdoutdata, stderrdata = popen.communicate() + retncode = popen.returncode + msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format( + check_mode, additional_opts['kill_random_test'], retncode)) + print msg + print stdoutdata + + expected = False + if additional_opts['kill_random_test'] is None and (retncode == 0): + # we expect zero retncode if no kill option + expected = True + elif additional_opts['kill_random_test'] is not None and retncode < 0: + # we expect negative retncode if kill option was given + expected = True + + if not expected: + print "TEST FAILED. See kill option and exit code above!!!\n" + sys.exit(1) + + stdoutdata = stdoutdata.lower() + errorcount = (stdoutdata.count('error') - + stdoutdata.count('got errors 0 times')) + print "#times error occurred in output is " + str(errorcount) + "\n" + + if (errorcount > 0): + print "TEST FAILED. Output has 'error'!!!\n" + sys.exit(2) + if (stdoutdata.find('fail') >= 0): + print "TEST FAILED. Output has 'fail'!!!\n" + sys.exit(2) + + # First half of the duration, keep doing kill test. 
For the next half, + # try different modes. + if time.time() > half_time: + # we need to clean up after ourselves -- only do this on test + # success + shutil.rmtree(dbname, True) + check_mode = (check_mode + 1) % total_check_mode + + time.sleep(1) # time to stabilize after a kill + + +def main(): + parser = argparse.ArgumentParser(description="This script runs and kills \ + db_stress multiple times") + parser.add_argument("test_type", choices=["blackbox", "whitebox"]) + parser.add_argument("--simple", action="store_true") + + all_params = dict(default_params.items() + + blackbox_default_params.items() + + whitebox_default_params.items() + + simple_default_params.items() + + blackbox_simple_default_params.items() + + whitebox_simple_default_params.items()) + + for k, v in all_params.items(): + parser.add_argument("--" + k, type=type(v() if callable(v) else v)) + args = parser.parse_args() + + if args.test_type == 'blackbox': + blackbox_crash_main(args) + if args.test_type == 'whitebox': + whitebox_crash_main(args) + +if __name__ == '__main__': + main() diff --git a/tools/db_crashtest2.py b/tools/db_crashtest2.py deleted file mode 100644 index 93f1478bc..000000000 --- a/tools/db_crashtest2.py +++ /dev/null @@ -1,248 +0,0 @@ -#! /usr/bin/env python -import os -import re -import sys -import time -import random -import getopt -import logging -import tempfile -import subprocess -import shutil - -# This python script runs db_stress multiple times. Some runs with -# kill_random_test that causes rocksdb to crash at various points in code. 
- -def main(argv): - try: - opts, args = getopt.getopt(argv, "hsd:t:k:o:b:") - except getopt.GetoptError: - print str(getopt.GetoptError) - print "db_crashtest2.py -d -t <#threads> " \ - "-k -o "\ - "-b [-s (simple mode)]\n" - sys.exit(2) - - # default values, will be overridden by cmdline args - kill_random_test = 97 # kill with probability 1/97 by default - duration = 10000 # total time for this script to test db_stress - threads = 32 - ops_per_thread = 200000 - write_buf_size = 4 * 1024 * 1024 - simple_mode = False - write_buf_size_set = False - - for opt, arg in opts: - if opt == '-h': - print "db_crashtest2.py -d -t <#threads> " \ - "-k -o " \ - "-b [-s (simple mode)]\n" - sys.exit() - elif opt == '-s': - simple_mode = True - if not write_buf_size_set: - write_buf_size = 32 * 1024 * 1024 - elif opt == "-d": - duration = int(arg) - elif opt == "-t": - threads = int(arg) - elif opt == "-k": - kill_random_test = int(arg) - elif opt == "-o": - ops_per_thread = int(arg) - elif opt == "-b": - write_buf_size = int(arg) - write_buf_size_set = True - else: - print "unrecognized option " + str(opt) + "\n" - print "db_crashtest2.py -d -t <#threads> " \ - "-k -o " \ - "-b \n" - sys.exit(2) - - cur_time = time.time() - exit_time = cur_time + duration - half_time = cur_time + duration / 2 - - print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \ - + "\nthreads=" + str(threads) + "\nops_per_thread=" \ - + str(ops_per_thread) + "\nwrite_buffer_size=" \ - + str(write_buf_size) + "\n" - - total_check_mode = 4 - check_mode = 0 - kill_mode = 0 - - test_tmpdir = os.environ.get("TEST_TMPDIR") - if test_tmpdir is None or test_tmpdir == "": - dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest2_') - else: - dbname = test_tmpdir + "/rocksdb_crashtest2" - shutil.rmtree(dbname, True) - - while time.time() < exit_time: - killoption = "" - if check_mode == 0: - # run with kill_random_test - if kill_mode == 0: - killoption = " --kill_random_test=" + 
str(kill_random_test) - elif kill_mode == 1: - # Remove kill point for normal reads and reduce kill odds - # by 3, so that it still runs about one minutes in average - # before hitting a crash point. - killoption = " --kill_random_test=" + \ - str(kill_random_test / 3 + 1) - killoption += \ - " --kill_prefix_blacklist=WritableFileWriter::Append," \ - "WritableFileWriter::WriteBuffered" - # Run kill mode 0 and 1 by turn. - kill_mode = (kill_mode + 1) % 2 - # use large ops per thread since we will kill it anyway - additional_opts = "--ops_per_thread=" + \ - str(100 * ops_per_thread) + killoption - elif check_mode == 1: - # normal run with universal compaction mode - additional_opts = "--ops_per_thread=" + str(ops_per_thread) + \ - " --compaction_style=1" - elif check_mode == 2: - # normal run with FIFO compaction mode - # ops_per_thread is divided by 5 because FIFO compaction - # style is quite a bit slower on reads with lot of files - additional_opts = "--ops_per_thread=" + str(ops_per_thread / 5) + \ - " --compaction_style=2" - else: - # normal run - additional_opts = "--ops_per_thread=" + str(ops_per_thread) - - if simple_mode: - cmd = re.sub('\s+', ' ', """ - ./db_stress - --column_families=1 - --threads=%s - --write_buffer_size=%s - --destroy_db_initially=0 - --reopen=20 - --prefixpercent=0 - --readpercent=50 - --writepercent=35 - --delpercent=5 - --iterpercent=10 - --db=%s - --max_key=100000000 - --mmap_read=%s - --block_size=16384 - --cache_size=1048576 - --open_files=500000 - --verify_checksum=1 - --sync=0 - --progress_reports=0 - --disable_wal=0 - --disable_data_sync=0 - --target_file_size_base=16777216 - --target_file_size_multiplier=1 - --max_write_buffer_number=3 - --max_background_compactions=1 - --max_bytes_for_level_base=67108864 - --filter_deletes=%s - --memtablerep=skip_list - --prefix_size=0 - --nooverwritepercent=1 - --log2_keys_per_lock=10 - %s - """ % (threads, - write_buf_size, - dbname, - random.randint(0, 1), - random.randint(0, 1), - 
additional_opts)) - else: - cmd = re.sub('\s+', ' ', """ - ./db_stress - --test_batches_snapshots=%s - --threads=%s - --write_buffer_size=%s - --destroy_db_initially=0 - --reopen=20 - --readpercent=45 - --prefixpercent=5 - --writepercent=35 - --delpercent=5 - --iterpercent=10 - --db=%s - --max_key=100000000 - --mmap_read=%s - --block_size=16384 - --cache_size=1048576 - --open_files=500000 - --verify_checksum=1 - --sync=0 - --progress_reports=0 - --disable_wal=0 - --disable_data_sync=0 - --target_file_size_base=2097152 - --target_file_size_multiplier=2 - --max_write_buffer_number=3 - --max_background_compactions=20 - --max_bytes_for_level_base=10485760 - --filter_deletes=%s - --memtablerep=prefix_hash - --prefix_size=7 - --nooverwritepercent=1 - --log2_keys_per_lock=10 - %s - """ % (random.randint(0, 1), - threads, - write_buf_size, - dbname, - random.randint(0, 1), - random.randint(0, 1), - additional_opts)) - - print "Running:" + cmd + "\n" - - popen = subprocess.Popen([cmd], stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True) - stdoutdata, stderrdata = popen.communicate() - retncode = popen.returncode - msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format( - check_mode, killoption, retncode)) - print msg - print stdoutdata - - expected = False - if (killoption == '') and (retncode == 0): - # we expect zero retncode if no kill option - expected = True - elif killoption != '' and retncode < 0: - # we expect negative retncode if kill option was given - expected = True - - if not expected: - print "TEST FAILED. See kill option and exit code above!!!\n" - sys.exit(1) - - stdoutdata = stdoutdata.lower() - errorcount = (stdoutdata.count('error') - - stdoutdata.count('got errors 0 times')) - print "#times error occurred in output is " + str(errorcount) + "\n" - - if (errorcount > 0): - print "TEST FAILED. Output has 'error'!!!\n" - sys.exit(2) - if (stdoutdata.find('fail') >= 0): - print "TEST FAILED. 
Output has 'fail'!!!\n" - sys.exit(2) - - # First half of the duration, keep doing kill test. For the next half, - # try different modes. - if time.time() > half_time: - # we need to clean up after ourselves -- only do this on test - # success - shutil.rmtree(dbname, True) - check_mode = (check_mode + 1) % total_check_mode - - time.sleep(1) # time to stabilize after a kill - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:]))