Optimizer's skeleton: use advisor to optimize config options (#4169)

Summary:
In https://github.com/facebook/rocksdb/pull/3934 we introduced advisor scripts that suggest changes to the configuration options based on the LOG file and statistics from a run of RocksDB. The optimizer runs the advisor on a benchmark application in a loop and automatically applies the suggested changes until the configuration options are optimized. This is a work in progress, and this patch is the initial skeleton for the optimizer. The sample application currently run in the loop is db_bench.
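For reference, the optimizer is driven the same way as tools/advisor/advisor/config_optimizer_example.py below; a minimal sketch of that flow follows (the paths, db_bench location and stats period are illustrative placeholders, not part of this patch):

from advisor.db_bench_runner import DBBenchRunner
from advisor.db_config_optimizer import ConfigOptimizer
from advisor.db_log_parser import NO_COL_FAMILY
from advisor.db_options_parser import DatabaseOptions
from advisor.rule_parser import RulesSpec

bench_runner = DBBenchRunner(['./db_bench', 'readwhilewriting', 'duration=90'])
db_options = DatabaseOptions('temp/OPTIONS_boot.tmp', ['bloom_bits=2'])
# make sure statistics are dumped to the LOG file at a known frequency
db_options.update_options(
    {'DBOptions.stats_dump_period_sec': {NO_COL_FAMILY: 20}}
)
rules = RulesSpec('advisor/rules.ini')
optimizer = ConfigOptimizer(
    bench_runner, db_options, rules, '/tmp/rocksdbtest/dbbench'
)
final_options = optimizer.run()
print('Final configuration in: ' + final_options.generate_options_config('final'))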
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4169

Reviewed By: maysamyabandeh

Differential Revision: D9023671

Pulled By: poojam23

fbshipit-source-id: a6192d475c462cf6eb2b316716f97cb400fcb64d
Branch: main
Author: Pooja Malik (committed by Facebook Github Bot)
Parent: bdc6abd0b4
Commit: 134a52e144
Files changed (number of lines changed):
  1. tools/advisor/advisor/bench_runner.py (39)
  2. tools/advisor/advisor/config_optimizer_example.py (134)
  3. tools/advisor/advisor/db_bench_runner.py (312)
  4. tools/advisor/advisor/db_config_optimizer.py (282)
  5. tools/advisor/advisor/db_log_parser.py (91)
  6. tools/advisor/advisor/db_options_parser.py (412)
  7. tools/advisor/advisor/db_stats_fetcher.py (421)
  8. tools/advisor/advisor/db_timeseries_parser.py (208)
  9. tools/advisor/advisor/ini_parser.py (4)
  10. tools/advisor/advisor/rule_parser.py (371)
  11. tools/advisor/advisor/rules.ini (145)
  12. tools/advisor/test/input_files/LOG-0 (5)
  13. tools/advisor/test/input_files/rules_err1.ini (2)
  14. tools/advisor/test/input_files/rules_err2.ini (1)
  15. tools/advisor/test/input_files/rules_err3.ini (1)
  16. tools/advisor/test/input_files/rules_err4.ini (1)
  17. tools/advisor/test/input_files/test_rules.ini (4)
  18. tools/advisor/test/input_files/triggered_rules.ini (83)
  19. tools/advisor/test/test_db_log_parser.py (98)
  20. tools/advisor/test/test_rule_parser.py (61)

@@ -0,0 +1,39 @@
# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
# This source code is licensed under both the GPLv2 (found in the
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).
from abc import ABC, abstractmethod
import re


class BenchmarkRunner(ABC):
    @staticmethod
    @abstractmethod
    def is_metric_better(new_metric, old_metric):
        pass

    @abstractmethod
    def run_experiment(self):
        # should return a list of DataSource objects
        pass

    @staticmethod
    def get_info_log_file_name(log_dir, db_path):
        # Example: DB Path = /dev/shm and OPTIONS file has option
        # db_log_dir=/tmp/rocks/, then the name of the log file will be
        # 'dev_shm_LOG' and its location will be /tmp/rocks. If db_log_dir is
        # not specified in the OPTIONS file, then the location of the log file
        # will be /dev/shm and the name of the file will be 'LOG'
        file_name = ''
        if log_dir:
            # refer GetInfoLogPrefix() in rocksdb/util/filename.cc
            # example db_path: /dev/shm/dbbench
            file_name = db_path[1:]  # to ignore the leading '/' character
            to_be_replaced = re.compile('[^0-9a-zA-Z\-_\.]')
            for character in to_be_replaced.findall(db_path):
                file_name = file_name.replace(character, '_')
            if not file_name.endswith('_'):
                file_name += '_'
        file_name += 'LOG'
        return file_name
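For illustration (not part of the patch), the naming rule described above behaves as follows, assuming the advisor package is importable:

from advisor.bench_runner import BenchmarkRunner

# db_log_dir is set, so db_path is encoded into the file name:
# expected result: 'dev_shm_LOG', located under /tmp/rocks/
print(BenchmarkRunner.get_info_log_file_name('/tmp/rocks/', '/dev/shm'))
# db_log_dir is not set, so the file is simply named 'LOG' and lives in db_path:
print(BenchmarkRunner.get_info_log_file_name(None, '/dev/shm'))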

@@ -0,0 +1,134 @@
# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
# This source code is licensed under both the GPLv2 (found in the
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).
import argparse
from advisor.db_config_optimizer import ConfigOptimizer
from advisor.db_log_parser import NO_COL_FAMILY
from advisor.db_options_parser import DatabaseOptions
from advisor.rule_parser import RulesSpec
CONFIG_OPT_NUM_ITER = 10
def main(args):
# initialise the RulesSpec parser
rule_spec_parser = RulesSpec(args.rules_spec)
# initialise the benchmark runner
bench_runner_module = __import__(
args.benchrunner_module, fromlist=[args.benchrunner_class]
)
bench_runner_class = getattr(bench_runner_module, args.benchrunner_class)
ods_args = {}
if args.ods_client and args.ods_entity:
ods_args['client_script'] = args.ods_client
ods_args['entity'] = args.ods_entity
if args.ods_key_prefix:
ods_args['key_prefix'] = args.ods_key_prefix
db_bench_runner = bench_runner_class(args.benchrunner_pos_args, ods_args)
# initialise the database configuration
db_options = DatabaseOptions(args.rocksdb_options, args.misc_options)
# set the frequency at which stats are dumped in the LOG file and the
# location of the LOG file.
db_log_dump_settings = {
"DBOptions.stats_dump_period_sec": {
NO_COL_FAMILY: args.stats_dump_period_sec
}
}
db_options.update_options(db_log_dump_settings)
# initialise the configuration optimizer
config_optimizer = ConfigOptimizer(
db_bench_runner,
db_options,
rule_spec_parser,
args.base_db_path
)
# run the optimiser to improve the database configuration for given
# benchmarks, with the help of expert-specified rules
final_db_options = config_optimizer.run()
# generate the final rocksdb options file
print(
'Final configuration in: ' +
final_db_options.generate_options_config('final')
)
print(
'Final miscellaneous options: ' +
repr(final_db_options.get_misc_options())
)
if __name__ == '__main__':
'''
An example run of this tool from the command-line would look like:
python3 -m advisor.config_optimizer_example
--base_db_path=/tmp/rocksdbtest-155919/dbbench
--rocksdb_options=temp/OPTIONS_boot.tmp --misc_options bloom_bits=2
--rules_spec=advisor/rules.ini --stats_dump_period_sec=20
--benchrunner_module=advisor.db_bench_runner
--benchrunner_class=DBBenchRunner --benchrunner_pos_args ./../../db_bench
readwhilewriting use_existing_db=true duration=90
'''
parser = argparse.ArgumentParser(description='This script is used for\
searching for a better database configuration')
parser.add_argument(
'--rocksdb_options', required=True, type=str,
help='path of the starting Rocksdb OPTIONS file'
)
# these are options that are column-family agnostic and are not yet
# supported by the Rocksdb Options file: eg. bloom_bits=2
parser.add_argument(
'--base_db_path', required=True, type=str,
help='path for the Rocksdb database'
)
parser.add_argument(
'--misc_options', nargs='*',
help='whitespace-separated list of options that are not supported ' +
'by the Rocksdb OPTIONS file, given in the ' +
'<option_name>=<option_value> format eg. "bloom_bits=2 ' +
'rate_limiter_bytes_per_sec=128000000"')
parser.add_argument(
'--rules_spec', required=True, type=str,
help='path of the file containing the expert-specified Rules'
)
parser.add_argument(
'--stats_dump_period_sec', required=True, type=int,
help='the frequency (in seconds) at which STATISTICS are printed to ' +
'the Rocksdb LOG file'
)
# ODS arguments
parser.add_argument(
'--ods_client', type=str, help='the ODS client binary'
)
parser.add_argument(
'--ods_entity', type=str,
help='the servers for which the ODS stats need to be fetched'
)
parser.add_argument(
'--ods_key_prefix', type=str,
help='the prefix that needs to be attached to the keys of time ' +
'series to be fetched from ODS'
)
# benchrunner_module example: advisor.db_benchmark_client
parser.add_argument(
'--benchrunner_module', required=True, type=str,
help='the module containing the BenchmarkRunner class to be used by ' +
'the Optimizer, example: advisor.db_bench_runner'
)
# benchrunner_class example: DBBenchRunner
parser.add_argument(
'--benchrunner_class', required=True, type=str,
help='the name of the BenchmarkRunner class to be used by the ' +
'Optimizer, should be present in the module provided in the ' +
'benchrunner_module argument, example: DBBenchRunner'
)
parser.add_argument(
'--benchrunner_pos_args', nargs='*',
help='whitespace-separated positional arguments that are passed on ' +
'to the constructor of the BenchmarkRunner class provided in the ' +
'benchrunner_class argument, example: "use_existing_db=true ' +
'duration=900"'
)
args = parser.parse_args()
main(args)

@@ -0,0 +1,312 @@
from advisor.bench_runner import BenchmarkRunner
from advisor.db_log_parser import DataSource, DatabaseLogs, NO_COL_FAMILY
from advisor.db_options_parser import DatabaseOptions
from advisor.db_stats_fetcher import (
LogStatsParser, OdsStatsFetcher, DatabasePerfContext
)
import os
import re
import shutil
import subprocess
import time
'''
NOTE: This is not thread-safe, because the output file is simply overwritten.
'''
class DBBenchRunner(BenchmarkRunner):
OUTPUT_FILE = "temp/dbbench_out.tmp"
ERROR_FILE = "temp/dbbench_err.tmp"
DB_PATH = "DB path"
THROUGHPUT = "ops/sec"
PERF_CON = " PERF_CONTEXT:"
@staticmethod
def is_metric_better(new_metric, old_metric):
# for db_bench 'throughput' is the metric returned by run_experiment
return new_metric >= old_metric
@staticmethod
def get_opt_args_str(misc_options_dict):
optional_args_str = ""
for option_name, option_value in misc_options_dict.items():
if option_value:
optional_args_str += (
" --" + option_name + "=" + str(option_value)
)
return optional_args_str
def __init__(self, positional_args, ods_args=None):
# parse positional_args list appropriately
self.db_bench_binary = positional_args[0]
self.benchmark = positional_args[1]
self.db_bench_args = None
# TODO(poojam23): move to unittest with method get_available_workloads
self.supported_benchmarks = None
if len(positional_args) > 2:
# options list with each option given as "<option>=<value>"
self.db_bench_args = positional_args[2:]
# save ods_args if provided
self.ods_args = ods_args
def _parse_output(self, get_perf_context=False):
'''
Sample db_bench output after running 'readwhilewriting' benchmark:
DB path: [/tmp/rocksdbtest-155919/dbbench]\n
readwhilewriting : 16.582 micros/op 60305 ops/sec; 4.2 MB/s (3433828\
of 5427999 found)\n
PERF_CONTEXT:\n
user_key_comparison_count = 500466712, block_cache_hit_count = ...\n
'''
output = {
self.THROUGHPUT: None, self.DB_PATH: None, self.PERF_CON: None
}
perf_context_begins = False
with open(self.OUTPUT_FILE, 'r') as fp:
for line in fp:
if line.startswith(self.benchmark):
print(line) # print output of db_bench run
token_list = line.strip().split()
for ix, token in enumerate(token_list):
if token.startswith(self.THROUGHPUT):
output[self.THROUGHPUT] = (
float(token_list[ix - 1])
)
break
elif line.startswith(self.PERF_CON):
perf_context_begins = True
elif get_perf_context and perf_context_begins:
# Sample perf_context output:
# user_key_comparison_count = 500, block_cache_hit_count =\
# 468, block_read_count = 580, block_read_byte = 445, ...
token_list = line.strip().split(',')
perf_context = {
tk.split('=')[0].strip(): tk.split('=')[1].strip()
for tk in token_list
if tk
}
# TODO(poojam23): this is a hack and should be replaced
# with the timestamp that db_bench will provide per printed
# perf_context
timestamp = int(time.time())
perf_context_ts = {}
for stat in perf_context.keys():
perf_context_ts[stat] = {
timestamp: int(perf_context[stat])
}
output[self.PERF_CON] = perf_context_ts
perf_context_begins = False
elif line.startswith(self.DB_PATH):
output[self.DB_PATH] = (
line.split('[')[1].split(']')[0]
)
return output
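# For the sample output quoted in the docstring above, this method (with
# get_perf_context=True) would return roughly the following (illustrative;
# the timestamp is whatever int(time.time()) was at parse time):
# {
#     'DB path': '/tmp/rocksdbtest-155919/dbbench',
#     'ops/sec': 60305.0,
#     ' PERF_CONTEXT:': {
#         'user_key_comparison_count': {<timestamp>: 500466712},
#         'block_cache_hit_count': {<timestamp>: ...},
#         ...
#     }
# }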
def get_log_options(self, db_options, db_path):
# get the location of the LOG file and the frequency at which stats are
# dumped in the LOG file
log_dir_path = None
stats_freq_sec = None
logs_file_prefix = None
# fetch the options
dump_period = 'DBOptions.stats_dump_period_sec'
log_dir = 'DBOptions.db_log_dir'
log_options = db_options.get_options([dump_period, log_dir])
if dump_period in log_options:
stats_freq_sec = int(log_options[dump_period][NO_COL_FAMILY])
if log_dir in log_options:
log_dir_path = log_options[log_dir][NO_COL_FAMILY]
log_file_name = DBBenchRunner.get_info_log_file_name(
log_dir_path, db_path
)
if not log_dir_path:
log_dir_path = db_path
if not log_dir_path.endswith('/'):
log_dir_path += '/'
logs_file_prefix = log_dir_path + log_file_name
return (logs_file_prefix, stats_freq_sec)
def _get_options_command_line_args_str(self, curr_options):
'''
This method uses the provided Rocksdb OPTIONS to create a string of
command-line arguments for db_bench.
The --options_file argument is always given and the options that are
not supported by the OPTIONS file are given as separate arguments.
'''
optional_args_str = DBBenchRunner.get_opt_args_str(
curr_options.get_misc_options()
)
# generate an options configuration file
options_file = curr_options.generate_options_config(nonce='12345')
optional_args_str += " --options_file=" + options_file
return optional_args_str
def _setup_db_before_experiment(self, curr_options, db_path):
# remove destination directory if it already exists
try:
shutil.rmtree(db_path, ignore_errors=True)
except OSError as e:
print('Error: rmdir ' + e.filename + ' ' + e.strerror)
command = "%s --benchmarks=fillrandom --db=%s --num=1000000" % (
self.db_bench_binary, db_path
)
args_str = self._get_options_command_line_args_str(curr_options)
command += args_str
self._run_command(command)
def _build_experiment_command(self, curr_options, db_path):
command = "%s --benchmarks=%s --statistics --perf_level=3 --db=%s" % (
self.db_bench_binary, self.benchmark, db_path
)
args_str = self._get_options_command_line_args_str(curr_options)
# handle the command-line args passed in the constructor
for cmd_line_arg in self.db_bench_args:
args_str += (" --" + cmd_line_arg)
command += args_str
return command
def _run_command(self, command):
# run db_bench in a subprocess, redirecting its stdout and stderr to the
# output and error files
out_file = open(self.OUTPUT_FILE, "w+")
err_file = open(self.ERROR_FILE, "w+")
print('executing... - ' + command)
subprocess.call(command, shell=True, stdout=out_file, stderr=err_file)
out_file.close()
err_file.close()
def run_experiment(self, db_options, db_path):
# type: (DatabaseOptions, str) -> Tuple[Dict, float]
self._setup_db_before_experiment(db_options, db_path)
command = self._build_experiment_command(db_options, db_path)
self._run_command(command)
parsed_output = self._parse_output(get_perf_context=True)
# Create the LOGS object
# get the log options from the OPTIONS file
logs_file_prefix, stats_freq_sec = self.get_log_options(
db_options, parsed_output[self.DB_PATH]
)
db_logs = DatabaseLogs(
logs_file_prefix, db_options.get_column_families()
)
# Create the Log STATS object
db_log_stats = LogStatsParser(logs_file_prefix, stats_freq_sec)
# Create the PerfContext STATS object
db_perf_context = DatabasePerfContext(
parsed_output[self.PERF_CON], 0, False
)
data_sources = {
DataSource.Type.DB_OPTIONS: [db_options],
DataSource.Type.LOG: [db_logs],
DataSource.Type.TIME_SERIES: [db_log_stats, db_perf_context]
}
# Create the ODS STATS object
if self.ods_args:
data_sources[DataSource.Type.TIME_SERIES].append(OdsStatsFetcher(
self.ods_args['client_script'],
self.ods_args['entity'],
self.ods_args['key_prefix']
))
return data_sources, parsed_output[self.THROUGHPUT]
# TODO: this method is for testing, shift it out to unit-tests when ready
def get_available_workloads(self):
if not self.supported_benchmarks:
self.supported_benchmarks = []
command = '%s --help' % self.db_bench_binary
self._run_command(command)
with open(self.OUTPUT_FILE, 'r') as fp:
start = False
for line in fp:
if re.search('available benchmarks', line, re.IGNORECASE):
start = True
continue
elif start:
if re.search('meta operations', line, re.IGNORECASE):
break
benchmark_info = line.strip()
if benchmark_info:
token_list = benchmark_info.split()
if len(token_list) > 2 and token_list[1] == '--':
self.supported_benchmarks.append(token_list[0])
else:
continue
self.supported_benchmarks = sorted(self.supported_benchmarks)
return self.supported_benchmarks
# TODO: remove this method, used only for testing
def main():
pos_args = [
'/home/poojamalik/workspace/rocksdb/db_bench',
'readwhilewriting',
'use_existing_db=true',
'duration=10'
]
db_bench_helper = DBBenchRunner(pos_args)
# populate benchmarks with the available ones in the db_bench tool
benchmarks = db_bench_helper.get_available_workloads()
print(benchmarks)
print()
options_file = (
'/home/poojamalik/workspace/rocksdb/tools/advisor/temp/' +
'OPTIONS_temp.tmp'
)
misc_options = ["rate_limiter_bytes_per_sec=1024000", "bloom_bits=2"]
db_options = DatabaseOptions(options_file, misc_options)
data_sources, _ = db_bench_helper.run_experiment(db_options)
print(data_sources[DataSource.Type.DB_OPTIONS][0].options_dict)
print()
print(data_sources[DataSource.Type.LOG][0].logs_path_prefix)
if os.path.isfile(data_sources[DataSource.Type.LOG][0].logs_path_prefix):
print('log file exists!')
else:
print('error: log file does not exist!')
print(data_sources[DataSource.Type.LOG][0].column_families)
print()
print(data_sources[DataSource.Type.TIME_SERIES][0].logs_file_prefix)
if (
os.path.isfile(
data_sources[DataSource.Type.TIME_SERIES][0].logs_file_prefix
)
):
print('log file exists!')
else:
print('error: log file does not exist!')
print(data_sources[DataSource.Type.TIME_SERIES][0].stats_freq_sec)
print(data_sources[DataSource.Type.TIME_SERIES][1].keys_ts)
db_options = DatabaseOptions(options_file, None)
data_sources, _ = db_bench_helper.run_experiment(db_options)
print(data_sources[DataSource.Type.DB_OPTIONS][0].options_dict)
print()
print(data_sources[DataSource.Type.LOG][0].logs_path_prefix)
if os.path.isfile(data_sources[DataSource.Type.LOG][0].logs_path_prefix):
print('log file exists!')
else:
print('error: log file does not exist!')
print(data_sources[DataSource.Type.LOG][0].column_families)
print()
print(data_sources[DataSource.Type.TIME_SERIES][0].logs_file_prefix)
if (
os.path.isfile(
data_sources[DataSource.Type.TIME_SERIES][0].logs_file_prefix
)
):
print('log file exists!')
else:
print('error: log file does not exist!')
print(data_sources[DataSource.Type.TIME_SERIES][0].stats_freq_sec)
print(data_sources[DataSource.Type.TIME_SERIES][1].keys_ts)
print(data_sources[DataSource.Type.TIME_SERIES][1].stats_freq_sec)
if __name__ == "__main__":
main()

@@ -0,0 +1,282 @@
# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
# This source code is licensed under both the GPLv2 (found in the
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).
from advisor.db_log_parser import NO_COL_FAMILY
from advisor.db_options_parser import DatabaseOptions
from advisor.rule_parser import Suggestion
import copy
import random
class ConfigOptimizer:
SCOPE = 'scope'
SUGG_VAL = 'suggested values'
@staticmethod
def apply_action_on_value(old_value, action, suggested_values):
chosen_sugg_val = None
if suggested_values:
chosen_sugg_val = random.choice(list(suggested_values))
new_value = None
if action is Suggestion.Action.set or not old_value:
assert(chosen_sugg_val)
new_value = chosen_sugg_val
else:
# For increase/decrease actions, currently the code tries to make
# a 30% change in the option's value per iteration. An addend is
# also present (+2 or -2) to handle the cases when the option's
# old value was 0 or the final int() conversion suppressed the 30%
# change made to the option
old_value = float(old_value)
mul = 0
add = 0
if action is Suggestion.Action.increase:
if old_value < 0:
mul = 0.7
add = 2
else:
mul = 1.3
add = 2
elif action is Suggestion.Action.decrease:
if old_value < 0:
mul = 1.3
add = -2
else:
mul = 0.7
add = -2
new_value = int(old_value * mul + add)
return new_value
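# Illustrative examples of the rule above (hypothetical option values):
#   increase, old value 64  -> int(64 * 1.3 + 2) = 85
#   increase, old value 0   -> int(0 * 1.3 + 2)  = 2   (the addend avoids a no-op)
#   decrease, old value 64  -> int(64 * 0.7 - 2) = 42
#   set, or no old value    -> a random pick from the suggested values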
@staticmethod
def improve_db_config(options, rule, suggestions_dict):
# this method takes ONE 'rule' and applies all its suggestions on the
# appropriate options
required_options = []
rule_suggestions = []
for sugg_name in rule.get_suggestions():
option = suggestions_dict[sugg_name].option
action = suggestions_dict[sugg_name].action
# A Suggestion in the rules spec must have the 'option' and
# 'action' fields defined, always call perform_checks() method
# after parsing the rules file using RulesSpec
assert(option)
assert(action)
required_options.append(option)
rule_suggestions.append(suggestions_dict[sugg_name])
current_config = options.get_options(required_options)
# Create the updated configuration from the rule's suggestions
updated_config = {}
for sugg in rule_suggestions:
# case: when the option is not present in the current configuration
if sugg.option not in current_config:
try:
new_value = ConfigOptimizer.apply_action_on_value(
None, sugg.action, sugg.suggested_values
)
if sugg.option not in updated_config:
updated_config[sugg.option] = {}
if DatabaseOptions.is_misc_option(sugg.option):
# this suggestion is on an option that is not yet
# supported by the Rocksdb OPTIONS file and so it is
# not prefixed by a section type.
updated_config[sugg.option][NO_COL_FAMILY] = new_value
else:
for col_fam in rule.get_trigger_column_families():
updated_config[sugg.option][col_fam] = new_value
except AssertionError:
print(
'WARNING(ConfigOptimizer): provide suggested_values ' +
'for ' + sugg.option
)
continue
# case: when the option is present in the current configuration
if NO_COL_FAMILY in current_config[sugg.option]:
old_value = current_config[sugg.option][NO_COL_FAMILY]
try:
new_value = ConfigOptimizer.apply_action_on_value(
old_value, sugg.action, sugg.suggested_values
)
if sugg.option not in updated_config:
updated_config[sugg.option] = {}
updated_config[sugg.option][NO_COL_FAMILY] = new_value
except AssertionError:
print(
'WARNING(ConfigOptimizer): provide suggested_values ' +
'for ' + sugg.option
)
else:
for col_fam in rule.get_trigger_column_families():
old_value = None
if col_fam in current_config[sugg.option]:
old_value = current_config[sugg.option][col_fam]
try:
new_value = ConfigOptimizer.apply_action_on_value(
old_value, sugg.action, sugg.suggested_values
)
if sugg.option not in updated_config:
updated_config[sugg.option] = {}
updated_config[sugg.option][col_fam] = new_value
except AssertionError:
print(
'WARNING(ConfigOptimizer): provide ' +
'suggested_values for ' + sugg.option
)
return current_config, updated_config
@staticmethod
def pick_rule_to_apply(rules, last_rule_name, rules_tried, backtrack):
if not rules:
print('\nNo more rules triggered!')
return None
# if the last rule provided an improvement in the database performance,
# and it was triggered again (i.e. it is present in 'rules'), then pick
# the same rule for this iteration too.
if last_rule_name and not backtrack:
for rule in rules:
if rule.name == last_rule_name:
return rule
# there was no previous rule OR the previous rule did not improve db
# performance OR it was not triggered for this iteration,
# then pick another rule that has not been tried yet
for rule in rules:
if rule.name not in rules_tried:
return rule
print('\nAll rules have been exhausted')
return None
@staticmethod
def apply_suggestions(
triggered_rules,
current_rule_name,
rules_tried,
backtrack,
curr_options,
suggestions_dict
):
curr_rule = ConfigOptimizer.pick_rule_to_apply(
triggered_rules, current_rule_name, rules_tried, backtrack
)
if not curr_rule:
return tuple([None]*4)
# if a rule has been picked for improving db_config, update rules_tried
rules_tried.add(curr_rule.name)
# get updated config based on the picked rule
curr_conf, updated_conf = ConfigOptimizer.improve_db_config(
curr_options, curr_rule, suggestions_dict
)
conf_diff = DatabaseOptions.get_options_diff(curr_conf, updated_conf)
if not conf_diff: # the current and updated configs are the same
curr_rule, rules_tried, curr_conf, updated_conf = (
ConfigOptimizer.apply_suggestions(
triggered_rules,
None,
rules_tried,
backtrack,
curr_options,
suggestions_dict
)
)
print('returning from apply_suggestions')
return (curr_rule, rules_tried, curr_conf, updated_conf)
# TODO(poojam23): check if this method is required or can we directly set
# the config equal to the curr_config
@staticmethod
def get_backtrack_config(curr_config, updated_config):
diff = DatabaseOptions.get_options_diff(curr_config, updated_config)
bt_config = {}
for option in diff:
bt_config[option] = {}
for col_fam in diff[option]:
bt_config[option][col_fam] = diff[option][col_fam][0]
print(bt_config)
return bt_config
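# Illustrative example (hypothetical values): if
#   curr_config    = {'CFOptions.write_buffer_size': {'default': 1048576}}
#   updated_config = {'CFOptions.write_buffer_size': {'default': 2097152}}
# then get_options_diff() returns
#   {'CFOptions.write_buffer_size': {'default': (1048576, 2097152)}}
# and the backtrack config restores the old value:
#   {'CFOptions.write_buffer_size': {'default': 1048576}}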
def __init__(self, bench_runner, db_options, rule_parser, base_db):
self.bench_runner = bench_runner
self.db_options = db_options
self.rule_parser = rule_parser
self.base_db_path = base_db
def run(self):
# In every iteration of this method's optimization loop we pick ONE
# RULE from all the triggered rules and apply all its suggestions to
# the appropriate options.
# bootstrapping the optimizer
print('Bootstrapping optimizer:')
options = copy.deepcopy(self.db_options)
old_data_sources, old_metric = (
self.bench_runner.run_experiment(options, self.base_db_path)
)
print('Initial metric: ' + str(old_metric))
self.rule_parser.load_rules_from_spec()
self.rule_parser.perform_section_checks()
triggered_rules = self.rule_parser.get_triggered_rules(
old_data_sources, options.get_column_families()
)
print('\nTriggered:')
self.rule_parser.print_rules(triggered_rules)
backtrack = False
rules_tried = set()
curr_rule, rules_tried, curr_conf, updated_conf = (
ConfigOptimizer.apply_suggestions(
triggered_rules,
None,
rules_tried,
backtrack,
options,
self.rule_parser.get_suggestions_dict()
)
)
# the optimizer loop
while curr_rule:
print('\nRule picked for next iteration:')
print(curr_rule.name)
print('\ncurrent config:')
print(curr_conf)
print('updated config:')
print(updated_conf)
options.update_options(updated_conf)
# run bench_runner with updated config
new_data_sources, new_metric = (
self.bench_runner.run_experiment(options, self.base_db_path)
)
print('\nnew metric: ' + str(new_metric))
backtrack = not self.bench_runner.is_metric_better(
new_metric, old_metric
)
# update triggered_rules, metric, data_sources, if required
if backtrack:
# revert changes to options config
print('\nBacktracking to previous configuration')
backtrack_conf = ConfigOptimizer.get_backtrack_config(
curr_conf, updated_conf
)
options.update_options(backtrack_conf)
else:
# run advisor on new data sources
self.rule_parser.load_rules_from_spec() # reboot the advisor
self.rule_parser.perform_section_checks()
triggered_rules = self.rule_parser.get_triggered_rules(
new_data_sources, options.get_column_families()
)
print('\nTriggered:')
self.rule_parser.print_rules(triggered_rules)
old_metric = new_metric
old_data_sources = new_data_sources
rules_tried = set()
# pick rule to work on and set curr_rule to that
curr_rule, rules_tried, curr_conf, updated_conf = (
ConfigOptimizer.apply_suggestions(
triggered_rules,
curr_rule.name,
rules_tried,
backtrack,
options,
self.rule_parser.get_suggestions_dict()
)
)
# return the final database options configuration
return options

@@ -4,18 +4,21 @@
# (found in the LICENSE.Apache file in the root directory).
from abc import ABC, abstractmethod
from calendar import timegm
from enum import Enum
import glob
import re
from enum import Enum
import time
NO_COL_FAMILY = 'DB_WIDE'
class DataSource(ABC):
class Type(Enum):
LOG = 1
DB_OPTIONS = 2
STATS = 3
PERF_CONTEXT = 4
ODS = 5
TIME_SERIES = 3
def __init__(self, type):
self.type = type
@@ -33,15 +36,30 @@ class Log:
date_regex = '\d{4}/\d{2}/\d{2}-\d{2}:\d{2}:\d{2}\.\d{6}'
return re.match(date_regex, log_line)
def __init__(self, log_line):
def __init__(self, log_line, column_families):
token_list = log_line.strip().split()
self.time = token_list[0]
self.context = token_list[1]
self.message = " ".join(token_list[2:])
self.column_family = None
# example log for 'default' column family:
# "2018/07/25-17:29:05.176080 7f969de68700 [db/compaction_job.cc:1634]
# [default] [JOB 3] Compacting 24@0 + 16@1 files to L1, score 6.00\n"
for col_fam in column_families:
search_for_str = '\[' + col_fam + '\]'
if re.search(search_for_str, self.message):
self.column_family = col_fam
break
if not self.column_family:
self.column_family = NO_COL_FAMILY
def get_time(self):
def get_human_readable_time(self):
# example from a log line: '2018/07/25-11:25:45.782710'
return self.time
def get_column_family(self):
return self.column_family
def get_context(self):
return self.context
@@ -49,48 +67,65 @@ class Log:
return self.message
def append_message(self, remaining_log):
self.message = self.message + remaining_log
self.message = self.message + '\n' + remaining_log.strip()
def get_timestamp(self):
# example: '2018/07/25-11:25:45.782710' will be converted to the GMT
# Unix timestamp 1532517945 (note: this method assumes that self.time
# is in GMT)
hr_time = self.time + 'GMT'
timestamp = timegm(time.strptime(hr_time, "%Y/%m/%d-%H:%M:%S.%f%Z"))
return timestamp
def __repr__(self):
return 'time: ' + self.time + ', context: ' + self.context +\
', message: ' + self.message
return (
'time: ' + self.time + '; context: ' + self.context +
'; col_fam: ' + self.column_family +
'; message: ' + self.message
)
class DatabaseLogs(DataSource):
def __init__(self, logs_path_prefix):
def __init__(self, logs_path_prefix, column_families):
super().__init__(DataSource.Type.LOG)
self.logs_path_prefix = logs_path_prefix
self.column_families = column_families
def trigger_appropriate_conditions(self, conditions, log):
conditions_to_be_removed = []
def trigger_conditions_for_log(self, conditions, log):
# For a LogCondition object, trigger is:
# Dict[column_family_name, List[Log]]. This explains why the condition
# was triggered and for which column families.
for cond in conditions:
if re.search(cond.regex, log.get_message(), re.IGNORECASE):
cond.set_trigger(log)
conditions_to_be_removed.append(cond)
for remove_cond in conditions_to_be_removed:
conditions.remove(remove_cond)
return conditions
trigger = cond.get_trigger()
if not trigger:
trigger = {}
if log.get_column_family() not in trigger:
trigger[log.get_column_family()] = []
trigger[log.get_column_family()].append(log)
cond.set_trigger(trigger)
def check_and_trigger_conditions(self, conditions):
for file_name in glob.glob(self.logs_path_prefix + '*'):
# TODO(poojam23): find a way to distinguish between log files
# - generated in the current experiment but are labeled 'old'
# because they LOGs exceeded the file size limit AND
# - generated in some previous experiment that are also labeled
# 'old' and were not deleted for some reason
if re.search('old', file_name, re.IGNORECASE):
continue
with open(file_name, 'r') as db_logs:
new_log = None
for line in db_logs:
if not conditions:
break
if Log.is_new_log(line):
if new_log:
conditions = self.trigger_appropriate_conditions(
conditions,
new_log
self.trigger_conditions_for_log(
conditions, new_log
)
new_log = Log(line)
new_log = Log(line, self.column_families)
else:
# To account for logs split into multiple lines
new_log.append_message(line)
# Check for the last log in the file.
if new_log and conditions:
conditions = self.trigger_appropriate_conditions(
conditions,
new_log
)
if new_log:
self.trigger_conditions_for_log(conditions, new_log)

@@ -3,8 +3,11 @@
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).
from advisor.db_log_parser import DataSource
import copy
from advisor.db_log_parser import DataSource, NO_COL_FAMILY
from advisor.ini_parser import IniParser
from advisor.rule_parser import Condition, OptionCondition
import os
class OptionsSpecParser(IniParser):
@@ -16,7 +19,8 @@ class OptionsSpecParser(IniParser):
def get_section_type(line):
'''
Example section header: [TableOptions/BlockBasedTable "default"]
Here section_type returned would be 'TableOptions.BlockBasedTable'
Here ConfigurationOptimizer returned would be
'TableOptions.BlockBasedTable'
'''
section_path = line.strip()[1:-1].split()[0]
section_type = '.'.join(section_path.split('/'))
@@ -29,79 +33,407 @@ class OptionsSpecParser(IniParser):
return None
return token_list[1]
@staticmethod
def get_section_str(section_type, section_name):
# Example:
# Case 1: get_section_str('DBOptions', NO_COL_FAMILY)
# Case 2: get_section_str('TableOptions.BlockBasedTable', 'default')
section_type = '/'.join(section_type.strip().split('.'))
# Case 1: section_type = 'DBOptions'
# Case 2: section_type = 'TableOptions/BlockBasedTable'
section_str = '[' + section_type
if section_name == NO_COL_FAMILY:
# Case 1: '[DBOptions]'
return (section_str + ']')
else:
# Case 2: '[TableOptions/BlockBasedTable "default"]'
return section_str + ' "' + section_name + '"]'
@staticmethod
def get_option_str(key, values):
option_str = key + '='
# get_option_str('db_log_dir', None), returns 'db_log_dir='
if values:
# example:
# get_option_str('max_bytes_for_level_multiplier_additional',
# [1,1,1,1,1,1,1]), returned string:
# 'max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1'
if isinstance(values, list):
for value in values:
option_str += (str(value) + ':')
option_str = option_str[:-1]
else:
# example: get_option_str('write_buffer_size', 1048576)
# returned string: 'write_buffer_size=1048576'
option_str += str(values)
return option_str
class DatabaseOptions(DataSource):
def __init__(self, rocksdb_options):
@staticmethod
def is_misc_option(option_name):
return '.' not in option_name
@staticmethod
def get_options_diff(opt_old, opt_new):
# type: Dict[option, Dict[col_fam, value]] X 2 ->
# Dict[option, Dict[col_fam, Tuple(old_value, new_value)]]
# note: diff should contain a tuple of values only if they are
# different from each other
options_union = set(opt_old.keys()).union(set(opt_new.keys()))
diff = {}
for opt in options_union:
diff[opt] = {}
# if option in options_union, then it must be in one of the configs
if opt not in opt_old:
for col_fam in opt_new[opt]:
diff[opt][col_fam] = (None, opt_new[opt][col_fam])
elif opt not in opt_new:
for col_fam in opt_old[opt]:
diff[opt][col_fam] = (opt_old[opt][col_fam], None)
else:
for col_fam in opt_old[opt]:
if col_fam in opt_new[opt]:
if opt_old[opt][col_fam] != opt_new[opt][col_fam]:
diff[opt][col_fam] = (
opt_old[opt][col_fam],
opt_new[opt][col_fam]
)
else:
diff[opt][col_fam] = (opt_old[opt][col_fam], None)
for col_fam in opt_new[opt]:
if col_fam in opt_old[opt]:
if opt_old[opt][col_fam] != opt_new[opt][col_fam]:
diff[opt][col_fam] = (
opt_old[opt][col_fam],
opt_new[opt][col_fam]
)
else:
diff[opt][col_fam] = (None, opt_new[opt][col_fam])
if not diff[opt]:
diff.pop(opt)
return diff
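# Illustrative example (hypothetical values), with NO_COL_FAMILY = 'DB_WIDE':
#   opt_old = {'DBOptions.max_background_jobs': {'DB_WIDE': '2'}}
#   opt_new = {'DBOptions.max_background_jobs': {'DB_WIDE': '4'},
#              'bloom_bits': {'DB_WIDE': '2'}}
# get_options_diff(opt_old, opt_new) returns:
#   {'DBOptions.max_background_jobs': {'DB_WIDE': ('2', '4')},
#    'bloom_bits': {'DB_WIDE': (None, '2')}}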
def __init__(self, rocksdb_options, misc_options=None):
super().__init__(DataSource.Type.DB_OPTIONS)
self.options_path = rocksdb_options
# Load the options from the given file to a dictionary.
self.load_from_source()
# The options are stored in the following data structure:
# Dict[section_type, Dict[section_name, Dict[option_name, value]]]
self.options_dict = None
self.column_families = None
# Load the options from the given file to a dictionary.
self.load_from_source(rocksdb_options)
# Setup the miscellaneous options expected to be List[str], where each
# element in the List has the format "<option_name>=<option_value>"
# These options are the ones that are not yet supported by the Rocksdb
# OPTIONS file, so they are provided separately
self.setup_misc_options(misc_options)
def setup_misc_options(self, misc_options):
self.misc_options = {}
if misc_options:
for option_pair_str in misc_options:
option_name = option_pair_str.split('=')[0].strip()
option_value = option_pair_str.split('=')[1].strip()
self.misc_options[option_name] = option_value
def load_from_source(self):
def load_from_source(self, options_path):
self.options_dict = {}
with open(self.options_path, 'r') as db_options:
with open(options_path, 'r') as db_options:
for line in db_options:
line = OptionsSpecParser.remove_trailing_comment(line)
if not line:
continue
if OptionsSpecParser.is_section_header(line):
curr_sec_type = OptionsSpecParser.get_section_type(line)
curr_sec_type = (
OptionsSpecParser.get_section_type(line)
)
curr_sec_name = OptionsSpecParser.get_section_name(line)
if curr_sec_name:
option_prefix = curr_sec_name + '.' + curr_sec_type
if curr_sec_type not in self.options_dict:
self.options_dict[curr_sec_type] = {}
if not curr_sec_name:
curr_sec_name = NO_COL_FAMILY
self.options_dict[curr_sec_type][curr_sec_name] = {}
# example: if the line read from the Rocksdb OPTIONS file
# is [CFOptions "default"], then the section type is
# CFOptions and 'default' is the name of a column family
# that for this database, so it's added to the list of
# column families stored in this object
if curr_sec_type == 'CFOptions':
if not self.column_families:
self.column_families = []
self.column_families.append(curr_sec_name)
else:
option_prefix = curr_sec_type
elif OptionsSpecParser.is_new_option(line):
key, value = OptionsSpecParser.get_key_value_pair(line)
if not self.options_dict:
self.options_dict = {}
self.options_dict[option_prefix + '.' + key] = value
self.options_dict[curr_sec_type][curr_sec_name][key] = (
value
)
else:
error = 'Not able to parse line in Options file.'
OptionsSpecParser.exit_with_parse_error(line, error)
def get_misc_options(self):
# these are options that are not yet supported by the Rocksdb OPTIONS
# file, hence they are provided and stored separately
return self.misc_options
def get_column_families(self):
return self.column_families
def get_all_options(self):
# This method returns all the options that are stored in this object as
# a: Dict[<sec_type>.<option_name>: Dict[col_fam, option_value]]
all_options = []
# Example: in the section header '[CFOptions "default"]' read from the
# OPTIONS file, sec_type='CFOptions'
for sec_type in self.options_dict:
for col_fam in self.options_dict[sec_type]:
for opt_name in self.options_dict[sec_type][col_fam]:
option = sec_type + '.' + opt_name
all_options.append(option)
all_options.extend(list(self.misc_options.keys()))
return self.get_options(all_options)
def get_options(self, reqd_options):
# type: List[str] -> Dict[str, Dict[str, Any]]
# List[option] -> Dict[option, Dict[col_fam, value]]
reqd_options_dict = {}
for option in reqd_options:
if DatabaseOptions.is_misc_option(option):
# the option is not prefixed by '<section_type>.' because it is
# not yet supported by the Rocksdb OPTIONS file; so it has to
# be fetched from the misc_options dictionary
if option not in self.misc_options:
continue
if option not in reqd_options_dict:
reqd_options_dict[option] = {}
reqd_options_dict[option][NO_COL_FAMILY] = (
self.misc_options[option]
)
else:
# Example: option = 'TableOptions.BlockBasedTable.block_align'
# then, sec_type = 'TableOptions.BlockBasedTable'
sec_type = '.'.join(option.split('.')[:-1])
# opt_name = 'block_align'
opt_name = option.split('.')[-1]
if sec_type not in self.options_dict:
continue
for col_fam in self.options_dict[sec_type]:
if opt_name in self.options_dict[sec_type][col_fam]:
if option not in reqd_options_dict:
reqd_options_dict[option] = {}
reqd_options_dict[option][col_fam] = (
self.options_dict[sec_type][col_fam][opt_name]
)
return reqd_options_dict
def update_options(self, options):
# An example 'options' object looks like:
# {'DBOptions.max_background_jobs': {NO_COL_FAMILY: 2},
# 'CFOptions.write_buffer_size': {'default': 1048576, 'cf_A': 128000},
# 'bloom_bits': {NO_COL_FAMILY: 4}}
for option in options:
if DatabaseOptions.is_misc_option(option):
# this is a misc_option i.e. an option that is not yet
# supported by the Rocksdb OPTIONS file, so it is not prefixed
# by '<section_type>.' and must be stored in the separate
# misc_options dictionary
if NO_COL_FAMILY not in options[option]:
print(
'WARNING(DatabaseOptions.update_options): not ' +
'updating option ' + option + ' because it is in ' +
'misc_option format but its scope is not ' +
NO_COL_FAMILY + '. Check format of option.'
)
continue
self.misc_options[option] = options[option][NO_COL_FAMILY]
else:
sec_name = '.'.join(option.split('.')[:-1])
opt_name = option.split('.')[-1]
if sec_name not in self.options_dict:
self.options_dict[sec_name] = {}
for col_fam in options[option]:
# if the option is not already present in the dictionary,
# it will be inserted, else it will be updated to the new
# value
if col_fam not in self.options_dict[sec_name]:
self.options_dict[sec_name][col_fam] = {}
self.options_dict[sec_name][col_fam][opt_name] = (
copy.deepcopy(options[option][col_fam])
)
def generate_options_config(self, nonce):
# this method generates a Rocksdb OPTIONS file in the INI format from
# the options stored in self.options_dict
this_path = os.path.abspath(os.path.dirname(__file__))
file_name = '../temp/OPTIONS_' + str(nonce) + '.tmp'
file_path = os.path.join(this_path, file_name)
with open(file_path, 'w') as fp:
for section in self.options_dict:
for col_fam in self.options_dict[section]:
fp.write(
OptionsSpecParser.get_section_str(section, col_fam) +
'\n'
)
for option in self.options_dict[section][col_fam]:
values = self.options_dict[section][col_fam][option]
fp.write(
OptionsSpecParser.get_option_str(option, values) +
'\n'
)
fp.write('\n')
return file_path
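# For illustration, the generated OPTIONS file contains sections in the form
# produced by get_section_str()/get_option_str() above (values hypothetical):
#   [DBOptions]
#   stats_dump_period_sec=20
#   [CFOptions "default"]
#   write_buffer_size=1048576
#   [TableOptions/BlockBasedTable "default"]
#   block_align=false
# and is written to ../temp/OPTIONS_<nonce>.tmp relative to this module.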
def check_and_trigger_conditions(self, conditions):
'''
For every condition, set_trigger is called only if all the options it
requires are available; the trigger then records the option values (per
column family, or DB_WIDE) for which 'eval_expr' evaluates to True.
'''
for cond in conditions:
# This contains the indices of options to whose name the column
# family name needs to be prepended in order to create the full
# option name as parsed from the options file.
reqd_options_dict = self.get_options(cond.options)
# This contains the indices of options that are specific to some
# column family and are not database-wide options.
incomplete_option_ix = []
ix = 0
options = []
for option in cond.options:
if option in self.options_dict.keys():
options.append(self.options_dict[option])
missing_reqd_option = False
for ix, option in enumerate(cond.options):
if option not in reqd_options_dict:
print(
'WARNING(DatabaseOptions.check_and_trigger): ' +
'skipping condition ' + cond.name + ' because it '
'requires option ' + option + ' but this option is' +
' not available'
)
missing_reqd_option = True
break # required option is absent
if NO_COL_FAMILY in reqd_options_dict[option]:
options.append(reqd_options_dict[option][NO_COL_FAMILY])
else:
options.append(None)
incomplete_option_ix.append(ix)
options.append(0)
ix += 1
# if all the options were present as is:
if missing_reqd_option:
continue
# if all the options are database-wide options
if not incomplete_option_ix:
if not eval(cond.eval_expr):
cond.set_trigger(cond.eval_expr)
try:
if eval(cond.eval_expr):
cond.set_trigger({NO_COL_FAMILY: options})
except Exception as e:
print(
'WARNING(DatabaseOptions) check_and_trigger:' + str(e)
)
continue
# for all the options that were not present as is, we prepend them
# their names with every column family found in options file.
# for all the options that are not database-wide, we look for their
# values specific to column families
col_fam_options_dict = {}
for col_fam in self.column_families:
present = True
for ix in incomplete_option_ix:
full_option = col_fam + '.' + cond.options[ix]
if full_option not in self.options_dict.keys():
option = cond.options[ix]
if col_fam not in reqd_options_dict[option]:
present = False
break
options[ix] = self.options_dict[full_option]
if present and not eval(cond.eval_expr):
cond.set_trigger(cond.eval_expr)
options[ix] = reqd_options_dict[option][col_fam]
if present:
try:
if eval(cond.eval_expr):
col_fam_options_dict[col_fam] = (
copy.deepcopy(options)
)
except Exception as e:
print(
'WARNING(DatabaseOptions) check_and_trigger: ' +
str(e)
)
# Trigger for an OptionCondition object is of the form:
# Dict[col_fam_name: List[option_value]]
# where col_fam_name is the name of a column family for which
# 'eval_expr' evaluated to True and List[option_value] is the list
# of values of the options specified in the condition's 'options'
# field
if col_fam_options_dict:
cond.set_trigger(col_fam_options_dict)
# TODO(poojam23): remove these methods once the unit tests for this class are
# in place
def main():
options_file = 'temp/OPTIONS_default.tmp'
misc_options = ["misc_opt1=10", "misc_opt2=100", "misc_opt3=1000"]
db_options = DatabaseOptions(options_file, misc_options)
print(db_options.get_column_families())
get_op = db_options.get_options([
'DBOptions.db_log_dir',
'DBOptions.is_fd_close_on_exec',
'CFOptions.memtable_prefix_bloom_size_ratio',
'TableOptions.BlockBasedTable.verify_compression',
'misc_opt1',
'misc_opt3'
])
print(get_op)
get_op['DBOptions.db_log_dir'][NO_COL_FAMILY] = 'some_random_path'
get_op['CFOptions.memtable_prefix_bloom_size_ratio']['default'] = 2.31
get_op['TableOptions.BlockBasedTable.verify_compression']['default'] = 4.4
get_op['misc_opt2'] = {}
get_op['misc_opt2'][NO_COL_FAMILY] = 2
db_options.update_options(get_op)
print('options updated in ' + db_options.generate_options_config(123))
print('misc options ' + repr(db_options.get_misc_options()))
options_file = 'temp/OPTIONS_123.tmp'
db_options = DatabaseOptions(options_file, misc_options)
# only CFOptions
cond1 = Condition('opt-cond-1')
cond1 = OptionCondition.create(cond1)
cond1.set_parameter(
'options', [
'CFOptions.level0_file_num_compaction_trigger',
'CFOptions.write_buffer_size',
'CFOptions.max_bytes_for_level_base'
]
)
cond1.set_parameter(
'evaluate',
'int(options[0])*int(options[1])-int(options[2])>=0'
)
# only DBOptions
cond2 = Condition('opt-cond-2')
cond2 = OptionCondition.create(cond2)
cond2.set_parameter(
'options', [
'DBOptions.max_file_opening_threads',
'DBOptions.table_cache_numshardbits',
'misc_opt2',
'misc_opt3'
]
)
cond2_expr = (
'(int(options[0])*int(options[2]))-' +
'((4*int(options[1])*int(options[3]))/10)==0'
)
cond2.set_parameter('evaluate', cond2_expr)
# mix of CFOptions and DBOptions
cond3 = Condition('opt-cond-3')
cond3 = OptionCondition.create(cond3)
cond3.set_parameter(
'options', [
'DBOptions.max_background_jobs', # 2
'DBOptions.write_thread_slow_yield_usec', # 3
'CFOptions.num_levels', # 7
'misc_opt1' # 10
]
)
cond3_expr = (
'(int(options[3])*int(options[2]))-' +
'(int(options[1])*int(options[0]))==64'
)
cond3.set_parameter('evaluate', cond3_expr)
db_options.check_and_trigger_conditions([cond1, cond2, cond3])
print(cond1.get_trigger()) # {'col-fam-B': ['4', '10', '10']}
print(cond2.get_trigger()) # {'DB_WIDE': ['16', '4']}
# {'col-fam-B': ['2', '3', '10'], 'col-fam-A': ['2', '3', '7']}
print(cond3.get_trigger())
if __name__ == "__main__":
main()

@@ -0,0 +1,421 @@
# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
# This source code is licensed under both the GPLv2 (found in the
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).
from advisor.db_log_parser import Log
from advisor.db_timeseries_parser import TimeSeriesData, NO_ENTITY
from advisor.rule_parser import Condition, TimeSeriesCondition
import copy
import glob
import re
import subprocess
import time
class LogStatsParser(TimeSeriesData):
STATS = 'STATISTICS:'
@staticmethod
def parse_log_line_for_stats(log_line):
# Example stat line (from LOG file):
# "rocksdb.db.get.micros P50 : 8.4 P95 : 21.8 P99 : 33.9 P100 : 92.0\n"
token_list = log_line.strip().split()
# token_list = ['rocksdb.db.get.micros', 'P50', ':', '8.4', 'P95', ':',
# '21.8', 'P99', ':', '33.9', 'P100', ':', '92.0']
stat_prefix = token_list[0] + '.' # 'rocksdb.db.get.micros.'
stat_values = [
token
for token in token_list[1:]
if token != ':'
]
# stat_values = ['P50', '8.4', 'P95', '21.8', 'P99', '33.9', 'P100',
# '92.0']
stat_dict = {}
for ix, metric in enumerate(stat_values):
if ix % 2 == 0:
stat_name = stat_prefix + metric
stat_name = stat_name.lower() # Note: case insensitive names
else:
stat_dict[stat_name] = float(metric)
# stat_dict = {'rocksdb.db.get.micros.p50': 8.4,
# 'rocksdb.db.get.micros.p95': 21.8, 'rocksdb.db.get.micros.p99': 33.9,
# 'rocksdb.db.get.micros.p100': 92.0}
return stat_dict
def __init__(self, logs_path_prefix, stats_freq_sec):
super().__init__()
self.logs_file_prefix = logs_path_prefix
self.stats_freq_sec = stats_freq_sec
self.duration_sec = 60
def get_keys_from_conditions(self, conditions):
# Note: case insensitive stat names
reqd_stats = []
for cond in conditions:
for key in cond.keys:
key = key.lower()
# some keys are prepended with '[]' for OdsStatsFetcher to
# replace this with the appropriate key_prefix, remove these
# characters here since the LogStatsParser does not need
# a prefix
if key.startswith('[]'):
reqd_stats.append(key[2:])
else:
reqd_stats.append(key)
return reqd_stats
def add_to_timeseries(self, log, reqd_stats):
# this method takes in the Log object that contains the Rocksdb stats
# and a list of required stats, then it parses the stats line by line
# to fetch required stats and add them to the keys_ts object
# Example: reqd_stats = ['rocksdb.block.cache.hit.count',
# 'rocksdb.db.get.micros.p99']
# Let log.get_message() returns following string:
# "[WARN] [db/db_impl.cc:485] STATISTICS:\n
# rocksdb.block.cache.miss COUNT : 1459\n
# rocksdb.block.cache.hit COUNT : 37\n
# ...
# rocksdb.db.get.micros P50 : 15.6 P95 : 39.7 P99 : 62.6 P100 : 148.0\n
# ..."
new_lines = log.get_message().split('\n')
# let log_ts = 1532518219
log_ts = log.get_timestamp()
# example updates to keys_ts:
# keys_ts[NO_ENTITY]['rocksdb.db.get.micros.p99'][1532518219] = 62.6
# keys_ts[NO_ENTITY]['rocksdb.block.cache.hit.count'][1532518219] = 37
for line in new_lines[1:]: # new_lines[0] does not contain any stats
stats_on_line = self.parse_log_line_for_stats(line)
for stat in stats_on_line:
if stat in reqd_stats:
if stat not in self.keys_ts[NO_ENTITY]:
self.keys_ts[NO_ENTITY][stat] = {}
self.keys_ts[NO_ENTITY][stat][log_ts] = stats_on_line[stat]
def fetch_timeseries(self, reqd_stats):
# this method parses the Rocksdb LOG file and generates timeseries for
# each of the statistic in the list reqd_stats
self.keys_ts = {NO_ENTITY: {}}
for file_name in glob.glob(self.logs_file_prefix + '*'):
# TODO(poojam23): find a way to distinguish between 'old' log files
# from current and previous experiments, present in the same
# directory
if re.search('old', file_name, re.IGNORECASE):
continue
with open(file_name, 'r') as db_logs:
new_log = None
for line in db_logs:
if Log.is_new_log(line):
if (
new_log and
re.search(self.STATS, new_log.get_message())
):
self.add_to_timeseries(new_log, reqd_stats)
new_log = Log(line, column_families=[])
else:
# To account for logs split into multiple lines
new_log.append_message(line)
# Check for the last log in the file.
if new_log and re.search(self.STATS, new_log.get_message()):
self.add_to_timeseries(new_log, reqd_stats)
class DatabasePerfContext(TimeSeriesData):
# TODO(poojam23): check if any benchrunner provides PerfContext sampled at
# regular intervals
def __init__(self, perf_context_ts, stats_freq_sec=0, cumulative=True):
'''
perf_context_ts is expected to be in the following format:
Dict[metric, Dict[timestamp, value]], where for
each (metric, timestamp) pair, the value is database-wide (i.e.
summed over all the threads involved)
if stats_freq_sec == 0, per-metric only one value is reported
'''
super().__init__()
self.stats_freq_sec = stats_freq_sec
self.keys_ts = {NO_ENTITY: perf_context_ts}
if cumulative:
self.unaccumulate_metrics()
def unaccumulate_metrics(self):
# if the perf context metrics provided are cumulative in nature, this
# method can be used to convert them to a disjoint format
epoch_ts = copy.deepcopy(self.keys_ts)
for stat in self.keys_ts[NO_ENTITY]:
timeseries = sorted(
list(self.keys_ts[NO_ENTITY][stat].keys()), reverse=True
)
if len(timeseries) < 2:
continue
for ix, ts in enumerate(timeseries[:-1]):
epoch_ts[NO_ENTITY][stat][ts] = (
epoch_ts[NO_ENTITY][stat][ts] -
epoch_ts[NO_ENTITY][stat][timeseries[ix+1]]
)
if epoch_ts[NO_ENTITY][stat][ts] < 0:
raise ValueError('DBPerfContext: really cumulative?')
# drop the smallest timestamp in the timeseries for this metric
epoch_ts[NO_ENTITY][stat].pop(timeseries[-1])
self.keys_ts = epoch_ts
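# Illustrative example (hypothetical stat and timestamps): a cumulative
# series {10: 100, 20: 150, 30: 210} becomes the per-interval series
# {20: 50, 30: 60}; the earliest timestamp is dropped because there is no
# previous sample to subtract from it.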
def get_keys_from_conditions(self, conditions):
reqd_stats = []
for cond in conditions:
reqd_stats.extend([key.lower() for key in cond.keys])
return reqd_stats
def fetch_timeseries(self, statistics):
# this method is redundant for DatabasePerfContext because the __init__
# does the job of populating 'keys_ts'
pass
class OdsStatsFetcher(TimeSeriesData):
# class constants
OUTPUT_FILE = 'temp/stats_out.tmp'
ERROR_FILE = 'temp/stats_err.tmp'
RAPIDO_COMMAND = "%s --entity=%s --key=%s --tstart=%s --tend=%s --showtime"
ODS_COMMAND = '%s %s %s' # client, entities, keys
# static methods
@staticmethod
def _get_string_in_quotes(value):
return '"' + str(value) + '"'
@staticmethod
def _get_time_value_pair(pair_string):
# example pair_string: '[1532544591, 97.3653601828]'
pair_string = pair_string.replace('[', '')
pair_string = pair_string.replace(']', '')
pair = pair_string.split(',')
first = int(pair[0].strip())
second = float(pair[1].strip())
return [first, second]
def __init__(self, client, entities, key_prefix=None):
super().__init__()
self.client = client
self.entities = entities
self.key_prefix = key_prefix
self.stats_freq_sec = 60
self.duration_sec = 60
# Fetch last 3 hours data by default
self.end_time = int(time.time())
self.start_time = self.end_time - (3 * 60 * 60)
def execute_script(self, command):
print('executing...')
print(command)
out_file = open(self.OUTPUT_FILE, "w+")
err_file = open(self.ERROR_FILE, "w+")
subprocess.call(command, shell=True, stdout=out_file, stderr=err_file)
out_file.close()
err_file.close()
def parse_rapido_output(self):
# Output looks like the following:
# <entity_name>\t<key_name>\t[[ts, value], [ts, value], ...]
# ts = timestamp; value = value of key_name in entity_name at time ts
self.keys_ts = {}
with open(self.OUTPUT_FILE, 'r') as fp:
for line in fp:
token_list = line.strip().split('\t')
entity = token_list[0]
key = token_list[1]
if entity not in self.keys_ts:
self.keys_ts[entity] = {}
if key not in self.keys_ts[entity]:
self.keys_ts[entity][key] = {}
list_of_lists = [
self._get_time_value_pair(pair_string)
for pair_string in token_list[2].split('],')
]
value = {pair[0]: pair[1] for pair in list_of_lists}
self.keys_ts[entity][key] = value
def parse_ods_output(self):
# Output looks like the following:
# <entity_name>\t<key_name>\t<timestamp>\t<value>
# there is one line per (entity_name, key_name, timestamp)
self.keys_ts = {}
with open(self.OUTPUT_FILE, 'r') as fp:
for line in fp:
token_list = line.split()
entity = token_list[0]
if entity not in self.keys_ts:
self.keys_ts[entity] = {}
key = token_list[1]
if key not in self.keys_ts[entity]:
self.keys_ts[entity][key] = {}
self.keys_ts[entity][key][token_list[2]] = token_list[3]
def fetch_timeseries(self, statistics):
# this method fetches the timeseries of required stats from the ODS
# service and populates the 'keys_ts' object appropriately
print('OdsStatsFetcher: fetching ' + str(statistics))
if re.search('rapido', self.client, re.IGNORECASE):
command = self.RAPIDO_COMMAND % (
self.client,
self._get_string_in_quotes(self.entities),
self._get_string_in_quotes(','.join(statistics)),
self._get_string_in_quotes(self.start_time),
self._get_string_in_quotes(self.end_time)
)
# Run the tool and fetch the time-series data
self.execute_script(command)
# Parse output and populate the 'keys_ts' map
self.parse_rapido_output()
elif re.search('ods', self.client, re.IGNORECASE):
command = self.ODS_COMMAND % (
self.client,
self._get_string_in_quotes(self.entities),
self._get_string_in_quotes(','.join(statistics))
)
# Run the tool and fetch the time-series data
self.execute_script(command)
# Parse output and populate the 'keys_ts' map
self.parse_ods_output()
def get_keys_from_conditions(self, conditions):
reqd_stats = []
for cond in conditions:
for key in cond.keys:
use_prefix = False
if key.startswith('[]'):
use_prefix = True
key = key[2:]
# TODO(poojam23): this is very hacky and needs to be improved
if key.startswith("rocksdb"):
key += ".60"
if use_prefix:
if not self.key_prefix:
print('Warning: OdsStatsFetcher might need key prefix')
print('for the key: ' + key)
else:
key = self.key_prefix + "." + key
reqd_stats.append(key)
return reqd_stats
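# Illustrative key transformations (with a hypothetical key_prefix 'my.ods.prefix'):
#   '[]rocksdb.block.cache.hit.count' -> 'my.ods.prefix.rocksdb.block.cache.hit.count.60'
#   'rocksdb.db.get.micros.p99'       -> 'rocksdb.db.get.micros.p99.60'
#   'some.other.counter'              -> 'some.other.counter' (unchanged)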
def fetch_rate_url(self, entities, keys, window_len, percent, display):
# type: (List[str], List[str], str, str, bool) -> str
transform_desc = (
"rate(" + str(window_len) + ",duration=" + str(self.duration_sec)
)
if percent:
transform_desc = transform_desc + ",%)"
else:
transform_desc = transform_desc + ")"
command = self.RAPIDO_COMMAND + " --transform=%s --url=%s"
command = command % (
self.client,
self._get_string_in_quotes(','.join(entities)),
self._get_string_in_quotes(','.join(keys)),
self._get_string_in_quotes(self.start_time),
self._get_string_in_quotes(self.end_time),
self._get_string_in_quotes(transform_desc),
self._get_string_in_quotes(display)
)
self.execute_script(command)
url = ""
with open(self.OUTPUT_FILE, 'r') as fp:
url = fp.readline()
return url
# TODO(poojam23): remove these blocks once the unittests for LogStatsParser are
# in place
def main():
# populating the statistics
log_stats = LogStatsParser('temp/db_stats_fetcher_main_LOG.tmp', 20)
print(log_stats.type)
print(log_stats.keys_ts)
print(log_stats.logs_file_prefix)
print(log_stats.stats_freq_sec)
print(log_stats.duration_sec)
statistics = [
'rocksdb.number.rate_limiter.drains.count',
'rocksdb.number.block.decompressed.count',
'rocksdb.db.get.micros.p50',
'rocksdb.manifest.file.sync.micros.p99',
'rocksdb.db.get.micros.p99'
]
log_stats.fetch_timeseries(statistics)
print()
print(log_stats.keys_ts)
# aggregated statistics
print()
print(log_stats.fetch_aggregated_values(
NO_ENTITY, statistics, TimeSeriesData.AggregationOperator.latest
))
print(log_stats.fetch_aggregated_values(
NO_ENTITY, statistics, TimeSeriesData.AggregationOperator.oldest
))
print(log_stats.fetch_aggregated_values(
NO_ENTITY, statistics, TimeSeriesData.AggregationOperator.max
))
print(log_stats.fetch_aggregated_values(
NO_ENTITY, statistics, TimeSeriesData.AggregationOperator.min
))
print(log_stats.fetch_aggregated_values(
NO_ENTITY, statistics, TimeSeriesData.AggregationOperator.avg
))
# condition 'evaluate_expression' that evaluates to true
cond1 = Condition('cond-1')
cond1 = TimeSeriesCondition.create(cond1)
cond1.set_parameter('keys', statistics)
cond1.set_parameter('behavior', 'evaluate_expression')
cond1.set_parameter('evaluate', 'keys[3]-keys[2]>=0')
cond1.set_parameter('aggregation_op', 'avg')
# condition 'evaluate_expression' that evaluates to false
cond2 = Condition('cond-2')
cond2 = TimeSeriesCondition.create(cond2)
cond2.set_parameter('keys', statistics)
cond2.set_parameter('behavior', 'evaluate_expression')
cond2.set_parameter('evaluate', '((keys[1]-(2*keys[0]))/100)<3000')
cond2.set_parameter('aggregation_op', 'latest')
# condition 'evaluate_expression' that evaluates to true; no aggregation_op
cond3 = Condition('cond-3')
cond3 = TimeSeriesCondition.create(cond3)
cond3.set_parameter('keys', [statistics[2], statistics[3]])
cond3.set_parameter('behavior', 'evaluate_expression')
cond3.set_parameter('evaluate', '(keys[1]/keys[0])>23')
# check remaining methods
conditions = [cond1, cond2, cond3]
print()
print(log_stats.get_keys_from_conditions(conditions))
log_stats.check_and_trigger_conditions(conditions)
print()
print(cond1.get_trigger())
print(cond2.get_trigger())
print(cond3.get_trigger())
# TODO(poojam23): shift this code to the unit tests for DatabasePerfContext
def check_perf_context_code():
string = (
" user_key_comparison_count = 675903942, " +
"block_cache_hit_count = 830086, " +
"get_from_output_files_time = 85088293818, " +
"seek_on_memtable_time = 0,"
)
token_list = string.split(',')
perf_context = {
token.split('=')[0].strip(): int(token.split('=')[1].strip())
for token in token_list
if token
}
timestamp = int(time.time())
perf_ts = {}
for key in perf_context:
perf_ts[key] = {}
start_val = perf_context[key]
for ix in range(5):
perf_ts[key][timestamp+(ix*10)] = start_val + (2 * ix)
db_perf_context = DatabasePerfContext(perf_ts, 10, True)
print(db_perf_context.keys_ts)
if __name__ == '__main__':
main()
check_perf_context_code()

@ -0,0 +1,208 @@
# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
# This source code is licensed under both the GPLv2 (found in the
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).
from abc import abstractmethod
from advisor.db_log_parser import DataSource
from enum import Enum
import math
NO_ENTITY = 'ENTITY_PLACEHOLDER'
class TimeSeriesData(DataSource):
class Behavior(Enum):
bursty = 1
evaluate_expression = 2
class AggregationOperator(Enum):
avg = 1
max = 2
min = 3
latest = 4
oldest = 5
def __init__(self):
super().__init__(DataSource.Type.TIME_SERIES)
self.keys_ts = None # Dict[entity, Dict[key, Dict[timestamp, value]]]
self.stats_freq_sec = None
@abstractmethod
def get_keys_from_conditions(self, conditions):
# This method takes in a list of time-series conditions; for each
# condition it manipulates the 'keys' in the way that is supported by
# the subclass implementing this method
pass
@abstractmethod
def fetch_timeseries(self, required_statistics):
# this method takes in a list of statistics and fetches the timeseries
# for each of them and populates the 'keys_ts' dictionary
pass
def fetch_burst_epochs(
self, entities, statistic, window_sec, threshold, percent
):
# type: (List[str], str, int, float, bool) -> Dict[str, Dict[int, float]]
# this method calculates the (percent) rate change in the 'statistic'
# for each entity (over 'window_sec' seconds) and returns the epochs
# where this rate change is greater than or equal to the 'threshold'
# value
if self.stats_freq_sec == 0:
# not time series data, cannot check for bursty behavior
return
if window_sec < self.stats_freq_sec:
window_sec = self.stats_freq_sec
# 'window_samples' is the number of windows to go back to
# compare the current window with, while calculating rate change.
window_samples = math.ceil(window_sec / self.stats_freq_sec)
burst_epochs = {}
# if percent = False:
# curr_val = value at window for which rate change is being calculated
# prev_val = value at window that is window_samples behind curr_window
# Then rate_without_percent =
# ((curr_val-prev_val)*duration_sec)/(curr_timestamp-prev_timestamp)
# if percent = True:
# rate_with_percent = (rate_without_percent * 100) / prev_val
# These calculations are in line with the rate() transform supported
# by ODS
for entity in entities:
if statistic not in self.keys_ts[entity]:
continue
timestamps = sorted(list(self.keys_ts[entity][statistic].keys()))
for ix in range(window_samples, len(timestamps), 1):
first_ts = timestamps[ix - window_samples]
last_ts = timestamps[ix]
first_val = self.keys_ts[entity][statistic][first_ts]
last_val = self.keys_ts[entity][statistic][last_ts]
diff = last_val - first_val
if percent:
diff = diff * 100 / first_val
rate = (diff * self.duration_sec) / (last_ts - first_ts)
# if the rate change is greater than the provided threshold,
# then the condition is triggered for entity at time 'last_ts'
if rate >= threshold:
if entity not in burst_epochs:
burst_epochs[entity] = {}
burst_epochs[entity][last_ts] = rate
return burst_epochs
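# Worked example (made-up numbers) of the rate computation in
# fetch_burst_epochs() above, assuming stats_freq_sec=20, duration_sec=600,
# window_sec=60 (hence window_samples=3) and percent=True.
def _burst_rate_example():
    duration_sec = 600
    series = {100: 1000.0, 120: 1010.0, 140: 1025.0, 160: 1300.0}
    timestamps = sorted(series)
    window_samples = 3
    first_ts, last_ts = timestamps[0], timestamps[window_samples]
    diff = series[last_ts] - series[first_ts]             # 300.0
    diff = diff * 100 / series[first_ts]                  # 30.0 (percent change)
    rate = (diff * duration_sec) / (last_ts - first_ts)   # 30 * 600 / 60 = 300.0
    return rate  # compared against the condition's rate_threshold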
def fetch_aggregated_values(self, entity, statistics, aggregation_op):
# type: (str, List[str], AggregationOperator) -> Dict[str, float]
# this method performs the aggregation specified by 'aggregation_op'
# on the timeseries of 'statistics' for 'entity' and returns:
# Dict[statistic, aggregated_value]
result = {}
for stat in statistics:
if stat not in self.keys_ts[entity]:
continue
agg_val = None
if aggregation_op is self.AggregationOperator.latest:
latest_timestamp = max(list(self.keys_ts[entity][stat].keys()))
agg_val = self.keys_ts[entity][stat][latest_timestamp]
elif aggregation_op is self.AggregationOperator.oldest:
oldest_timestamp = min(list(self.keys_ts[entity][stat].keys()))
agg_val = self.keys_ts[entity][stat][oldest_timestamp]
elif aggregation_op is self.AggregationOperator.max:
agg_val = max(list(self.keys_ts[entity][stat].values()))
elif aggregation_op is self.AggregationOperator.min:
agg_val = min(list(self.keys_ts[entity][stat].values()))
elif aggregation_op is self.AggregationOperator.avg:
values = list(self.keys_ts[entity][stat].values())
agg_val = sum(values) / len(values)
result[stat] = agg_val
return result
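# Illustrative sketch (made-up timeseries): what each AggregationOperator
# branch above yields for a single statistic.
def _aggregation_example():
    ts = {100: 4.0, 110: 9.0, 120: 6.0}  # Dict[timestamp, value]
    values = list(ts.values())
    return {
        'latest': ts[max(ts)],             # 6.0 (value at newest timestamp)
        'oldest': ts[min(ts)],             # 4.0 (value at oldest timestamp)
        'max': max(values),                # 9.0
        'min': min(values),                # 4.0
        'avg': sum(values) / len(values),  # 6.33...
    }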
def check_and_trigger_conditions(self, conditions):
# get the list of statistics that need to be fetched
reqd_keys = self.get_keys_from_conditions(conditions)
# fetch the required statistics and populate the map 'keys_ts'
self.fetch_timeseries(reqd_keys)
# Trigger the appropriate conditions
for cond in conditions:
complete_keys = self.get_keys_from_conditions([cond])
# Get the entities that have all statistics required by 'cond':
# an entity is checked for a given condition only if we possess all
# of the condition's 'keys' for that entity
entities_with_stats = []
for entity in self.keys_ts:
stat_missing = False
for stat in complete_keys:
if stat not in self.keys_ts[entity]:
stat_missing = True
break
if not stat_missing:
entities_with_stats.append(entity)
if not entities_with_stats:
continue
if cond.behavior is self.Behavior.bursty:
# for a condition that checks for bursty behavior, only one key
# should be present in the condition's 'keys' field
result = self.fetch_burst_epochs(
entities_with_stats,
complete_keys[0], # there should be only one key
cond.window_sec,
cond.rate_threshold,
True
)
# Trigger in this case is:
# Dict[entity_name, Dict[timestamp, rate_change]]
# where the inner dictionary contains rate_change values when
# the rate_change >= threshold provided, with the
# corresponding timestamps
if result:
cond.set_trigger(result)
elif cond.behavior is self.Behavior.evaluate_expression:
self.handle_evaluate_expression(
cond,
complete_keys,
entities_with_stats
)
def handle_evaluate_expression(self, condition, statistics, entities):
trigger = {}
# check 'condition' for each of these entities
for entity in entities:
if hasattr(condition, 'aggregation_op'):
# in this case, the aggregation operation is performed on each
# of the condition's 'keys' and the condition's 'expression' is
# then evaluated using the aggregated values; if it evaluates to
# True, the list of aggregated key values is added to the
# condition's trigger: Dict[entity_name, List[stats]]
result = self.fetch_aggregated_values(
entity, statistics, condition.aggregation_op
)
keys = [result[key] for key in statistics]
try:
if eval(condition.expression):
trigger[entity] = keys
except Exception as e:
print(
'WARNING(TimeSeriesData) check_and_trigger: ' + str(e)
)
else:
# assumption: all stats have same series of timestamps
# this is similar to the above but 'expression' is evaluated at
# each timestamp, since there is no aggregation, and all the
# epochs are added to the trigger when the condition's
# 'expression' evaluates to true; so the trigger is:
# Dict[entity, Dict[timestamp, List[stats]]]
for epoch in self.keys_ts[entity][statistics[0]].keys():
keys = [
self.keys_ts[entity][key][epoch]
for key in statistics
]
try:
if eval(condition.expression):
if entity not in trigger:
trigger[entity] = {}
trigger[entity][epoch] = keys
except Exception as e:
print(
'WARNING(TimeSeriesData) check_and_trigger: ' +
str(e)
)
if trigger:
condition.set_trigger(trigger)
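# Sketch of the two trigger shapes produced above; the entity, statistics
# and numbers are hypothetical. With an aggregation_op the trigger maps an
# entity to its aggregated key values, otherwise it maps an entity to the
# key values at every epoch where the expression held.
_example_trigger_with_aggregation = {
    'entity-A': [370.5, 30.2],                # List[stats]
}
_example_trigger_without_aggregation = {
    'entity-A': {                             # Dict[timestamp, List[stats]]
        1527000000: [370.5, 30.2],
        1527000060: [412.0, 31.7],
    },
}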

@ -62,8 +62,8 @@ class IniParser:
def get_key_value_pair(line):
line = line.strip()
key = line.split('=')[0].strip()
value = line.split('=')[1].strip()
if not value:
value = "=".join(line.split('=')[1:])
if value == "": # if the option has no value
return (key, None)
values = IniParser.get_list_from_value(value)
if len(values) == 1:

@ -4,11 +4,11 @@
# (found in the LICENSE.Apache file in the root directory).
from abc import ABC, abstractmethod
import argparse
from advisor.db_log_parser import DatabaseLogs, DataSource
from advisor.db_options_parser import DatabaseOptions
from advisor.db_log_parser import DataSource, NO_COL_FAMILY
from advisor.db_timeseries_parser import TimeSeriesData
from enum import Enum
from advisor.ini_parser import IniParser
import re
class Section(ABC):
@ -29,6 +29,9 @@ class Rule(Section):
super().__init__(name)
self.conditions = None
self.suggestions = None
self.overlap_time_seconds = None
self.trigger_entities = None
self.trigger_column_families = None
def set_parameter(self, key, value):
# If the Rule is associated with a single suggestion/condition, then
@ -45,6 +48,8 @@ class Rule(Section):
self.suggestions = [value]
else:
self.suggestions = value
elif key == 'overlap_time_period':
self.overlap_time_seconds = value
def get_suggestions(self):
return self.suggestions
@ -58,12 +63,133 @@ class Rule(Section):
raise ValueError(
self.name + ': rule must have at least one suggestion'
)
if self.overlap_time_seconds:
if len(self.conditions) != 2:
raise ValueError(
self.name + ": rule must be associated with 2 conditions\
in order to check for a time dependency between them"
)
time_format = '^\d+[smhd]$'
if (
not
re.match(time_format, self.overlap_time_seconds, re.IGNORECASE)
):
raise ValueError(
self.name + ": overlap_time_seconds format: \d+[smhd]"
)
else: # convert to seconds
in_seconds = int(self.overlap_time_seconds[:-1])
if self.overlap_time_seconds[-1] == 'm':
in_seconds *= 60
elif self.overlap_time_seconds[-1] == 'h':
in_seconds *= (60 * 60)
elif self.overlap_time_seconds[-1] == 'd':
in_seconds *= (24 * 60 * 60)
self.overlap_time_seconds = in_seconds
def get_overlap_timestamps(self, key1_trigger_epochs, key2_trigger_epochs):
# this method takes in two timeseries, i.e. the timestamps at which
# the rule's two TIME_SERIES conditions were triggered, and finds
# (if present) the first pair of timestamps at which the two
# conditions were triggered within 'overlap_time_seconds' of each other
key1_lower_bounds = [
epoch - self.overlap_time_seconds
for epoch in key1_trigger_epochs
]
key1_lower_bounds.sort()
key2_trigger_epochs.sort()
trigger_ix = 0
overlap_pair = None
for key1_lb in key1_lower_bounds:
while (
trigger_ix < len(key2_trigger_epochs) and  # bounds check first to avoid IndexError
key2_trigger_epochs[trigger_ix] < key1_lb
):
trigger_ix += 1
if trigger_ix >= len(key2_trigger_epochs):
break
if (
key2_trigger_epochs[trigger_ix] <=
key1_lb + (2 * self.overlap_time_seconds)
):
overlap_pair = (
key2_trigger_epochs[trigger_ix],
key1_lb + self.overlap_time_seconds
)
break
return overlap_pair
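# Worked example (made-up epochs) of the overlap search above with
# overlap_time_seconds=600: condition-1 fired at 1000 and 5000, condition-2
# fired at 1450; 1450 lies within 600s of 1000, so the first overlapping
# pair returned is (1450, 1000).
def _overlap_example():
    overlap = 600
    key1_epochs = [1000, 5000]  # condition-1 trigger timestamps
    key2_epochs = [1450]        # condition-2 trigger timestamps
    lower_bounds = sorted(epoch - overlap for epoch in key1_epochs)  # [400, 4400]
    key2_epochs.sort()
    ix, pair = 0, None
    for lb in lower_bounds:
        while ix < len(key2_epochs) and key2_epochs[ix] < lb:
            ix += 1
        if ix >= len(key2_epochs):
            break
        if key2_epochs[ix] <= lb + 2 * overlap:
            pair = (key2_epochs[ix], lb + overlap)
            break
    return pair  # (1450, 1000)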
def get_trigger_entities(self):
return self.trigger_entities
def get_trigger_column_families(self):
return self.trigger_column_families
def is_triggered(self, conditions_dict, column_families):
if self.overlap_time_seconds:
condition1 = conditions_dict[self.conditions[0]]
condition2 = conditions_dict[self.conditions[1]]
if not (
condition1.get_data_source() is DataSource.Type.TIME_SERIES and
condition2.get_data_source() is DataSource.Type.TIME_SERIES
):
raise ValueError(self.name + ': need 2 timeseries conditions')
map1 = condition1.get_trigger()
map2 = condition2.get_trigger()
if not (map1 and map2):
return False
def is_triggered(self, conditions_dict):
condition_triggers = []
for cond in self.conditions:
condition_triggers.append(conditions_dict[cond].is_triggered())
return all(condition_triggers)
self.trigger_entities = {}
is_triggered = False
entity_intersection = (
set(map1.keys()).intersection(set(map2.keys()))
)
for entity in entity_intersection:
overlap_timestamps_pair = (
self.get_overlap_timestamps(
list(map1[entity].keys()), list(map2[entity].keys())
)
)
if overlap_timestamps_pair:
self.trigger_entities[entity] = overlap_timestamps_pair
is_triggered = True
if is_triggered:
self.trigger_column_families = set(column_families)
return is_triggered
else:
all_conditions_triggered = True
self.trigger_column_families = set(column_families)
for cond_name in self.conditions:
cond = conditions_dict[cond_name]
if not cond.get_trigger():
all_conditions_triggered = False
break
if (
cond.get_data_source() is DataSource.Type.LOG or
cond.get_data_source() is DataSource.Type.DB_OPTIONS
):
cond_col_fam = set(cond.get_trigger().keys())
if NO_COL_FAMILY in cond_col_fam:
cond_col_fam = set(column_families)
self.trigger_column_families = (
self.trigger_column_families.intersection(cond_col_fam)
)
elif cond.get_data_source() is DataSource.Type.TIME_SERIES:
cond_entities = set(cond.get_trigger().keys())
if self.trigger_entities is None:
self.trigger_entities = cond_entities
else:
self.trigger_entities = (
self.trigger_entities.intersection(cond_entities)
)
if not (self.trigger_entities or self.trigger_column_families):
all_conditions_triggered = False
break
if not all_conditions_triggered: # clean up if rule not triggered
self.trigger_column_families = None
self.trigger_entities = None
return all_conditions_triggered
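# Sketch (hypothetical column families; NO_COL_FAMILY value assumed) of the
# scope narrowing in the non-overlap branch above: a condition triggered
# db-wide counts for every column family, and each LOG/OPTIONS condition
# narrows the rule's trigger_column_families by intersection.
def _scope_intersection_example():
    NO_COL_FAMILY = 'DB_WIDE'  # assumed placeholder, for illustration only
    column_families = {'default', 'col_fam_A', 'col_fam_B'}
    cond_trigger_keys = [
        {'col_fam_A', 'col_fam_B'},  # LOG condition triggered for 2 families
        {NO_COL_FAMILY},             # OPTIONS condition triggered db-wide
    ]
    scope = set(column_families)
    for col_fams in cond_trigger_keys:
        if NO_COL_FAMILY in col_fams:
            col_fams = set(column_families)
        scope = scope.intersection(col_fams)
    return scope  # {'col_fam_A', 'col_fam_B'}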
def __repr__(self):
# Append conditions
@ -84,6 +210,10 @@ class Rule(Section):
is_first = False
else:
rule_string += (", " + sugg)
if self.trigger_entities:
rule_string += (', entities:: ' + str(self.trigger_entities))
if self.trigger_column_families:
rule_string += (', col_fam:: ' + str(self.trigger_column_families))
# Return constructed string
return rule_string
@ -98,18 +228,27 @@ class Suggestion(Section):
super().__init__(name)
self.option = None
self.action = None
self.suggested_value = None
self.suggested_values = None
self.description = None
def set_parameter(self, key, value):
if key == 'option':
# Note:
# case 1: 'option' is supported by Rocksdb OPTIONS file; in this
# case the option belongs to one of the sections in the config
# file and its name is prefixed by "<section_type>."
# case 2: 'option' is not supported by Rocksdb OPTIONS file; the
# option is not expected to have the character '.' in its name
self.option = value
elif key == 'action':
if self.option and not value:
raise ValueError(self.name + ': provide action for option')
self.action = self.Action[value]
elif key == 'suggested_value':
self.suggested_value = value
elif key == 'suggested_values':
if isinstance(value, str):
self.suggested_values = [value]
else:
self.suggested_values = value
elif key == 'description':
self.description = value
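# Illustrative note (option names taken from rules.ini): case 1 options carry
# their OPTIONS-file section as a dotted prefix, case 2 options contain no
# '.', which is how the two cases described above can be distinguished.
_example_suggestion_options = {
    'DBOptions.max_background_flushes': 'case 1: OPTIONS-file option',
    'TableOptions.BlockBasedTable.block_size': 'case 1: OPTIONS-file option',
    'bloom_bits': 'case 2: not an OPTIONS-file option',
}
_is_options_file_option = {
    opt: ('.' in opt) for opt in _example_suggestion_options
}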
@ -119,33 +258,28 @@ class Suggestion(Section):
raise ValueError(self.name + ': provide option or description')
if not self.action:
raise ValueError(self.name + ': provide action for option')
if self.action is self.Action.set and not self.suggested_value:
if self.action is self.Action.set and not self.suggested_values:
raise ValueError(
self.name + ': provide suggested value for option'
)
def __repr__(self):
sugg_string = "Suggestion: " + self.name
if self.description:
return self.description
sugg_string = ""
if self.action is self.Action.set:
sugg_string = (
self.name + ' suggests setting ' + self.option +
' to ' + self.suggested_value
)
sugg_string += (' description : ' + self.description)
else:
sugg_string = self.name + ' suggests ' + self.action.name + ' in '
sugg_string += (self.option + '.')
if self.suggested_value:
sugg_string += (
' The suggested value is ' + self.suggested_value
' option : ' + self.option + ' action : ' + self.action.name
)
if self.suggested_values:
sugg_string += (
' suggested_values : ' + str(self.suggested_values)
)
return sugg_string
class Condition(Section):
def __init__(self, name):
# a rule is identified by its name, so there should be no duplicates
super().__init__(name)
self.data_source = None
self.trigger = None
@ -166,6 +300,9 @@ class Condition(Section):
def set_trigger(self, condition_trigger):
self.trigger = condition_trigger
def get_trigger(self):
return self.trigger
def is_triggered(self):
if self.trigger:
return True
@ -173,7 +310,7 @@ class Condition(Section):
def set_parameter(self, key, value):
# must be defined by the subclass
raise ValueError(self.name + ': provide source for condition')
raise NotImplementedError(self.name + ': provide source for condition')
class LogCondition(Condition):
@ -183,15 +320,9 @@ class LogCondition(Condition):
base_condition.__class__ = cls
return base_condition
class Scope(Enum):
database = 1
column_family = 2
def set_parameter(self, key, value):
if key == 'regex':
self.regex = value
elif key == 'scope':
self.scope = self.Scope[value]
def perform_checks(self):
super().perform_checks()
@ -199,10 +330,10 @@ class LogCondition(Condition):
raise ValueError(self.name + ': provide regex for log condition')
def __repr__(self):
log_cond_str = (
self.name + ' checks if the regex ' + self.regex + ' is found ' +
' in the LOG file in the scope of ' + self.scope.name
)
log_cond_str = "LogCondition: " + self.name
log_cond_str += (" regex: " + self.regex)
# if self.trigger:
# log_cond_str += (" trigger: " + str(self.trigger))
return log_cond_str
@ -215,8 +346,11 @@ class OptionCondition(Condition):
def set_parameter(self, key, value):
if key == 'options':
if isinstance(value, str):
self.options = [value]
else:
self.options = value
if key == 'evaluate':
elif key == 'evaluate':
self.eval_expr = value
def perform_checks(self):
@ -227,15 +361,77 @@ class OptionCondition(Condition):
raise ValueError(self.name + ': expression missing in condition')
def __repr__(self):
log_cond_str = (
self.name + ' checks if the given expression evaluates to true'
)
return log_cond_str
opt_cond_str = "OptionCondition: " + self.name
opt_cond_str += (" options: " + str(self.options))
opt_cond_str += (" expression: " + self.eval_expr)
if self.trigger:
opt_cond_str += (" trigger: " + str(self.trigger))
return opt_cond_str
class TimeSeriesCondition(Condition):
@classmethod
def create(cls, base_condition):
base_condition.set_data_source(DataSource.Type['TIME_SERIES'])
base_condition.__class__ = cls
return base_condition
def set_parameter(self, key, value):
if key == 'keys':
if isinstance(value, str):
self.keys = [value]
else:
self.keys = value
elif key == 'behavior':
self.behavior = TimeSeriesData.Behavior[value]
elif key == 'rate_threshold':
self.rate_threshold = float(value)
elif key == 'window_sec':
self.window_sec = int(value)
elif key == 'evaluate':
self.expression = value
elif key == 'aggregation_op':
self.aggregation_op = TimeSeriesData.AggregationOperator[value]
def perform_checks(self):
if not self.keys:
raise ValueError(self.name + ': specify timeseries key')
if not self.behavior:
raise ValueError(self.name + ': specify triggering behavior')
if self.behavior is TimeSeriesData.Behavior.bursty:
if not self.rate_threshold:
raise ValueError(self.name + ': specify rate burst threshold')
if not self.window_sec:
self.window_sec = 300 # default window length is 5 minutes
if len(self.keys) > 1:
raise ValueError(self.name + ': specify only one key')
elif self.behavior is TimeSeriesData.Behavior.evaluate_expression:
if not (self.expression):
raise ValueError(self.name + ': specify evaluation expression')
else:
raise ValueError(self.name + ': trigger behavior not supported')
def __repr__(self):
ts_cond_str = "TimeSeriesCondition: " + self.name
ts_cond_str += (" statistics: " + str(self.keys))
ts_cond_str += (" behavior: " + self.behavior.name)
if self.behavior is TimeSeriesData.Behavior.bursty:
ts_cond_str += (" rate_threshold: " + str(self.rate_threshold))
ts_cond_str += (" window_sec: " + str(self.window_sec))
if self.behavior is TimeSeriesData.Behavior.evaluate_expression:
ts_cond_str += (" expression: " + self.expression)
if hasattr(self, 'aggregation_op'):
ts_cond_str += (" aggregation_op: " + self.aggregation_op.name)
if self.trigger:
ts_cond_str += (" trigger: " + str(self.trigger))
return ts_cond_str
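# Sketch (statistic name reused from rules.ini) of building a 'bursty'
# TimeSeriesCondition programmatically with the classes in this module,
# mirroring what the spec parser does for a [Condition] section.
def _make_bursty_condition_example():
    cond = Condition('write-burst-example')
    cond = TimeSeriesCondition.create(cond)
    cond.set_parameter('keys', 'dyno.flash_write_bytes_per_sec')
    cond.set_parameter('behavior', 'bursty')
    cond.set_parameter('rate_threshold', '20')
    cond.set_parameter('window_sec', '300')
    cond.perform_checks()  # validates the bursty parameters set above
    return cond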
class RulesSpec:
def __init__(self, rules_path):
self.file_path = rules_path
def initialise_fields(self):
self.rules_dict = {}
self.conditions_dict = {}
self.suggestions_dict = {}
@ -249,9 +445,13 @@ class RulesSpec:
sugg.perform_checks()
def load_rules_from_spec(self):
self.initialise_fields()
with open(self.file_path, 'r') as db_rules:
curr_section = None
for line in db_rules:
line = IniParser.remove_trailing_comment(line)
if not line:
continue
element = IniParser.get_element(line)
if element is IniParser.Element.comment:
continue
@ -277,6 +477,8 @@ class RulesSpec:
new_cond = LogCondition.create(new_cond)
elif value == 'OPTIONS':
new_cond = OptionCondition.create(new_cond)
elif value == 'TIME_SERIES':
new_cond = TimeSeriesCondition.create(new_cond)
else:
new_cond.set_parameter(key, value)
elif curr_section is IniParser.Element.sugg:
@ -291,75 +493,36 @@ class RulesSpec:
def get_suggestions_dict(self):
return self.suggestions_dict
def get_triggered_rules(self, data_sources, column_families):
self.trigger_conditions(data_sources)
triggered_rules = []
for rule in self.rules_dict.values():
if rule.is_triggered(self.conditions_dict, column_families):
triggered_rules.append(rule)
return triggered_rules
def trigger_conditions(data_sources, conditions_dict):
for source in data_sources:
def trigger_conditions(self, data_sources):
for source_type in data_sources:
cond_subset = [
cond
for cond in conditions_dict.values()
if cond.get_data_source() is source.type
for cond in self.conditions_dict.values()
if cond.get_data_source() is source_type
]
if not cond_subset:
continue
for source in data_sources[source_type]:
source.check_and_trigger_conditions(cond_subset)
def get_triggered_rules(rules_dict, conditions_dict):
triggered_rules = []
for rule in rules_dict.values():
if rule.is_triggered(conditions_dict):
triggered_rules.append(rule)
return triggered_rules
def main(args):
# Load the rules with their conditions and suggestions.
db_rules = RulesSpec(args.rules_spec)
db_rules.load_rules_from_spec()
# Perform some basic sanity checks for each section.
db_rules.perform_section_checks()
rules_dict = db_rules.get_rules_dict()
conditions_dict = db_rules.get_conditions_dict()
suggestions_dict = db_rules.get_suggestions_dict()
print()
print('RULES')
for rule in rules_dict.values():
print(repr(rule))
print()
print('CONDITIONS')
for cond in conditions_dict.values():
print(repr(cond))
print()
print('SUGGESTIONS')
for sugg in suggestions_dict.values():
print(repr(sugg))
# Initialise the data sources.
data_sources = []
data_sources.append(DatabaseOptions(args.rocksdb_options))
data_sources.append(DatabaseLogs(args.rocksdb_log_prefix))
# Initialise the ConditionChecker with the provided data sources.
trigger_conditions(data_sources, conditions_dict)
# Check for the conditions read in from the Rules spec, if triggered.
print()
triggered_rules = get_triggered_rules(rules_dict, conditions_dict)
for rule in triggered_rules:
print('Rule: ' + rule.name + ' has been triggered and:')
rule_suggestions = rule.get_suggestions()
for sugg_name in rule_suggestions:
print(suggestions_dict[sugg_name])
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='This script is used for\
gauging rocksdb performance using as input: Rocksdb LOG, OPTIONS,\
performance context, command-line statistics and statistics published\
on ODS and providing as output: suggestions to improve Rocksdb\
performance')
parser.add_argument('--rules_spec', required=True, type=str)
parser.add_argument('--rocksdb_options', required=True, type=str)
parser.add_argument('--rocksdb_log_prefix', required=True, type=str)
args = parser.parse_args()
main(args)
def print_rules(self, rules):
for rule in rules:
print('\nRule: ' + rule.name)
for cond_name in rule.conditions:
print(repr(self.conditions_dict[cond_name]))
for sugg_name in rule.suggestions:
print(repr(self.suggestions_dict[sugg_name]))
if rule.trigger_entities:
print('scope: entities:')
print(rule.trigger_entities)
if rule.trigger_column_families:
print('scope: col_fam:')
print(rule.trigger_column_families)

@ -3,24 +3,28 @@
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).
#
# This ini file is very similar to the Rocksdb ini file in terms of syntax.
# FORMAT: very similar to the Rocksdb ini file in terms of syntax
# (refer rocksdb/examples/rocksdb_option_file_example.ini)
# It is made up of multiple sections and each section is made up of multiple
# key-value pairs. Each section must have a name. The recognized sections are
# Rule, Suggestion, Condition followed by their name in "" that acts as an
# identifier. There should be at least one Rule section in the file.
#
# Each rule must be associated with at least one condition and one suggestion.
# If a Rule is associated with multiple Conditions, then all the conditions
# must be triggered in order for the Rule to be triggered.
# The suggestions don't have any ordering amongst them as of now.
# The Rules INI file is made up of multiple sections and each section is made
# up of multiple key-value pairs. The recognized section types are:
# Rule, Suggestion, Condition. Each section must have a name specified in ""
# in the section header. This name acts as an identifier in that section
# type's namespace. A section header looks like:
# [<section_type> "<section_name_identifier>"]
#
# A Condition must be associated to a data source specified by the parameter
# There should be at least one Rule section in the file with its corresponding
# Condition and Suggestion sections. A Rule is triggered only when all of its
# conditions are triggered. The order in which a Rule's conditions and
# suggestions are specified has no significance.
#
# A Condition must be associated with a data source specified by the parameter
# 'source' and this must be the first parameter specified for the Condition.
# A condition can be associated with one or more Rules.
#
# A suggestion is an advised change to a database or column_family option to
# improve the performance of the database in some way. Every suggestion is
# is associated with one or more Rules.
# A Suggestion is an advised change to a Rocksdb option to improve the
# performance of the database in some way. Every suggestion can be a part of
# one or more Rules.
[Rule "stall-too-many-memtables"]
suggestions=inc-bg-flush:inc-write-buffer
@ -29,7 +33,6 @@ conditions=stall-too-many-memtables
[Condition "stall-too-many-memtables"]
source=LOG
regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
scope=column_family
[Rule "stall-too-many-L0"]
suggestions=inc-max-subcompactions:inc-max-bg-compactions:inc-write-buffer-size:dec-max-bytes-for-level-base:inc-l0-slowdown-writes-trigger
@ -38,7 +41,6 @@ conditions=stall-too-many-L0
[Condition "stall-too-many-L0"]
source=LOG
regex=Stalling writes because we have \d+ level-0 files
scope=column_family
[Rule "stop-too-many-L0"]
suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-l0-stop-writes-trigger
@ -47,7 +49,6 @@ conditions=stop-too-many-L0
[Condition "stop-too-many-L0"]
source=LOG
regex=Stopping writes because we have \d+ level-0 files
scope=column_family
[Rule "stall-too-many-compaction-bytes"]
suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-hard-pending-compaction-bytes-limit:inc-soft-pending-compaction-bytes-limit
@ -56,11 +57,11 @@ conditions=stall-too-many-compaction-bytes
[Condition "stall-too-many-compaction-bytes"]
source=LOG
regex=Stalling writes because of estimated pending compaction bytes \d+
scope=column_family
[Suggestion "inc-bg-flush"]
option=DBOptions.max_background_flushes
action=increase
suggested_values=2
[Suggestion "inc-write-buffer"]
option=CFOptions.max_write_buffer_number
@ -73,6 +74,7 @@ action=increase
[Suggestion "inc-max-bg-compactions"]
option=DBOptions.max_background_compactions
action=increase
suggested_values=2
[Suggestion "inc-write-buffer-size"]
option=CFOptions.write_buffer_size
@ -100,12 +102,113 @@ action=increase
[Rule "level0-level1-ratio"]
conditions=level0-level1-ratio
suggestions=l0-l1-ratio-health-check
suggestions=inc-base-max-bytes
[Condition "level0-level1-ratio"]
source=OPTIONS
options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base
evaluate=int(options[0])*int(options[1])-int(options[2])<(-251659456) # should evaluate to a boolean
evaluate=int(options[0])*int(options[1])-int(options[2])>=1 # should evaluate to a boolean, condition triggered if evaluates to true
[Suggestion "inc-base-max-bytes"]
option=CFOptions.max_bytes_for_level_base
action=increase
[Rule "tuning-iostat-burst"]
conditions=large-db-get-p99
suggestions=bytes-per-sync-non0:wal-bytes-per-sync-non0:set-rate-limiter
#overlap_time_period=10m
[Condition "write-burst"]
source=TIME_SERIES
keys=dyno.flash_write_bytes_per_sec
behavior=bursty
window_sec=300 # a smaller window makes the rate calculation more sensitive to changes in the time series, so rate_threshold should then be set higher; with a 60s window this is equivalent to the diff(%) transform
rate_threshold=20
[Condition "large-p99-read-latency"]
source=TIME_SERIES
keys=[]rocksdb.read.block.get.micros.p99
behavior=bursty
window_sec=300
rate_threshold=10
[Condition "large-db-get-p99"]
source=TIME_SERIES
keys=[]rocksdb.db.get.micros.p50:[]rocksdb.db.get.micros.p99
behavior=evaluate_expression
evaluate=(keys[1]/keys[0])>5
[Suggestion "bytes-per-sync-non0"]
option=DBOptions.bytes_per_sync
action=set
suggested_values=1048576
[Suggestion "wal-bytes-per-sync-non0"]
option=DBOptions.wal_bytes_per_sync
action=set
suggested_values=1048576
[Suggestion "set-rate-limiter"]
option=rate_limiter_bytes_per_sec
action=set
suggested_values=1024000
[Rule "bloom-filter-percent-useful"]
conditions=bloom-filter-percent-useful
suggestions=inc-bloom-bits-per-key
[Condition "bloom-filter-percent-useful"]
source=TIME_SERIES
keys=[]rocksdb.bloom.filter.useful.count:[]rocksdb.bloom.filter.full.positive.count:[]rocksdb.bloom.filter.full.true.positive.count
behavior=evaluate_expression
evaluate=((keys[0]+keys[2])/(keys[0]+keys[1]))<0.9 # should evaluate to a boolean
aggregation_op=latest
[Rule "bloom-not-enabled"]
conditions=bloom-not-enabled
suggestions=inc-bloom-bits-per-key
[Condition "bloom-not-enabled"]
source=TIME_SERIES
keys=[]rocksdb.bloom.filter.useful.count:[]rocksdb.bloom.filter.full.positive.count:[]rocksdb.bloom.filter.full.true.positive.count
behavior=evaluate_expression
evaluate=keys[0]+keys[1]+keys[2]==0
aggregation_op=avg
[Suggestion "inc-bloom-bits-per-key"]
option=bloom_bits
action=increase
suggested_values=2
[Rule "small-l0-files"]
conditions=small-l0-files
suggestions=dec-max-bytes-for-level-base:inc-write-buffer-size
[Condition "small-l0-files"]
source=OPTIONS
options=CFOptions.max_bytes_for_level_base:CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size
evaluate=int(options[0])>(10*int(options[1])*int(options[2]))
[Rule "decompress-time-long"]
conditions=decompress-time-long
suggestions=dec-block-size:inc-block-cache-size:faster-compression-type
[Condition "decompress-time-long"]
source=TIME_SERIES
keys=block_decompress_time:block_read_time:block_checksum_time
behavior=evaluate_expression
evaluate=(keys[0]/(keys[0]+keys[1]+keys[2]))>0.3
[Suggestion "dec-block-size"]
option=TableOptions.BlockBasedTable.block_size
action=decrease
[Suggestion "inc-block-cache-size"]
option=cache_size
action=increase
suggested_values=16000000
[Suggestion "l0-l1-ratio-health-check"]
description='modify options such that (level0_file_num_compaction_trigger * write_buffer_size - max_bytes_for_level_base < 5) is satisfied'
[Suggestion "faster-compression-type"]
option=CFOptions.compression
action=set
suggested_values=kLZ4Compression

@ -23,3 +23,8 @@
2018/05/25-14:34:21.048592 7f82bd676200 [DEBUG] [db/db_impl_files.cc:261] [JOB 45] Delete /tmp/rocksdbtest-155919/dbbench/000084.sst type=2 #84 -- OK
2018/05/25-14:34:21.048603 7f82bd676200 EVENT_LOG_v1 {"time_micros": 1527284061048600, "job": 45, "event": "table_file_deletion", "file_number": 84}
2018/05/25-14:34:21.048981 7f82bd676200 [db/db_impl.cc:398] Shutdown complete
2018/05/25-14:34:21.049000 7f82bd676200 [db/db_impl.cc:563] [col-fam-A] random log message for testing
2018/05/25-14:34:21.049010 7f82bd676200 [db/db_impl.cc:234] [col-fam-B] log continuing on next line
remaining part of the log
2018/05/25-14:34:21.049020 7f82bd676200 [db/db_impl.cc:653] [col-fam-A] another random log message
2018/05/25-14:34:21.049025 7f82bd676200 [db/db_impl.cc:331] [unknown] random log message no column family

@ -5,7 +5,6 @@ conditions=missing-source
[Condition "normal-rule"]
source=LOG
regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
scope=column_family
[Suggestion "inc-bg-flush"]
option=DBOptions.max_background_flushes
@ -43,7 +42,6 @@ conditions=missing-regex
[Condition "missing-regex"]
source=LOG
regex=
scope=column_family
[Suggestion "missing-option"]
option=

@ -5,7 +5,6 @@ conditions=missing-source
[Condition "missing-source"]
source=
regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
scope=column_family
[Suggestion "inc-bg-flush"]
option=DBOptions.max_background_flushes

@ -5,7 +5,6 @@ conditions=missing-source
[Condition "normal-condition"]
source=LOG
regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
scope=column_family
[Suggestion "missing-action"]
option=DBOptions.max_background_flushes

@ -5,7 +5,6 @@ conditions=missing-source
[Condition "normal-condition"]
source=LOG
regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
scope=column_family
[Suggestion "inc-bg-flush"]
option=DBOptions.max_background_flushes

@ -17,22 +17,18 @@ conditions=log-4-false:options-1-false
[Condition "log-1-true"]
source=LOG
regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
scope=column_family
[Condition "log-2-true"]
source=LOG
regex=Stalling writes because we have \d+ level-0 files
scope=column_family
[Condition "log-3-true"]
source=LOG
regex=Stopping writes because we have \d+ level-0 files
scope=column_family
[Condition "log-4-false"]
source=LOG
regex=Stalling writes because of estimated pending compaction bytes \d+
scope=column_family
[Condition "options-1-false"]
source=OPTIONS

@ -0,0 +1,83 @@
[Rule "stall-too-many-memtables"]
suggestions=inc-bg-flush:inc-write-buffer
conditions=stall-too-many-memtables
[Condition "stall-too-many-memtables"]
source=LOG
regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
[Rule "stall-too-many-L0"]
suggestions=inc-max-subcompactions:inc-max-bg-compactions:inc-write-buffer-size:dec-max-bytes-for-level-base:inc-l0-slowdown-writes-trigger
conditions=stall-too-many-L0
[Condition "stall-too-many-L0"]
source=LOG
regex=Stalling writes because we have \d+ level-0 files
[Rule "stop-too-many-L0"]
suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-l0-stop-writes-trigger
conditions=stop-too-many-L0
[Condition "stop-too-many-L0"]
source=LOG
regex=Stopping writes because we have \d+ level-0 files
[Rule "stall-too-many-compaction-bytes"]
suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-hard-pending-compaction-bytes-limit:inc-soft-pending-compaction-bytes-limit
conditions=stall-too-many-compaction-bytes
[Condition "stall-too-many-compaction-bytes"]
source=LOG
regex=Stalling writes because of estimated pending compaction bytes \d+
[Suggestion "inc-bg-flush"]
option=DBOptions.max_background_flushes
action=increase
[Suggestion "inc-write-buffer"]
option=CFOptions.max_write_buffer_number
action=increase
[Suggestion "inc-max-subcompactions"]
option=DBOptions.max_subcompactions
action=increase
[Suggestion "inc-max-bg-compactions"]
option=DBOptions.max_background_compactions
action=increase
[Suggestion "inc-write-buffer-size"]
option=CFOptions.write_buffer_size
action=increase
[Suggestion "dec-max-bytes-for-level-base"]
option=CFOptions.max_bytes_for_level_base
action=decrease
[Suggestion "inc-l0-slowdown-writes-trigger"]
option=CFOptions.level0_slowdown_writes_trigger
action=increase
[Suggestion "inc-l0-stop-writes-trigger"]
option=CFOptions.level0_stop_writes_trigger
action=increase
[Suggestion "inc-hard-pending-compaction-bytes-limit"]
option=CFOptions.hard_pending_compaction_bytes_limit
action=increase
[Suggestion "inc-soft-pending-compaction-bytes-limit"]
option=CFOptions.soft_pending_compaction_bytes_limit
action=increase
[Rule "level0-level1-ratio"]
conditions=level0-level1-ratio
suggestions=l0-l1-ratio-health-check
[Condition "level0-level1-ratio"]
source=OPTIONS
options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base
evaluate=int(options[0])*int(options[1])-int(options[2])>=-268173312 # should evaluate to a boolean, condition triggered if evaluates to true
[Suggestion "l0-l1-ratio-health-check"]
description='modify options such that (level0_file_num_compaction_trigger * write_buffer_size - max_bytes_for_level_base < -268173312) is satisfied'

@ -0,0 +1,98 @@
from advisor.db_log_parser import DatabaseLogs, Log, NO_COL_FAMILY
from advisor.rule_parser import Condition, LogCondition
import os
import unittest
class TestLog(unittest.TestCase):
def setUp(self):
self.column_families = ['default', 'col_fam_A']
def test_get_column_family(self):
test_log = (
"2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] " +
"[col_fam_A] [JOB 44] Level-0 flush table #84: 1890780 bytes OK"
)
db_log = Log(test_log, self.column_families)
self.assertEqual('col_fam_A', db_log.get_column_family())
test_log = (
"2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] " +
"[JOB 44] Level-0 flush table #84: 1890780 bytes OK"
)
db_log = Log(test_log, self.column_families)
db_log.append_message('[default] some remaining part of log')
self.assertEqual(NO_COL_FAMILY, db_log.get_column_family())
def test_get_methods(self):
hr_time = "2018/05/25-14:30:25.491635"
context = "7f82ba72e700"
message = (
"[db/flush_job.cc:331] [default] [JOB 10] Level-0 flush table " +
"#23: started"
)
test_log = hr_time + " " + context + " " + message
db_log = Log(test_log, self.column_families)
self.assertEqual(db_log.get_message(), message)
remaining_message = "[col_fam_A] some more logs"
db_log.append_message(remaining_message)
self.assertEqual(
db_log.get_human_readable_time(), "2018/05/25-14:30:25.491635"
)
self.assertEqual(db_log.get_context(), "7f82ba72e700")
self.assertEqual(db_log.get_timestamp(), 1527258625)
self.assertEqual(
db_log.get_message(), str(message + '\n' + remaining_message)
)
def test_is_new_log(self):
new_log = "2018/05/25-14:34:21.047233 context random new log"
remaining_log = "2018/05/25 not really a new log"
self.assertTrue(Log.is_new_log(new_log))
self.assertFalse(Log.is_new_log(remaining_log))
class TestDatabaseLogs(unittest.TestCase):
def test_check_and_trigger_conditions(self):
this_path = os.path.abspath(os.path.dirname(__file__))
logs_path_prefix = os.path.join(this_path, 'input_files/LOG-0')
column_families = ['default', 'col-fam-A', 'col-fam-B']
db_logs = DatabaseLogs(logs_path_prefix, column_families)
# matches, has 2 col_fams
condition1 = LogCondition.create(Condition('cond-A'))
condition1.set_parameter('regex', 'random log message')
# matches, multiple lines message
condition2 = LogCondition.create(Condition('cond-B'))
condition2.set_parameter('regex', 'continuing on next line')
# does not match
condition3 = LogCondition.create(Condition('cond-C'))
condition3.set_parameter('regex', 'this should match no log')
db_logs.check_and_trigger_conditions(
[condition1, condition2, condition3]
)
cond1_trigger = condition1.get_trigger()
self.assertEqual(2, len(cond1_trigger.keys()))
self.assertSetEqual(
{'col-fam-A', NO_COL_FAMILY}, set(cond1_trigger.keys())
)
self.assertEqual(2, len(cond1_trigger['col-fam-A']))
messages = [
"[db/db_impl.cc:563] [col-fam-A] random log message for testing",
"[db/db_impl.cc:653] [col-fam-A] another random log message"
]
self.assertIn(cond1_trigger['col-fam-A'][0].get_message(), messages)
self.assertIn(cond1_trigger['col-fam-A'][1].get_message(), messages)
self.assertEqual(1, len(cond1_trigger[NO_COL_FAMILY]))
self.assertEqual(
cond1_trigger[NO_COL_FAMILY][0].get_message(),
"[db/db_impl.cc:331] [unknown] random log message no column family"
)
cond2_trigger = condition2.get_trigger()
self.assertEqual(['col-fam-B'], list(cond2_trigger.keys()))
self.assertEqual(1, len(cond2_trigger['col-fam-B']))
self.assertEqual(
cond2_trigger['col-fam-B'][0].get_message(),
"[db/db_impl.cc:234] [col-fam-B] log continuing on next line\n" +
"remaining part of the log"
)
self.assertIsNone(condition3.get_trigger())

@ -5,8 +5,9 @@
import os
import unittest
from advisor.rule_parser import RulesSpec, DatabaseLogs, DatabaseOptions
from advisor.rule_parser import get_triggered_rules, trigger_conditions
from advisor.rule_parser import RulesSpec
from advisor.db_log_parser import DatabaseLogs, DataSource
from advisor.db_options_parser import DatabaseOptions
RuleToSuggestions = {
"stall-too-many-memtables": [
@ -41,16 +42,17 @@ class TestAllRulesTriggered(unittest.TestCase):
def setUp(self):
# load the Rules
this_path = os.path.abspath(os.path.dirname(__file__))
ini_path = os.path.join(this_path, '../advisor/rules.ini')
ini_path = os.path.join(this_path, 'input_files/triggered_rules.ini')
self.db_rules = RulesSpec(ini_path)
self.db_rules.load_rules_from_spec()
self.db_rules.perform_section_checks()
# load the data sources: LOG and OPTIONS
log_path = os.path.join(this_path, 'input_files/LOG-0')
options_path = os.path.join(this_path, 'input_files/OPTIONS-000005')
self.data_sources = []
self.data_sources.append(DatabaseOptions(options_path))
self.data_sources.append(DatabaseLogs(log_path))
db_options_parser = DatabaseOptions(options_path)
self.column_families = db_options_parser.get_column_families()
db_logs_parser = DatabaseLogs(log_path, self.column_families)
self.data_sources = [db_options_parser, db_logs_parser]
def test_triggered_conditions(self):
conditions_dict = self.db_rules.get_conditions_dict()
@ -59,18 +61,25 @@ class TestAllRulesTriggered(unittest.TestCase):
for cond in conditions_dict.values():
self.assertFalse(cond.is_triggered(), repr(cond))
for rule in rules_dict.values():
self.assertFalse(rule.is_triggered(conditions_dict), repr(rule))
self.assertFalse(
rule.is_triggered(conditions_dict, self.column_families),
repr(rule)
)
# Trigger the conditions as per the data sources.
trigger_conditions(self.data_sources, conditions_dict)
# # Trigger the conditions as per the data sources.
# trigger_conditions(, conditions_dict)
# Get the set of rules that have been triggered
triggered_rules = self.db_rules.get_triggered_rules(
self.data_sources, self.column_families
)
# Make sure each condition and rule is triggered
for cond in conditions_dict.values():
if cond.get_data_source() is DataSource.Type.TIME_SERIES:
continue
self.assertTrue(cond.is_triggered(), repr(cond))
# Get the set of rules that have been triggered
triggered_rules = get_triggered_rules(rules_dict, conditions_dict)
for rule in rules_dict.values():
self.assertIn(rule, triggered_rules)
# Check the suggestions made by the triggered rules
@ -94,9 +103,10 @@ class TestConditionsConjunctions(unittest.TestCase):
# load the data sources: LOG and OPTIONS
log_path = os.path.join(this_path, 'input_files/LOG-1')
options_path = os.path.join(this_path, 'input_files/OPTIONS-000005')
self.data_sources = []
self.data_sources.append(DatabaseOptions(options_path))
self.data_sources.append(DatabaseLogs(log_path))
db_options_parser = DatabaseOptions(options_path)
self.column_families = db_options_parser.get_column_families()
db_logs_parser = DatabaseLogs(log_path, self.column_families)
self.data_sources = [db_options_parser, db_logs_parser]
def test_condition_conjunctions(self):
conditions_dict = self.db_rules.get_conditions_dict()
@ -105,10 +115,13 @@ class TestConditionsConjunctions(unittest.TestCase):
for cond in conditions_dict.values():
self.assertFalse(cond.is_triggered(), repr(cond))
for rule in rules_dict.values():
self.assertFalse(rule.is_triggered(conditions_dict), repr(rule))
self.assertFalse(
rule.is_triggered(conditions_dict, self.column_families),
repr(rule)
)
# Trigger the conditions as per the data sources.
trigger_conditions(self.data_sources, conditions_dict)
self.db_rules.trigger_conditions(self.data_sources)
# Check for the conditions
conds_triggered = ['log-1-true', 'log-2-true', 'log-3-true']
@ -125,14 +138,16 @@ class TestConditionsConjunctions(unittest.TestCase):
'multiple-conds-one-false',
'multiple-conds-all-false'
]
for rule in rules_triggered:
for rule_name in rules_triggered:
rule = rules_dict[rule_name]
self.assertTrue(
rules_dict[rule].is_triggered(conditions_dict),
rule.is_triggered(conditions_dict, self.column_families),
repr(rule)
)
for rule in rules_not_triggered:
for rule_name in rules_not_triggered:
rule = rules_dict[rule_name]
self.assertFalse(
rules_dict[rule].is_triggered(conditions_dict),
rule.is_triggered(conditions_dict, self.column_families),
repr(rule)
)
@ -191,7 +206,7 @@ class TestParsingErrors(unittest.TestCase):
ini_path = os.path.join(self.this_path, 'input_files/rules_err2.ini')
db_rules = RulesSpec(ini_path)
regex = '.*provide source for condition.*'
with self.assertRaisesRegex(ValueError, regex):
with self.assertRaisesRegex(NotImplementedError, regex):
db_rules.load_rules_from_spec()
def test_suggestion_missing_action(self):
@ -204,7 +219,7 @@ class TestParsingErrors(unittest.TestCase):
def test_section_no_name(self):
ini_path = os.path.join(self.this_path, 'input_files/rules_err4.ini')
db_rules = RulesSpec(ini_path)
regex = 'Parsing error: section header be like:.*'
regex = 'Parsing error: needed section header:.*'
with self.assertRaisesRegex(ValueError, regex):
db_rules.load_rules_from_spec()
