Summary: rocksdb can use more memory than asked to. Monitor and report. Test Plan: run the program with conditions to simulate the overusage. It should report that the process is using more memory than needed. Reviewers: yhchiang, rven, sdong, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D33249 main
parent
5f00af4570
commit
5d1151deba
@ -0,0 +1,102 @@ |
||||
#!/bin/bash |
||||
# |
||||
#(c) 2004-present, Facebook Inc. All rights reserved. |
||||
# |
||||
#see LICENSE file for more information on use/redistribution rights. |
||||
# |
||||
|
||||
# |
||||
#dbench_monitor: monitor db_bench process for violation of memory utilization |
||||
# |
||||
#default usage will monitor 'virtual memory size'. See below for standard options |
||||
#passed to db_bench during this test. |
||||
# |
||||
# See also: ./pflag for the actual monitoring script that does the work |
||||
# |
||||
#NOTE: |
||||
# You may end up with some /tmp/ files if db_bench OR |
||||
# this script OR ./pflag was killed unceremoniously |
||||
# |
||||
# If you see the script taking a long time, trying "kill" |
||||
# will usually cleanly exit. |
||||
# |
||||
# |
||||
# Resolve helper paths relative to this script so it can be started from any
# working directory (quoted so paths containing spaces survive).
DIR=$(dirname "$0")
# db_bench output is captured in this file. mktemp creates it atomically under
# an unpredictable name instead of the guessable /tmp/<script>.<pid>.
LOG=$(mktemp "/tmp/$(basename "$0").XXXXXX") || exit 1
DB_BENCH="$DIR/../db_bench"
PFLAG="${DIR}/pflag"
||||
|
||||
# Print the help text for this wrapper on stdout, then terminate the script
# with the exit status of the cat that printed it.
usage() {
  cat <<HELP

Usage: $0 [-h]

-h: prints this help message

This program will run the db_bench script to monitor memory usage
using the 'pflag' program. It launches db_bench with default settings
for certain arguments. You can change the defaults passed to
'db_bench' program, by setting the following environment
variables:

bs [block_size]
ztype [compression_type]
benches [benchmarks]
reads [reads]
threads [threads]
cs [cache_size]
vsize [value_size]
comp [compression_ratio]
num [num]

See the code for more info

HELP
  exit
}
||||
|
||||
# Serve -h first so the help text is available even when db_bench has not
# been built yet (the executable check below would otherwise abort first).
[ "x$1" = "x-h" ] && usage

if [ ! -x "${DB_BENCH}" ]; then
  echo "WARNING: ${DB_BENCH} doesn't exist, abort!" >&2
  exit 255 # same status the previous `exit -1` produced in bash
fi

# On HUP/INT/QUIT/TERM: remove the log, stop db_bench if it has already been
# started (PID is still empty before the launch), report, and really exit
# instead of falling through to the rest of the script.
trap 'rm -f "${LOG}"; [ -n "${PID}" ] && kill "${PID}"; echo "Interrupted, exiting"; exit 1' 1 2 3 15

touch "$LOG"
||||
|
||||
# Default db_bench tunables. A value already present in the environment wins;
# only unset or empty variables receive the default shown here.
bs=${bs:-16384}
ztype=${ztype:-zlib}
benches=${benches:-readwhilewriting}
reads=${reads:-$((1 * 1024 * 1024))}
threads=${threads:-8}
vsize=${vsize:-2000}
comp=${comp:-0.5}
num=${num:-10000}
cs=${cs:-$((1 * 1024 * 1024 * 1024))}
||||
|
||||
DEBUG=1 # Set to 0 (or empty) to remove chattiness

# Build the db_bench command line once, so the debug echo and the real launch
# below can never drift apart.
db_bench_args=(
  --block_size="$bs"
  --compression_type="$ztype"
  --benchmarks="$benches"
  --reads="$reads"
  --threads="$threads"
  --cache_size="$cs"
  --value_size="$vsize"
  --compression_ratio="$comp"
  --num="$num"
  --use_existing_db
)

# Honour the "set to 0" promise above: both empty and 0 mean quiet.
if [ -n "$DEBUG" ] && [ "$DEBUG" != "0" ]; then
  #
  # NOTE: under some circumstances, --use_existing_db may leave LOCK files
  # under ${TMPDIR}/rocksdb/*; clean up the dir and re-run.
  #
  echo "DEBUG: Will run $DB_BENCH ${db_bench_args[*]}"
fi

"$DB_BENCH" "${db_bench_args[@]}" >"$LOG" 2>&1 &
PID=$!

# `$?` right after `&` is always 0, so it cannot tell whether db_bench really
# started. Probe the process instead; if it is already gone, `wait` yields its
# exit status, which separates a failed launch from a very quick, clean run.
if ! kill -0 "$PID" 2>/dev/null; then
  if ! wait "$PID"; then
    echo "WARNING: ${DB_BENCH} did not launch successfully! Abort!" >&2
    rm -f -- "$LOG"
    exit 1
  fi
fi

#
# Start the monitoring. pflag is invoked with only -p and -v, so the metric
# and the limit are whatever pflag defaults to. Try 'pflag -h' for the options
# that select RSS or CPU time (bsdtime) instead.
#
"${PFLAG}" -p "$PID" -v

rm -f -- "$LOG"
@ -0,0 +1,217 @@ |
||||
#!/bin/bash |
||||
# |
||||
#(c) 2004-present, Facebook, all rights reserved. |
||||
# See the LICENSE file for usage and distribution rights. |
||||
# |
||||
|
||||
# Stop on HUP/INT/QUIT/TERM instead of carrying on with the monitoring loop.
trap 'echo "Caught exception, dying"; exit' 1 2 3 15

ME=$(basename "$0")
SERVER=$(hostname)

# Parameters used.
#
# Everything is reset here so that nothing leaks in from the caller's
# environment. PID in particular must start empty, otherwise an inherited
# value would satisfy the "PID is mandatory" check.
Dump_Config=0
DEBUG=
# Look uname up through PATH; it does not live in /bin on every system, and a
# failed lookup would hide the real OS name from the unsupported-OS message.
OS=$(uname -s)
VMEM=
RSS=
CPU=
VERBOSE=
VAR=
LIMIT=
ACTION=
N=
WAIT=
PID=
||||
|
||||
# |
||||
#supported OS: Linux only for now. Easy to add |
||||
# |
||||
# Set VMEM, RSS and CPU to the ps(1) column names used on this platform.
# Only Linux is recognised; any other value of OS is handed to die.
oscheck() {
  if [ "${OS}" = "Linux" ]; then
    VMEM=vsz
    RSS=rss
    CPU=bsdtime
  else
    die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
  fi
}
||||
|
||||
|
||||
# Echo the arguments to stderr, but only when DEBUG is non-empty
# (the -v option sets it); otherwise do nothing and succeed.
verbose() {
  [ -z "$DEBUG" ] && return 0
  echo "$@" >&2
}
||||
|
||||
# Echo the arguments to stderr unconditionally.
warn() {
  >&2 echo "$@"
}
||||
|
||||
# Report a fatal error on stderr and terminate the script.
# A bare `exit` would hand back the status of the preceding echo (0), making
# a fatal error look like success to the caller, so exit 1 explicitly.
die() {
  echo "ERROR: " "$@" >&2
  exit 1
}
||||
|
||||
# Print the effective configuration of this run (script name, host, current
# date and every monitoring setting) on stdout.
dump_config() {
  local stamp
  stamp=$(date)
  cat <<EOCONFIG
$ME running on ${HOSTNAME} at ${stamp}

Configuration for this run:
PID to monitor : ${PID}
Resource monitored : ${VAR}
Resource limit : ${LIMIT}
Check every : ${WAIT} seconds
No. of times run : ${N}
What to do : ${ACTION}
EOCONFIG
}
||||
|
||||
# Print the help text and terminate the script.
# Arguments: optional error message, shown above the help text.
# With no message (plain -h) the text goes to stdout and the exit status is 0.
# With a message the call reports a usage error: the text goes to stderr and
# the exit status is 1, so callers can detect the failure.
usage() {
  local status=0 out_fd=1
  if [ $# -gt 0 ]; then
    status=1
    out_fd=2
  fi
  cat >&"$out_fd" <<USAGE
$*

Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait]

Monitor a process for set of violations. Options:

-p: PID of process to monitor

-x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM

-l: what is the threshold/limit for the metric that is being sensed.
Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
NOTE: defaults to 1GB

-a: action. Currently {warn|die|kill} are supported.
The default action is to 'warn'. Here is the behavior:

warn: complain if usage exceeds threshold, but continue monitoring
kill: complain, kill the db_bench process and exit
die: if usage exceeds threshold, die immediately

-n: number of cycles to monitor. Default is to monitor until PID no longer exists.

-w: wait time per cycle of monitoring. Default is 5 seconds.

-v: verbose messaging

-d: print the configuration for this run and exit

-h: print this help message

USAGE
  exit "$status"
}
||||
|
||||
# Fill in every setting that is still empty with its default:
# metric vsz, limit 1024000, 5 second wait, 999999 cycles, action warn.
set_defaults_if_noopt_given() {
  VAR=${VAR:-vsz}
  LIMIT=${LIMIT:-1024000}
  WAIT=${WAIT:-5}
  N=${N:-999999}
  ACTION=${ACTION:-warn}
}
||||
|
||||
# Sanity-check the parsed options; any problem is reported through usage.
#  - a PID is required unless only the configuration dump was requested;
#  - the symbolic metric names the help text advertises (VMEM, RSS, CPU) are
#    translated to the ps(1) column names held in the variables of the same
#    name, falling back to the Linux names if those are empty. Other values
#    are left untouched and used as ps columns as-is;
#  - the action must be one the monitoring loop implements.
validate_options() {
  if [ -z "$PID" ] && [ "$Dump_Config" -ne 1 ]; then
    usage "PID is mandatory"
  fi

  case "$VAR" in
    VMEM) VAR=${VMEM:-vsz} ;;
    RSS) VAR=${RSS:-rss} ;;
    CPU) VAR=${CPU:-bsdtime} ;;
  esac

  case "$ACTION" in
    warn | die | kill | "") ;;
    *) usage "Unknown action '${ACTION}' (expected warn, die or kill)" ;;
  esac
}
||||
|
||||
###### START |
||||
|
||||
|
||||
# Parse the command line into the global settings (Dump_Config, ACTION,
# DEBUG, PID, VAR, LIMIT, WAIT, N).
# The option string now really declares -w: it previously listed "t:" while
# the handler was "w)", so the documented -w was rejected as unknown. -t is
# still accepted as an alias because earlier versions parsed it.
# Unknown options and options missing their argument are reported via usage.
parse_options() {
  local opt OPTIND=1
  while getopts ":p:x:l:a:n:w:t:vhd" opt; do
    case $opt in
      d) Dump_Config=1 ;;
      h) usage ;;
      a) ACTION=${OPTARG} ;;
      v) DEBUG=1 ;;
      p) PID=$OPTARG ;;
      x) VAR=$OPTARG ;;
      l) LIMIT=$OPTARG ;;
      w | t) WAIT=$OPTARG ;;
      n) N=$OPTARG ;;
      :) usage "Option -${OPTARG} requires an argument" ;;
      \?) usage "Unknown option -${OPTARG}" ;;
    esac
  done
}

parse_options "$@"
||||
|
||||
# Resolve the platform settings, fill in defaults, then validate the options.
oscheck
set_defaults_if_noopt_given
validate_options

# With -d, print the effective configuration and stop without monitoring.
[ "${Dump_Config}" -eq 1 ] && {
  dump_config
  exit
}
||||
|
||||
# Convert a ps(1) reading or a user-supplied limit into a plain integer so the
# two can be compared with the shell's integer tests.
#   plain digits   -> unchanged
#   <num>m / <num>g -> multiplied by 1024 / 1024*1024 (same factors the
#                      previous perl filter applied to ps output)
#   M:SS           -> seconds (presumably the bsdtime layout - confirm
#                      against the local ps(1) manual)
# Anything else prints nothing, which callers treat as "no value".
normalize_metric() {
  printf '%s\n' "$1" | awk '
    { v = $1 }
    v ~ /^[0-9]+$/ { print v; next }
    v ~ /^[0-9]+:[0-9]+$/ { split(v, t, ":"); printf "%d\n", t[1] * 60 + t[2]; next }
    v ~ /^[0-9]*[.]?[0-9]+[mM]$/ { printf "%d\n", substr(v, 1, length(v) - 1) * 1024; next }
    v ~ /^[0-9]*[.]?[0-9]+[gG]$/ { printf "%d\n", substr(v, 1, length(v) - 1) * 1024 * 1024; next }
  '
}

# The help text advertises limits such as 100m, 1.5g or 5:04, so normalise
# the limit once up front. Messages keep showing the value as the user gave it.
limit_value=$(normalize_metric "${LIMIT:-1024000}")
if [ -z "$limit_value" ]; then
  warn "ERROR: cannot interpret limit '${LIMIT}'"
  exit 1
fi

max_cycles=${N:-999999}
cycle=0

verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"

# One pass per cycle, at most max_cycles passes (the -n option).
while [ "$cycle" -lt "$max_cycles" ]; do
  cycle=$((cycle + 1))

  VAL=$(normalize_metric "$(ps h -p "$PID" -o "${VAR}")")

  # No reading (or a zero reading) is taken to mean the process is gone.
  if [ "${VAL:-0}" -eq 0 ]; then
    warn "Process $PID ended without incident."
    break
  fi

  if [ "$VAL" -lt "$limit_value" ]; then
    echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}"
    sleep "${WAIT:-5}"
    continue
  fi

  # Threshold reached: act as requested.
  case "$ACTION" in
    kill)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}; killing PID ${PID}"
      kill "${PID}" || kill -3 "${PID}"
      exit
      ;;
    warn)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}"
      # Keep monitoring, but pace the loop; without this sleep a breached
      # limit made the script spin and flood stderr.
      sleep "${WAIT:-5}"
      ;;
    die)
      warn "WARNING: dying without killing process ${PID} on ${SERVER}"
      warn "The process details are below: "
      warn "$(ps -p "${PID}" -o pid,ppid,bsdtime,rss,vsz,cmd,args)"
      warn ""

      #should we send email/notify someone? TODO... for now, bail.

      exit 255 # same status the previous `exit -1` produced in bash
      ;;
    *)
      # Any other action stops the monitoring, as before.
      break
      ;;
  esac
done
||||
|
Loading…
Reference in new issue