Summary: RocksDB can use more memory than it was asked to. Monitor and report. Test Plan: run the program with conditions that simulate the overuse. It should report that the process is using more memory than needed. Reviewers: yhchiang, rven, sdong, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D33249
main
parent
5f00af4570
commit
5d1151deba
@ -0,0 +1,102 @@ |
|||||||
|
#!/bin/bash
#
#(c) 2004-present, Facebook Inc. All rights reserved.
#
#see LICENSE file for more information on use/redistribution rights.
#

#
#dbench_monitor: monitor db_bench process for violation of memory utilization
#
#default usage will monitor 'virtual memory size'. See below for standard options
#passed to db_bench during this test.
#
# See also: ./pflag for the actual monitoring script that does the work
#
#NOTE:
#  You may end up with some /tmp/ files if db_bench OR
#  this script OR ./pflag was killed unceremoniously
#
#  If you see the script taking a long time, try "kill";
#  it will usually exit cleanly.
#
#
# Paths used by the rest of the script, all derived from where this script
# lives. Expansions are quoted so a checkout path containing spaces works.
DIR=$(dirname "$0")            # directory containing this script
# Scratch file that receives db_bench's stdout/stderr. The name embeds this
# shell's PID ($$) so that concurrent runs do not share a log.
LOG=/tmp/$(basename "$0").$$
DB_BENCH="$DIR/../db_bench"    # benchmark binary, expected one level up
PFLAG="${DIR}/pflag"           # sibling script that does the monitoring

# usage: print the help text on stdout and terminate the script.
# Reads $0 for the program name. The exit status is that of `cat`.
usage() {
  cat <<HELP; exit

Usage: $0 [-h]

-h: prints this help message

This program will run the db_bench script to monitor memory usage
using the 'pflag' program. It launches db_bench with default settings
for certain arguments. You can change the defaults passed to
'db_bench' program, by setting the following environment
variables:

  bs [block_size]
  ztype [compression_type]
  benches [benchmarks]
  reads [reads]
  threads [threads]
  cs [cache_size]
  vsize [value_size]
  comp [compression_ratio]
  num [num]

See the code for more info

HELP
}

# Show help first so that -h works even when db_bench has not been built.
[ "x$1" = "x-h" ] && usage

if [ ! -x "${DB_BENCH}" ]; then
  echo "WARNING: ${DB_BENCH} doesn't exist, abort!" >&2
  exit 1
fi

PID=   # PID of the background db_bench; empty until it has been launched

# cleanup: remove the scratch log and stop db_bench if it was started.
cleanup() {
  rm -f -- "${LOG}"
  if [ -n "${PID}" ]; then
    kill "${PID}" 2>/dev/null
  fi
}

# On HUP/INT/QUIT/TERM: clean up and really leave. Without the explicit exit
# the script would carry on after the handler returned.
trap 'cleanup; echo "Interrupted, exiting"; exit 1' 1 2 3 15

touch "$LOG"

# Defaults for the db_bench arguments; each can be overridden from the
# environment (see usage).
: "${bs:=16384}"
: "${ztype:=zlib}"
: "${benches:=readwhilewriting}"
: "${reads:=$((1*1024*1024))}"
: "${threads:=8}"
: "${vsize:=2000}"
: "${comp:=0.5}"
: "${num:=10000}"
: "${cs:=$((1*1024*1024*1024))}"

DEBUG=1 #Set to 0 to remove chattiness

#
#NOTE: under some circumstances, --use_existing_db may leave LOCK files under ${TMPDIR}/rocksdb/*
#cleanup the dir and re-run
#
# The command line is built once so the debug message and the real invocation
# cannot drift apart.
bench_cmd=(
  "$DB_BENCH"
  --block_size="$bs"
  --compression_type="$ztype"
  --benchmarks="$benches"
  --reads="$reads"
  --threads="$threads"
  --cache_size="$cs"
  --value_size="$vsize"
  --compression_ratio="$comp"
  --num="$num"
  --use_existing_db
)

# Both an empty DEBUG and DEBUG=0 turn the message off.
if [ "${DEBUG:-0}" != "0" ]; then
  echo "DEBUG: Will run ${bench_cmd[*]}"
fi

"${bench_cmd[@]}" >"$LOG" 2>&1 &
# $? after '&' is always 0, so it says nothing about whether db_bench started;
# capture the PID immediately and check the real exit status further down.
PID=$!

#
#Start the monitoring. pflag's defaults apply: "vsz" (virtual memory size)
#against pflag's default limit.
#You could also monitor RSS and CPUTIME (bsdtime). Try 'pflag -h' for how to do this
#
"${PFLAG}" -p "$PID" -v

# If db_bench is gone by the time the monitor returns, collect its exit status
# and surface a failure before the log (the only record of why) is removed.
if ! kill -0 "$PID" 2>/dev/null; then
  wait "$PID"
  bench_status=$?
  if [ "$bench_status" -ne 0 ]; then
    echo "WARNING: ${DB_BENCH} exited with status ${bench_status}; last lines of its output:" >&2
    tail -n 20 -- "$LOG" >&2
  fi
fi

rm -f -- "$LOG"
@ -0,0 +1,217 @@ |
|||||||
|
#!/bin/bash
#
#(c) 2004-present, Facebook, all rights reserved.
# See the LICENSE file for usage and distribution rights.
#

# Leave promptly on HUP/INT/QUIT/TERM. The explicit non-zero status lets a
# caller tell an interrupted run from a normal one (a bare `exit` would return
# the status of the preceding echo, i.e. 0).
trap 'echo "Caught exception, dying"; exit 1' 1 2 3 15

ME=$(basename "$0")    # script name, shown in the usage and config output
SERVER=$(hostname)     # host name, quoted in the "die" report

#parameters used
#
Dump_Config=0          # set to 1 by -d: print the configuration and exit
DEBUG=                 # non-empty enables verbose() output (-v)
OS=$(uname -s)         # kernel name, checked by oscheck(); found via PATH
VMEM=                  # ps column names per metric, filled in by oscheck()
RSS=
CPU=
VERBOSE=
VAR=                   # metric to monitor (-x)
LIMIT=                 # threshold for the metric (-l)
ACTION=                # what to do on a breach (-a)
N=                     # number of monitoring cycles (-n)
WAIT=                  # seconds between cycles (-w)

#
#supported OS: Linux only for now. Easy to add
#
# oscheck: choose the ps(1) column name for each metric based on ${OS}.
# Globals:  OS (read); VMEM, RSS, CPU (written)
# Calls die() when the OS has no entry below.
oscheck() {
  case "${OS}" in
    Linux)
      VMEM=vsz
      RSS=rss
      CPU=bsdtime
      ;;
    *)
      die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
      ;;
  esac
}

# verbose: write the arguments to stderr, but only when DEBUG is non-empty
# (i.e. -v was given). printf is used so that an argument such as "-n" is
# printed rather than taken as an echo option.
verbose() {
  if [ -n "$DEBUG" ]; then
    printf '%s\n' "$*" >&2
  fi
}

# warn: write the arguments, joined by spaces, to stderr unconditionally.
# printf is used so that an argument such as "-n" is printed rather than taken
# as an echo option.
warn() {
  printf '%s\n' "$*" >&2
}

# die: print "ERROR:  <message>" on stderr and terminate the script.
# Exits with status 1; a bare `exit` here would hand back the status of the
# message print (0) and make a fatal error look like success.
die() {
  printf 'ERROR:  %s\n' "$*" >&2
  exit 1
}

# dump_config: print the effective configuration on stdout (used by -d).
# Globals read: ME, HOSTNAME, PID, VAR, LIMIT, WAIT, N, ACTION
dump_config() {
  cat <<EOCONFIG
$ME running on ${HOSTNAME} at $(date)

Configuration for this run:
  PID to monitor     : ${PID}
  Resource monitored : ${VAR}
  Resource limit     : ${LIMIT}
  Check every        : ${WAIT} seconds
  No. of times run   : ${N}
  What to do         : ${ACTION}
EOCONFIG
}

# usage: print an optional error message followed by the help text, then exit.
# Arguments: optional words forming an error message.
# With no message (plain -h) the text goes to stdout and the exit status is 0.
# With a message the call reports a usage error: the text goes to stderr and
# the exit status is 2, so callers can detect the failure.
usage() {
  local status=0
  if [ $# -gt 0 ]; then
    status=2
    exec 1>&2   # safe: the script exits right after printing
  fi
  cat <<USAGE
$*

Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait] [-v] [-d] [-h]

Monitor a process for set of violations. Options:

  -p: PID of process to monitor

  -x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM

  -l: what is the threshold/limit for the metric that is being sensed.
      Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
      NOTE: defaults to 1GB

  -a: action. Currently {warn|die|kill} are supported.
      The default action is to 'warn'. Here is the behavior:

      warn: complain if usage exceeds threshold, but continue monitoring
      kill: complain, kill the db_bench process and exit
      die: if usage exceeds threshold, die immediately

  -n: number of cycles to monitor. Default is to monitor until PID no longer exists.

  -w: wait time per cycle of monitoring. Default is 5 seconds.

  -v: verbose messaging

  -d: print the configuration for this run and exit

  -h: prints this help message

USAGE
  exit "$status"
}

#set default values if none given
# Each setting is assigned only when it is unset or empty, so values supplied
# on the command line win. The expansions are quoted so that a value containing
# glob characters or spaces is not expanded as an argument to ':'.
# Globals written: VAR, LIMIT, WAIT, N, ACTION
set_defaults_if_noopt_given() {
  : "${VAR:=vsz}"
  : "${LIMIT:=1024000}"
  : "${WAIT:=5}"
  : "${N:=999999}"
  : "${ACTION:=warn}"
}

# validate_options: reject unusable settings before monitoring starts.
# Globals read: Dump_Config, PID, ACTION
# Reports problems through usage(), which terminates the script.
validate_options() {
  # -d only prints the configuration, so nothing is mandatory in that mode.
  if [ "${Dump_Config:-0}" -eq 1 ]; then
    return 0
  fi

  if [ -z "$PID" ]; then
    usage "PID is mandatory"
  fi

  # A non-numeric PID would only surface later as a confusing ps failure.
  case "$PID" in
    *[!0-9]*) usage "PID must be a number, got '${PID}'" ;;
  esac

  # An unknown action would otherwise end the monitoring loop silently on the
  # first breach. Empty is allowed: the default is filled in separately.
  case "$ACTION" in
    ""|warn|die|kill) ;;
    *) usage "Unknown action '${ACTION}'" ;;
  esac
}

###### START

# parse_options: translate command-line flags into the global settings.
# Arguments: the script's command line ("$@").
# Globals written: Dump_Config, ACTION, DEBUG, PID, VAR, LIMIT, WAIT, N
# The option string now declares "w:", so the documented -w flag is accepted
# (it used to declare "t:" with no matching arm, which sent -w to the
# unknown-option branch).
# NOTE(review): -t is kept as an alias for -w on the assumption that it was an
# earlier spelling of the same option -- confirm.
parse_options() {
  local opt OPTIND=1
  while getopts ":p:x:l:a:n:w:t:vhd" opt; do
    case "$opt" in
      d) Dump_Config=1 ;;
      h) usage ;;
      a) ACTION=$OPTARG ;;
      v) DEBUG=1 ;;
      p) PID=$OPTARG ;;
      x) VAR=$OPTARG ;;
      l) LIMIT=$OPTARG ;;
      w|t) WAIT=$OPTARG ;;
      n) N=$OPTARG ;;
      # With the leading ':' in the option string, getopts reports a missing
      # argument as ':' and leaves the option letter in OPTARG.
      :) usage "Option -${OPTARG} requires an argument" ;;
      \?) usage "Unknown option -${OPTARG}" ;;
    esac
  done
}

parse_options "$@"

# to_number: print a metric value or limit as a plain integer.
#   "M:SS" or "H:MM:SS" -> seconds (bsdtime-style CPU time)
#   "<n>k" "<n>m" "<n>g" -> KiB, the unit of ps's vsz/rss columns
#                           (1m = 1024, 1g = 1024 * 1024; fractions allowed)
#   anything else        -> the number itself, rounded to an integer
to_number() {
  local raw=$1
  case "$raw" in
    *:*)
      awk -v v="$raw" 'BEGIN {
        n = split(v, part, ":"); s = 0
        for (i = 1; i <= n; i++) s = s * 60 + part[i]
        printf "%.0f\n", s
      }'
      ;;
    *[kK]) awk -v v="${raw%?}" 'BEGIN { printf "%.0f\n", v }' ;;
    *[mM]) awk -v v="${raw%?}" 'BEGIN { printf "%.0f\n", v * 1024 }' ;;
    *[gG]) awk -v v="${raw%?}" 'BEGIN { printf "%.0f\n", v * 1024 * 1024 }' ;;
    *)     awk -v v="$raw" 'BEGIN { printf "%.0f\n", v }' ;;
  esac
}

oscheck
set_defaults_if_noopt_given
validate_options

if [ "${Dump_Config:-0}" -eq 1 ]; then
  dump_config
  exit
fi

# Map the metric names documented for -x onto the ps column names chosen by
# oscheck. Anything else is handed to ps unchanged, so "-x rss" keeps working.
cpu_column=${CPU:-bsdtime}
case "$VAR" in
  VMEM|vmem) metric=${VMEM:-vsz} ;;
  RSS)       metric=${RSS:-rss} ;;
  CPU|cpu)   metric=$cpu_column ;;
  *)         metric=$VAR ;;
esac

# Normalise the limit once so that "100m", "1.5g" and "5:04" compare as integers.
limit=$(to_number "$LIMIT")
max_cycles=${N:-999999}
cycles=0

verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"

while [ "$cycles" -lt "$max_cycles" ]; do
  cycles=$((cycles + 1))

  # "column=" suppresses the ps header; empty output means no such process.
  raw=$(ps -o "${metric}=" -p "$PID" 2>/dev/null)
  raw=${raw//[[:space:]]/}
  VAL=$(to_number "$raw")

  # A memory reading of 0 is treated as "process gone", as before. CPU time
  # can legitimately be 0:00, so for that metric only missing output counts.
  if [ -z "$raw" ] || { [ "$metric" != "$cpu_column" ] && [ "$VAL" -eq 0 ]; }; then
    warn "Process $PID ended without incident."
    break
  fi

  if [ "$VAL" -lt "$limit" ]; then
    echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}"
    sleep "$WAIT"
    continue
  fi

  # Threshold breached: act according to -a.
  case "$ACTION" in
    kill)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}; killing PID ${PID}"
      kill "${PID}" || kill -3 "${PID}"
      exit
      ;;
    warn)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}"
      # go back to monitoring, but wait first: without this sleep the loop
      # spins and floods stderr for as long as the value stays over the limit.
      sleep "$WAIT"
      ;;
    die)
      warn "WARNING: dying without killing process ${PID} on ${SERVER}"
      warn "The process details are below: "
      warn "$(ps -p "${PID}" -o pid,ppid,bsdtime,rss,vsz,cmd,args)"
      warn ""

      #should we send email/notify someone? TODO... for now, bail.

      exit 1
      ;;
    *)
      warn "ERROR: unknown action '${ACTION}'"
      exit 1
      ;;
  esac
done
Loading…
Reference in new issue