#! /bin/ksh
###########################################################################
#
#                               ONINIT
#
#                          PROPRIETARY DATA
#
# THIS DOCUMENT CONTAINS DATA WHICH IS THE PROPERTY OF ONINIT LLC.
# THIS DOCUMENT IS SUBMITTED TO RECIPIENT IN CONFIDENCE. INFORMATION
# CONTAINED HEREIN MAY NOT BE USED, COPIED OR DISCLOSED IN WHOLE OR IN
# PART EXCEPT AS PERMITTED BY WRITTEN AGREEMENT SIGNED BY AN OFFICER OF
# ONINIT LLC
#
# Filename:     @(#)Scripts healthcheck 1.1@(#)
# Last changed: 6/4/01 11:42:33
# SCCS file:    /oninit/tools/SCCS/tools/shell/s.healthcheck
#
# Property of Oninit LLC
# Tel   +1 913 674 0360
# Tel   +1 913 674 0364
# email info@oninit.com
#
# Added function clipnum, some ksh can't do float comparisons
# Changed awk to remove SUM - not in all awks
#
#
###########################################################################
#
# Prints a plain-text health report for one Informix instance. The report is
# built from onstat output, the online message log named by MSGPATH in the
# onconfig file, and queries against sysmaster.
#
# Required environment: INFORMIXSERVER, INFORMIXDIR, ONCONFIG.
# Exit status: 1 when one of those variables is empty, otherwise the status
# of the last command.
#
#SCCSDIR=${SCCSDIR:="/home/tools/SCCS/"}
#. ${SCCSDIR}.std.env

GREP=egrep                      # What we want to use for grep'ing
AWK=awk                         # What we want to use for awk'ing
NUMCHKPTS=60                    # 5 hours at standard interval of 300
METRIC_SEG="onstat -g seg"      # Memory
METRIC_IOV="onstat -g iov"      # IO by class
METRIC_BUF="onstat -P"          # Buffer usage
METRIC_PRO="onstat -p"          # Profile
METRIC_UPT="onstat -"           # Uptime
METRIC_SPI="onstat -g spi"      # Resource Spins
METRIC_LOG="onstat -l"          # Logs
METRIC_LRU="onstat -F"          # LRU
FUZZY=0                         # Are we using Fuzzy Checkpoints (0:No 1:Yes)

#
# All the test thresholds
#
KBREXTHRES=7                    # KBR excellent threshold
KBRTHRES=15                     # KBR threshold
LRUTHRES=1                      # LRU threshold
CHKTHRES=2                      # Checkpoint threshold
READEXTHRES=99                  # Read is excellent threshold
READTHRES=90                    # Read threshold
WRITEEXTHRES=95                 # Write is excellent threshold
WRITETHRES=85                   # Write threshold
PHYSTHRES=85                    # Physical buffer is too large threshold
PHYSIDEALTHRES=75               # Physical buffer is good threshold
LOGTHRES=85                     # Logical buffer is too large threshold
LOGIDEALTHRES=75                # Logical buffer is good threshold
LOGUSEDTHRES=50                 # Logs used threshold
FGTHRES=0                       # FG Write threshold
RAEXTHRES=99                    # Read Ahead excellent threshold
RATHRES=97                      # Read Ahead threshold
LWEXTHRES=1                     # Lock wait excellent threshold
LWTHRES=5                       # Lock wait threshold
BTERRTHRES=50                   # BTree error threshold
BTPRBTHRES=20                   # BTree problem threshold
IOTHRES=11                      # Average IO/wup Threshold
OTHERTHRES=5                    # "Other uses" buffer threshold

#
# Functions
#

#
# clipnum VALUE
# Sets the global TESTNUM to the integer part of VALUE (text before the
# first "."), because some ksh cannot compare floats in [ ]. Empty or
# non-numeric input yields 0 so that later integer tests never see an
# empty operand.
#
function clipnum
{
    TESTNUM=$(echo "$1" | cut -d"." -f1)
    case "${TESTNUM}" in
        ""|-|*[!0-9-]*) TESTNUM=0 ;;
    esac
}

#
# percent NUMERATOR DENOMINATOR [DENOMINATOR ...]
# Prints NUMERATOR as a percentage of the sum of the denominators using
# the "%4.2f" format. Prints 0.00 instead of dividing by zero.
#
function percent
{
    PCTNUM=${1:-0}
    shift
    echo "${PCTNUM} $*" | ${AWK} '{
        den = 0
        for (i = 2; i <= NF; i++) den = den + $i
        if (den == 0) print "0.00"
        else printf("%4.2f\n", ($1 / den) * 100)
    }'
}

#
# numle A B
# Returns success when A <= B. The comparison is done in awk so that
# fractional values are compared exactly rather than truncated.
#
function numle
{
    NUMLE=$(echo "$1 $2" | ${AWK} '{ if (($1 + 0) <= ($2 + 0)) print 1; else print 0 }')
    [ "${NUMLE}" = "1" ]
}

#
# sqlvalue SQL
# Feeds SQL to dbaccess against sysmaster and prints the first column of
# every output line that contains a space followed by a digit. dbaccess
# diagnostics on stderr are discarded.
#
function sqlvalue
{
    echo "$1" | dbaccess sysmaster - 2>/dev/null | ${AWK} '/ [0-9]/{print $1}'
}

#
# separator
# Prints the horizontal rule used between report entries.
#
function separator
{
    echo ""
    echo "----------------------------------------------------------------------------"
    echo ""
}

#
# displine MESSAGE...
# Prints a separator followed by the message, indented with one tab.
# The arguments are joined and printed quoted, so text such as "[3]." is
# never treated as a filename pattern.
#
function displine
{
    separator
    #
    # That's a tab not spaces
    #
    printf '\t%s\n' "$*"
}

#
# endofrun
# Prints the report trailer.
#
function endofrun
{
    separator
    echo "Healthcheck complete"
    separator
}

#
# startofrun
# Prints the report banner: run date, instance name and onconfig name.
#
function startofrun
{
    separator
    echo "Healthcheck run on $(date)"
    echo "Instance: ${INFORMIXSERVER}"
    echo "Onconfig: ${ONCONFIG}"
}

#
# End of Functions
#

#
# Check environment variables are set, these variables are checked but not
# validated
#
if [ -z "${INFORMIXSERVER}" ]
then
    displine "Error: INFORMIXSERVER is not set!"
    exit 1
fi

if [ -z "${INFORMIXDIR}" ]
then
    displine "Error: INFORMIXDIR is not set!"
    exit 1
fi

if [ -z "${ONCONFIG}" ]
then
    displine "Error: ONCONFIG is not set!"
    exit 1
fi

#
# Start of main program
#
startofrun

#
# Has the instance been restarted in the last 24 hours
# The test looks for the word "day" in the onstat header output.
# METRIC_UPT already carries the "-" argument, so nothing is appended.
#
if ${METRIC_UPT} | ${GREP} day > /dev/null 2>&1
then
    displine "No unusual downtime."
else
    displine "The server has been restarted in the last 24 hours."
fi

#
# Process the profile, some metrics are tested and reported immediately
# others are stored for later use.
#
# Every whitespace-separated token on the digit-bearing lines of the profile
# (lines containing "bytes" are dropped) is numbered in order; the case arms
# pick tokens by that position. NOTE(review): the positions presumably match
# the onstat -p layout of the release this was written for - confirm before
# running against another release.
#
cnt=0
for METRIC in $(${METRIC_PRO} | ${GREP} -v bytes | ${GREP} '[0-9]')
do
    cnt=$((cnt + 1))
    case $cnt in
    2)
        PAGREADS=${METRIC}
        ;;
    4)
        clipnum "${METRIC}"
        if [ "${TESTNUM}" -ge "${READEXTHRES}" ]
        then
            displine "At ${METRIC}% Read cache is excellent."
        elif [ "${TESTNUM}" -ge "${READTHRES}" ]
        then
            displine "At ${METRIC}% Read cache is satisfactory"
        else
            displine "At ${METRIC}% Read cache is below minimum target of ${READTHRES}%."
        fi
        ;;
    7)
        BUFWRITS=${METRIC}
        ;;
    8)
        clipnum "${METRIC}"
        if [ "${TESTNUM}" -ge "${WRITEEXTHRES}" ]
        then
            displine "At ${METRIC}% Write cache is excellent."
        elif [ "${TESTNUM}" -ge "${WRITETHRES}" ]
        then
            displine "At ${METRIC}% Write cache is satisfactory."
        else
            displine "At ${METRIC}% Write cache is below minimum target value of ${WRITETHRES}."
        fi
        ;;
    32)
        BUFWAITS=${METRIC}
        BUFWPCT=$(percent "${BUFWAITS}" "${PAGREADS}" "${BUFWRITS}")
        ;;
    33)
        LOKWAITS=${METRIC}
        ;;
    34)
        LOKREQS=${METRIC}
        LWPCT=$(percent "${LOKWAITS}" "${LOKREQS}")
        ;;
    35)
        # Strictly greater than zero: a count of 0 means none were seen.
        if [ "${METRIC}" -gt 0 ]
        then
            displine "${METRIC} deadlock(s) detected."
        else
            displine "No deadlocks detected."
        fi
        ;;
    36)
        if [ "${METRIC}" -gt 0 ]
        then
            displine "${METRIC} lock timeout(s) detected."
        else
            displine "No lock timeouts detected."
        fi
        ;;
    40)
        IDXARA=${METRIC}
        ;;
    41)
        IDXRA=${METRIC}
        ;;
    42)
        DARA=${METRIC}
        ;;
    43)
        RAPGSUSED=${METRIC}
        RAPCT=$(percent "${RAPGSUSED}" "${IDXARA}" "${IDXRA}" "${DARA}")
        ;;
    *)
        ;;
    esac
done

#
# Check LRU contention
# Sum of the first column of the lru lines in the spin report, expressed as
# a percentage of page reads plus buffer writes. The result is fractional,
# so the threshold test is done by numle rather than by [ ].
#
SPI=$(${METRIC_SPI} | ${GREP} -i lru | ${AWK} 'BEGIN {w=0} {w=w+$1} END {print w}')
LC=$(percent "${SPI}" "${PAGREADS:-0}" "${BUFWRITS:-0}")
if numle "${LC}" "${LRUTHRES}"
then
    displine "At ${LC}% LRU Contention is excellent."
else
    displine "At ${LC}% LRU Contention is too high (Target is ${LRUTHRES}% or less)."
fi

#
# Check the length of the checkpoints
# The online log is located through the MSGPATH entry of the onconfig file;
# only lines whose first field is MSGPATH are used, so commented-out
# entries are ignored.
#
ONLINELOG=$(${AWK} '$1 == "MSGPATH" { print $2 }' "${INFORMIXDIR}/etc/${ONCONFIG}" | tail -1)
if [ -n "${ONLINELOG}" ] && [ -r "${ONLINELOG}" ]
then
    if [ "${FUZZY}" = "1" ]
    then
        CHKDURATIONS=$(${GREP} "Fuzzy Checkpoint" "${ONLINELOG}" | tail -"${NUMCHKPTS}" | ${AWK} '{print $7}')
    else
        CHKDURATIONS=$(${GREP} "Checkpoint Completed" "${ONLINELOG}" | tail -"${NUMCHKPTS}" | ${AWK} '{print $6}')
    fi

    cnt=0
    for i in ${CHKDURATIONS}
    do
        # Skip anything that is not a plain integer duration.
        case "${i}" in
            ""|*[!0-9]*) continue ;;
        esac
        if [ "${i}" -ge "${CHKTHRES}" ]
        then
            cnt=$((cnt + 1))
        fi
    done

    if [ "$cnt" -gt 0 ]
    then
        displine "$cnt of the last ${NUMCHKPTS} checkpoint(s) were longer than ${CHKTHRES} seconds."
    else
        displine "Checkpoint duration is satisfactory."
    fi
else
    # Without this guard grep would be given no file and read stdin.
    displine "Unable to read the online log [${ONLINELOG}], checkpoints not checked."
fi

#
# check physical log buffering
# Field 6 of the " P-" line as a percentage of field 3.
#
LOGSTAT=$(${METRIC_LOG})
PBUFSIZE=$(echo "${LOGSTAT}" | ${GREP} " P-" | ${AWK} '{print $3}')
PBUFUSE=$(echo "${LOGSTAT}" | ${GREP} " P-" | ${AWK} '{print $6}')
PBUFPCT=$(percent "${PBUFUSE}" "${PBUFSIZE}")
clipnum "${PBUFPCT}"
if [ "${TESTNUM}" -ge "${PHYSTHRES}" ]
then
    displine "PHYSBUFF can be increased."
elif [ "${TESTNUM}" -le "${PHYSIDEALTHRES}" ]
then
    displine "PHYSBUFF can be decreased [${PBUFPCT}%]."
else
    displine "PHYSBUFF appears to be sized correctly [${PBUFPCT}%]."
fi

#
# check logical log buffering
# Field 8 of the " L-" line as a percentage of field 3.
#
LBUFSIZE=$(echo "${LOGSTAT}" | ${GREP} " L-" | ${AWK} '{print $3}')
LBUFUSE=$(echo "${LOGSTAT}" | ${GREP} " L-" | ${AWK} '{print $8}')
LBUFPCT=$(percent "${LBUFUSE}" "${LBUFSIZE}")
clipnum "${LBUFPCT}"
if [ "${TESTNUM}" -ge "${LOGTHRES}" ]
then
    displine "LOGBUFF can be increased."
elif [ "${TESTNUM}" -le "${LOGIDEALTHRES}" ]
then
    displine "LOGBUFF can be decreased [${LBUFPCT}%]."
else
    displine "LOGBUFF appears to be sized correctly [${LBUFPCT}%]."
fi

#
# Review the log status
#

#
# Are there any new logs out there
#
cnt=$(sqlvalue "SELECT count(*) FROM syslogs WHERE is_new = 1;")
clipnum "${cnt}"
if [ "${TESTNUM}" -gt 0 ]
then
    displine "Newly added logs detected."
fi

#
# check the free log space
#
LOGTOT=$(sqlvalue "SELECT SUM(size) FROM syslogs WHERE is_new = 0;")
LOGFREE=$(sqlvalue "CREATE TEMP TABLE t_healthcheck ( logs int );
INSERT INTO t_healthcheck SELECT SUM(size - used) FROM syslogs WHERE is_new = 0;
INSERT INTO t_healthcheck SELECT SUM(size) FROM syslogs WHERE is_used = 0;
INSERT INTO t_healthcheck SELECT SUM(size) FROM syslogs WHERE is_used = 1 AND is_backed_up = 1;
SELECT SUM(logs) FROM t_healthcheck;
DROP TABLE t_healthcheck;")

# Reduce both values to plain integers before doing integer arithmetic.
clipnum "${LOGTOT}"
LOGTOT=${TESTNUM}
clipnum "${LOGFREE}"
LOGFREE=${TESTNUM}

if [ "${LOGTOT}" -gt 0 ]
then
    LOGPCT=$((LOGFREE * 100 / LOGTOT))
    if [ "${LOGPCT}" -le "${LOGUSEDTHRES}" ]
    then
        displine "Less than ${LOGUSEDTHRES}% logical log space left [${LOGPCT}% free]."
    else
        displine "Logical logs are OK [${LOGPCT}% free]."
    fi
else
    # No usable total: report that rather than a misleading "OK".
    displine "Unable to determine logical log space usage."
fi

#
# Check for foreground writes
# Uses the first field of the first line that has digits and no letters.
#
cnt=$(${METRIC_LRU} | ${GREP} '[0-9]' | ${GREP} -v '[a-z]' | ${GREP} -v '[A-Z]' | ${AWK} 'NR == 1 {print $1}')
clipnum "${cnt}"
if [ "${TESTNUM}" -gt "${FGTHRES}" ]
then
    displine "You have encountered $cnt foreground write(s)."
else
    displine "No foreground writes detected."
fi

#
# shared memory usage, very basic. You should have only one virtual segment. The maximum
# varies between OS's but one is always the target
#
cnt=$(${METRIC_SEG} | ${GREP} -c " V ")
clipnum "${cnt}"
if [ "${TESTNUM}" -gt 1 ]
then
    displine "Too many shared memory segments [$cnt]."
else
    displine "Shared memory segment count OK."
fi

#
# Activity per (K)AIO VP, count how many VPs are not averaging one io per wake,
# check the average io per wakeup and compare this to the threshold
# Field 10 of each kio/aio line is used as the io-per-wakeup figure.
# Nothing is printed by awk when no kio/aio line was found.
#
IOWUP=$(${METRIC_IOV} | ${GREP} '[ka]io' | ${AWK} '
    BEGIN { tot = 0; cnt = 0 }
    {
        tot = tot + $10
        if ($10 < 1) cnt++
    }
    END {
        if (NR > 0) printf("%4.2f %d", tot / NR, cnt)
    }')

if [ -z "${IOWUP}" ]
then
    displine "No AIO/KAIO VP statistics available."
else
    AVGIOWUP=$(echo "${IOWUP}" | cut -d" " -f1)
    LOWIO=$(echo "${IOWUP}" | cut -d" " -f2)
    clipnum "${LOWIO}"
    LOWIO=${TESTNUM}
    if [ "${LOWIO}" -eq 1 ]
    then
        displine "The correct number of AIO/KAIO VPs are configured."
    elif [ "${LOWIO}" -ge 1 ]
    then
        displine "${LOWIO} AIO/KAIO VPs have less than 1 I/O per wakeup."
    else
        clipnum "${AVGIOWUP}"
        if [ "${TESTNUM}" -ge "${IOTHRES}" ]
        then
            displine "Too few AIO/KAIO VPs are configured."
        else
            displine "Too few AIO/KAIO VPs are configured [no problems at this stage]."
        fi
    fi
fi

#
# Checking buffer performance and RA utilisation
#
clipnum "${RAPCT}"
if [ "${TESTNUM}" -ge "${RAEXTHRES}" ]
then
    displine "At ${RAPCT}% Read Ahead buffer usage is excellent."
elif [ "${TESTNUM}" -ge "${RATHRES}" ]
then
    displine "At ${RAPCT}% Read Ahead buffer usage is good."
else
    displine "At ${RAPCT}% Read Ahead buffer usage is poor."
fi

clipnum "${BUFWPCT}"
if [ "${TESTNUM}" -le "${KBREXTHRES}" ]
then
    displine "At ${BUFWPCT}% Buffer wait ratio is excellent."
elif [ "${TESTNUM}" -le "${KBRTHRES}" ]
then
    displine "At ${BUFWPCT}% Buffer wait ratio is good."
else
    displine "At ${BUFWPCT}% Buffer wait ratio is poor."
fi

#
# Lock performance
#
clipnum "${LWPCT}"
if [ "${TESTNUM}" -le "${LWEXTHRES}" ]
then
    displine "At ${LWPCT}% Lock wait ratio is excellent."
elif [ "${TESTNUM}" -le "${LWTHRES}" ]
then
    displine "At ${LWPCT}% Lock wait ratio is acceptable."
else
    displine "At ${LWPCT}% Lock wait ratio is poor."
fi

#
# So what are all those buffers up to then
# Reads the second field of the btree and other lines found in the last
# four lines of the buffer report.
#
BUFSTAT=$(${METRIC_BUF} | tail -4)
BTREEPCT=$(echo "${BUFSTAT}" | ${GREP} -i btree | ${AWK} '{print $2}')
OTHERPCT=$(echo "${BUFSTAT}" | ${GREP} -i other | ${AWK} '{print $2}')

clipnum "${BTREEPCT}"
if [ "${TESTNUM}" -ge "${BTERRTHRES}" ]
then
    displine "At ${BTREEPCT}% Buffers allocated to index data is dangerously high."
elif [ "${TESTNUM}" -ge "${BTPRBTHRES}" ]
then
    displine "At ${BTREEPCT}% Buffers allocated to indexes is high."
else
    displine "At ${BTREEPCT}% Buffers allocated to indexes is acceptable."
fi

clipnum "${OTHERPCT}"
if [ "${TESTNUM}" -ge "${OTHERTHRES}" ]
then
    displine "Buffers allocated to \"other uses\" is high [${OTHERPCT}%]."
else
    displine "Buffers allocated to \"other uses\" is acceptable [${OTHERPCT}%]."
fi

#
# OK we are out of here
#
endofrun