From 36c90a55c4faa43643448f6a0cdeb17a850bba46 Mon Sep 17 00:00:00 2001
From: Michael Tate
Date: Fri, 28 Mar 2014 16:18:11 -0400
Subject: [PATCH] from the "Evergreen Availability Monitoring" EG2014 conference talk

---
 monitoring/nagios/check_apache_cpu     |  48 ++++++++
 monitoring/nagios/check_at_pending     |  92 ++++++++-------
 monitoring/nagios/check_backends2      |  99 ++++++++++++++++
 monitoring/nagios/check_dbquery        |  72 ++++++++++++
 monitoring/nagios/check_lock           |  39 ++++++
 monitoring/nagios/check_mem_swap       |  57 +++++++++
 monitoring/nagios/check_notconnected   | 126 +++++++++-----------
 monitoring/nagios/check_null           | 209 +++++++++++++++------------------
 monitoring/nagios/check_pound_rotation |  48 ++++++++
 monitoring/nagios/check_slony          |  28 +++++
 10 files changed, 590 insertions(+), 228 deletions(-)
 create mode 100644 monitoring/nagios/check_apache_cpu
 create mode 100644 monitoring/nagios/check_backends2
 create mode 100644 monitoring/nagios/check_dbquery
 create mode 100644 monitoring/nagios/check_lock
 create mode 100644 monitoring/nagios/check_mem_swap
 create mode 100644 monitoring/nagios/check_pound_rotation
 create mode 100644 monitoring/nagios/check_slony

diff --git a/monitoring/nagios/check_apache_cpu b/monitoring/nagios/check_apache_cpu
new file mode 100644
index 0000000..b67f04d
--- /dev/null
+++ b/monitoring/nagios/check_apache_cpu
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Copyright (C) 2008-2013 Equinox Software, Inc.
+# Written by Michael Tate
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+#
+# Author : Michael Tate, Sys Admin, ESI
+# Purpose : Count Apache processes using more than 80% CPU (top 4 sampled) and report the highest.
+USAGE="check_apache_cpu (WARN 1, CRIT 2+)"
+if [[ $1 == *help* ]]; then
+  echo "Usage: $USAGE"
+  exit 0
+fi
+
+HIGHESTPROC=`ps -Ao pcpu,pid,args | grep -i apache | grep -v grep | sort -rn|cut -d"." -f1|head -n1`
+TOPPROCLIST=`ps -Ao pcpu,pid,args | grep -i apache | grep -v grep | sort -rn|cut -d"." -f1|head -n4`
+HIGHPROCS=0   # number of sampled processes above the 80% threshold
+
+for i in $TOPPROCLIST; do   # unquoted on purpose: one integer %CPU value per word
+  if [ "$i" -gt "80" ]; then
+    HIGHPROCS=$((HIGHPROCS + 1))
+  fi
+done
+
+
+if [ "$HIGHPROCS" -gt "1" ]; then
+  EXITSTATUS="CRIT: $HIGHPROCS High CPU Apache processes; Highest: $HIGHESTPROC%"
+  EXITCODE="2"
+elif [ "$HIGHPROCS" -gt "0" ]; then
+  EXITSTATUS="WARN: $HIGHPROCS High CPU Apache process; Usage: $HIGHESTPROC%"
+  EXITCODE="1"
+else
+  EXITSTATUS="OK: Highest CPU usage $HIGHESTPROC%"
+  EXITCODE="0"
+fi
+
+echo "$EXITSTATUS"
+exit $EXITCODE
+
diff --git a/monitoring/nagios/check_at_pending b/monitoring/nagios/check_at_pending
index 36ebc0a..ae367bb 100644
--- a/monitoring/nagios/check_at_pending
+++ b/monitoring/nagios/check_at_pending
@@ -12,63 +12,65 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.
 #
-#
-# Author : MTate, Sys Admin, ESI
 # Purpose : Count AT pending events
-# Usage : check_at_pending 
+USAGE="check_at_pending [dbname] [dbuser] [dbport]"
 if [[ $1 == *help* ]]; then
-  echo "Usage: check_dbquery "
+  echo "Usage: $USAGE"
   exit 0
 fi
 
-# SET/GET VARIABLES
-  # database name
-  if [ -n "$1" ]; then
-    DBNAME="$1"
-  else
-    DBNAME="evergreen"
-  fi
-  # database user name
-  if [ -n "$2" ]; then
-    DBUSER="$1"
-  else
-    DBUSER="evergreen"
-  fi
+## GET/SET Variables ##
+# The values below are arbitrary, change them to match your environment.
+CTWARN=900000 # How many pending transactions to WARN at
+CTCRIT=1000000 # How many pending transactions to CRIT at
+
+# database name
+if [ -n "$1" ]; then
+  dbname="$1"
+else
+  dbname="evergreen"
+fi
 
-  # port database runs on
-  if [ -n "$3" ]; then
-    DBPORT="$3"
-  else
-    DBPORT=5432
-  fi
-# Execute AT Pending Count
-ATPENDING=`PGUSER=postgres psql -U $DBUSER -d $DBNAME -p $DBPORT -c "select count(*) from action_trigger.event where state ='pending';"|sed -n '3'p`
+# database user name
+if [ -n "$2" ]; then
+  dbuser="$2"   # second argument is the user (the first is the database name)
+else
+  dbuser="evergreen"
+fi
+
+# port database runs on
+if [ -n "$3" ]; then
+  dbport="$3"
+else
+  dbport=5432
+fi
 
 
-CTWARN=900000 # These values will need modification
-CTCRIT=1000000 # to what is normal for your environment.
-  # Run the check manually for two weeks
-  # to gather the needed info
+## Execute Plugin ##
+# DB Query Count
+ATPENDING=`PGUSER=postgres psql -U $dbuser -d $dbname -p $dbport -c "select count(*) from action_trigger.event where state ='pending';"|sed -n '3'p`
 
 
-# Result Analysis
-  if [ $ATPENDING -gt $CTCRIT ]; then
-    EXITSTATUS="CRITICAL: $ATPENDING AT events pending"
-    EXITCODE=2
-  elif [ $ATPENDING -gt $CTWARN ]; then
-    EXITSTATUS="WARNING: $ATPENDING AT events pending"
-    EXITCODE=1
-  elif [ $ATPENDING -gt 0 ]; then
-    EXITSTATUS="OK: $ATPENDING AT events pending"
+## Return results ##
+if [[ $ATPENDING == "-00" ]]; then
+  EXITSTATUS="OK: No AT events pending"
   EXITCODE=0
-  else
-    if [[ $ATPENDING == "-00" ]]; then
-      EXITSTATUS="OK: No AT events pending"
-      EXITCODE=0
-    fi
-  fi
+else
+  if [ $ATPENDING -gt $CTCRIT ]; then
+    EXITSTATUS="CRITICAL: $ATPENDING AT events pending"
+    EXITCODE=2
+  elif [ $ATPENDING -gt $CTWARN ]; then
+    EXITSTATUS="WARNING: $ATPENDING AT events pending"
+    EXITCODE=1
+  else
+    EXITSTATUS="OK: $ATPENDING AT events pending"
+    EXITCODE=0
+  fi
+fi
 
 
-# Return results
 echo "$EXITSTATUS"
 exit $EXITCODE
+
+
+
diff --git a/monitoring/nagios/check_backends2 b/monitoring/nagios/check_backends2
new file mode 100644
index 0000000..9dd720f
--- /dev/null
+++ b/monitoring/nagios/check_backends2
@@ -0,0 +1,99 @@
+#!/bin/bash
+# Copyright (C) 2008-2010 Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# postgres Backend Count
+# Written by: Equinox Software, September 22, 2010 - Lee Dickens
+# Modified by: Equinox Software, March 13, 2014 - Michael Tate
+USAGE="check_backends2 [max_connections] [pg|pool]"
+if [[ $1 == *help* ]]; then
+  echo "Usage: $USAGE"
+  exit 0
+fi
+
+## GET/SET Variables ##
+# Use "max_connections" from postgresql.conf
+if [ -n "$1" ]; then
+  MAXCX=$1
+else
+  MAXCX="800"
+# MAXCX=`grep max_connections $(find /etc/postgresql/ -name postgresql.conf|tail -n1)` | grep -v "^#"|tr -cd '[[:digit:]]{}'`
+fi
+
+# Platform: plain postgres or pgpool
+if [ -n "$2" ]; then
+  PLATCX=$2
+else
+  PLATCX="pg"
+fi
+
+if [[ $PLATCX == "pg" ]]; then
+  PGCX=$MAXCX
+  # PG warn and crit, high and low
+  PGCXWL=$(($PGCX/10)) # PostgreSQL number of connections WARN level, low (10% of max)
+  PGCXCL=$(($PGCXWL/2)) # PostgreSQL number of connections CRIT level, low (5% of max)
+  PGCXC=$(($PGCX-$PGCXWL)) # PostgreSQL number of connections CRIT level, high (90% of max)
+  PGCXW=$(($PGCXC-$PGCXWL)) # PostgreSQL number of connections WARN level, high (80% of max)
+  PGACT=`ps ax|grep -v grep | grep -c postgres`
+
+  if [ $PGACT -lt $PGCXCL ]; then
+    EXITSTATUS="CRIT: postgresql backends = $PGACT/$PGCX"
+    EXITCODE=2
+  elif [ $PGACT -lt $PGCXWL ]; then
+    EXITSTATUS="WARN: postgresql backends = $PGACT/$PGCX"
+    EXITCODE=1
+  elif [ $PGACT -gt $PGCXC ]; then   # test the higher (CRIT) bound first so it is reachable
+    EXITSTATUS="CRIT: postgresql backends = $PGACT/$PGCX"
+    EXITCODE=2
+  elif [ $PGACT -gt $PGCXW ]; then
+    EXITSTATUS="WARN: postgresql backends = $PGACT/$PGCX"
+    EXITCODE=1
+  else
+    EXITSTATUS="OK: postgresql backends = $PGACT/$PGCX"
+    EXITCODE=0
+  fi
+
+elif [[ $PLATCX == "pool" ]]; then
+  POOLCX=$MAXCX
+  # PGPOOL warn and crit, high and low (derived from POOLCX; the PG* values are not set in this branch)
+  POOLCXWL=$(($POOLCX/10)) # pgpool number of connections WARN level, low (10% of max)
+  POOLCXCL=$(($POOLCXWL/2)) # pgpool number of connections CRIT level, low (5% of max)
+  POOLCXC=$(($POOLCX-$POOLCXWL)) # pgpool number of connections CRIT level, high (90% of max)
+  POOLCXW=$(($POOLCXC-$POOLCXWL)) # pgpool number of connections WARN level, high (80% of max)
+  POOLACT=`ps ax|grep -v "wait\|grep" | grep -c pgpool`
+
+  if [ $POOLACT -lt $POOLCXCL ]; then
+    EXITSTATUS="CRIT: pgpool backends = $POOLACT/$POOLCX"
+    EXITCODE=2
+  elif [ $POOLACT -lt $POOLCXWL ]; then
+    EXITSTATUS="WARN: pgpool backends = $POOLACT/$POOLCX"
+    EXITCODE=1
+  elif [ $POOLACT -gt $POOLCXC ]; then
+    EXITSTATUS="CRIT: pgpool backends = $POOLACT/$POOLCX"
+    EXITCODE=2
+  elif [ $POOLACT -gt $POOLCXW ]; then
+    EXITSTATUS="WARN: pgpool backends = $POOLACT/$POOLCX"
+    EXITCODE=1
+  else
+    EXITSTATUS="OK: pgpool backends = $POOLACT/$POOLCX"
+    EXITCODE=0
+  fi
+
+else
+  echo "Usage: $USAGE"
+  exit 3   # unrecognised platform argument: report UNKNOWN to Nagios, not OK
+fi
+
+
+echo "$EXITSTATUS"
+exit $EXITCODE
+
diff --git a/monitoring/nagios/check_dbquery b/monitoring/nagios/check_dbquery
new file mode 100644
index 0000000..f32447e
--- /dev/null
+++ b/monitoring/nagios/check_dbquery
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Copyright (C) 2008-2010 Equinox Software, Inc.
+# Written by Lee Dickens
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+#
+# Author : Lee Dickens, Sys Admin, ESI
+# Updated : 07-MAR-2011, Michael Tate, Sys Admin, ESI: Added arguments
+# Purpose : Look for long-running queries: Warn when older than 5 hours, Critical at 7.
+USAGE="check_dbquery [dbname] [dbuser] [dbport]"
+
+if [[ $1 == *help* ]]; then
+  echo "Usage: $USAGE"
+  exit 0
+fi
+
+## GET/SET Variables ##
+# database name
+if [ -n "$1" ]; then
+  dbname="$1"
+else
+  dbname="evergreen"
+fi
+
+# database user name
+if [ -n "$2" ]; then
+  dbuser="$2"   # second argument is the user (the first is the database name)
+else
+  dbuser="evergreen"
+fi
+
+# port database runs on
+if [ -n "$3" ]; then
+  dbport="$3"
+else
+  dbport=5432
+fi
+
+
+## Execute DB Query Count ##
+count=`PGUSER=postgres psql -U $dbuser -d $dbname -p $dbport -c "select now()-query_start from pg_stat_activity where current_query NOT LIKE '
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+#
+# Author : Dave Brown, Sys Admin, ESI
+# Last Updated : Sept 5, 2009
+# Purpose : Check status of file name and process from CL variables.
+# Usage : check_lock <lock file> <process name>
+
+
+scriptname=$0
+filetocheck=$1
+proctocheck=$2
+
+if [ -f "$filetocheck" ]; then
+  if [ "$(ps aux | grep -i -- "$proctocheck" | grep -v grep | grep -v -- "$scriptname" | wc -l)" -gt 0 ]; then
+    echo "OK: $filetocheck exists and $proctocheck running"
+    exit 0
+  else
+    echo "CRIT: $filetocheck exists but $proctocheck not running"
+    exit 2
+  fi
+else
+  echo "OK: $filetocheck not found"
+  exit 0
+fi
+
diff --git a/monitoring/nagios/check_mem_swap b/monitoring/nagios/check_mem_swap
new file mode 100644
index 0000000..146befd
--- /dev/null
+++ b/monitoring/nagios/check_mem_swap
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright (C) 2008-2013 Equinox Software, Inc.
+# Written by Michael Tate
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+#
+# Author : Michael Tate, Sys Admin, ESI
+# Purpose : Compare available MEM to SWAP
+USAGE="check_mem_swap (CRIT if swapuse > freemem)"
+if [[ $1 == *help* ]]; then
+  echo "Usage: $USAGE"
+  exit 0
+fi
+
+# RULE: avail + buffers + cache == freemem
+# mem + buffers
+MFREE=`free -m|grep Mem|cut -c30-41`
+read -rd '' MFREE <<< "$MFREE"
+BCFREE=`free -m|grep buffers|grep -v cached|cut -c30-41`
+read -rd '' BCFREE <<< "$BCFREE"
+# swap
+SWUSE=`free -m|grep Swap|cut -c19-30`
+read -rd '' SWUSE <<< "$SWUSE"
+SWTOT=$(free -m|grep Swap|cut -c12-19)
+read -rd '' SWTOT <<< "$SWTOT"
+SWPCT=$(( SWTOT > 0 ? (SWUSE * 100) / SWTOT : 0 ))   # a host without swap must not divide by zero
+scale=2
+
+MFREETOT=$(($MFREE + ${BCFREE:-0}))   # treat a missing buffers line as 0
+MFREENET=$(($MFREETOT - $SWUSE))
+MFREEWARN=$((($MFREETOT * 66)/100))
+
+# Default to OK so EXITSTATE/EXITCODE are always set; escalate only when swap is heavily used.
+EXITSTATE="OK: Swap use: ${SWUSE}MB (${SWPCT}%); Mem Free: ${MFREETOT}MB"
+EXITCODE=0
+if [ "$SWPCT" -ge 66 ]; then
+  if [ "$SWUSE" -ge "$MFREETOT" ]; then
+    EXITSTATE="CRIT: Swap use: ${SWUSE}MB (${SWPCT}%); Mem Free: ${MFREETOT}MB"
+    EXITCODE=2
+  elif [ $(($SWUSE * 100)) -ge $(($MFREETOT * 66)) ]; then
+    EXITSTATE="WARN: Swap use: ${SWUSE}MB (${SWPCT}%); Mem Free: ${MFREETOT}MB"
+    EXITCODE=1
+  fi
+fi
+
+echo "$EXITSTATE"
+exit $EXITCODE
+
diff --git a/monitoring/nagios/check_notconnected b/monitoring/nagios/check_notconnected
index c839127..6f43061 100644
--- a/monitoring/nagios/check_notconnected
+++ b/monitoring/nagios/check_notconnected
@@ -1,69 +1,57 @@
-#!/bin/sh
-# Copyright (C) 2008-2013 Equinox Software, Inc.
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# Author : Michael Tate, Sys Admin, ESI, to allow for a path from the command line
-# Based on code written by Don McMorris ; see check_null
-# Purpose : Look for excessive NOT CONNECTEDS in the osrfsys logs
-
-USAGE="check_notconnected "
-# : The path to the log location.
-# : This plugin assumes that the logs will be dropped into folders for
-# : year (4 char), month (2 char), and day (2 char).
-# : If no path is entered, it will default to "/var/log/evergreen/prod/"
-
-
-if [ -n "$1" ]; then
-  if [[ $1 == *help* ]]; then
-    EXITSTATUS="$USAGE"
-    EXITCODE="0"
-  else
-
-
-    if [ -n "$1" ]; then
-      LOGPATH="$1/$(date +%Y/%m/%d)"
-    else
-      LOGPATH="/var/log/evergreen/prod/$(date +%Y/%m/%d)"
-    fi
-
-LOGFILE="$LOGPATH/osrfsys.$(date +%H).log"
-
-
-NCCOUNT=`grep -c 'IS NOT CONNECTED TO THE NETWORK' $LOGFILE`
-
-    if [ "$NCCOUNT" -gt "0" ]; then
-      TOPSERVER=$(grep "IS NOT CONNECTED TO THE NETWORK" $LOGFILE | cut -d" " -f3 | sort | uniq -c | sort -nr | head -1)
-      SVRMSG=" (Top server this hour: $TOPSERVER)"
-    else
-      SVRMSG="."
-    fi
-
-    if [ "$NCCOUNT" -ge "4" ]; then
-      EXITSTATUS="CRIT: $NCCOUNT NOT CONNECTEDs returned this hour:$SVRMSG"
-      EXITCODE="2"
-    elif [ "$NCCOUNT" -ge "2" ]; then
-      EXITSTATUS="WARN: $NCCOUNT NOT CONNECTEDs returned this hour:$SVRMSG"
-      EXITCODE="1"
-    elif [ "$NCCOUNT" -lt "2" ]; then
-      EXITSTATUS="OK: $NCCOUNT NOT CONNECTEDs returned this hour$SVRMSG"
-      EXITCODE="0"
-    else
-      EXITSTATUS="WARN: An error has occurred in the plugin"
-      EXITCODE="1"
-    fi
-
-  fi
-fi
-
-echo "$EXITSTATUS"
-exit $EXITCODE
-
+#!/bin/bash
+# Copyright (C) 2008-2010 Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+#
+# Author : Michael Tate, Sys Admin, ESI
+# Purpose : Look for excessive NOT CONNECTEDS in the osrfsys logs in the current hour
+USAGE="check_notconnected [logpath]"
+if [[ $1 == *help* ]]; then
+  echo "Usage: $USAGE"
+  exit 0
+fi
+
+## GET/SET Variables ##
+CRITLIMIT=20
+WARNLIMIT=12
+
+# logfile path
+if [ -n "$1" ]; then
+  LOGPATH="$1"
+else
+  LOGPATH="/var/log/evergreen/prod"
+fi
+LOGFILE="$LOGPATH/$(date +%Y/%m/%d)/osrfsys.$(date +%H).log"   # this hour's log, reused by every grep below
+NCCOUNT=`grep -c "IS NOT CONNECTED TO THE NETWORK" "$LOGFILE"`
+if [ "$NCCOUNT" -ge "$CRITLIMIT" ]; then
+  TOPSERVER=$(grep "IS NOT CONNECTED TO THE NETWORK" "$LOGFILE" | cut -d" " -f3 | sort | uniq -c | sort -nr | head -1)
+  SVRMSG=" (Top server this hour: $TOPSERVER)"
+  EXITSTATUS="CRIT"
+  EXITCODE=2
+elif [ "$NCCOUNT" -ge "$WARNLIMIT" ]; then
+  TOPSERVER=$(grep "IS NOT CONNECTED TO THE NETWORK" "$LOGFILE" | cut -d" " -f3 | sort | uniq -c | sort -nr | head -1)
+  SVRMSG=" (Top server this hour: $TOPSERVER)"
+  EXITSTATUS="WARN"
+  EXITCODE=1
+elif [ "$NCCOUNT" -lt "$WARNLIMIT" ]; then
+  EXITSTATUS="OK"
+  EXITCODE=0
+  SVRMSG="."
+else
+  EXITSTATUS="WARN: An error has occurred reading $LOGFILE"   # reached when NCCOUNT is not a number
+  EXITCODE=1
+fi
+
+echo "$EXITSTATUS: $NCCOUNT NOT CONNECTEDs returned this hour$SVRMSG"
+exit $EXITCODE
+
+
diff --git a/monitoring/nagios/check_null b/monitoring/nagios/check_null
index bfb9a21..16a4fe8 100644
--- a/monitoring/nagios/check_null
+++ b/monitoring/nagios/check_null
@@ -1,114 +1,95 @@
-#!/bin/sh
-# Copyright (C) 2008-2010 Equinox Software, Inc.
-# Written by Don McMorris
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-#
-# Author : Don McMorris, Sys Admin, ESI
-# Modified : Michael Tate, Sys Admin, ESI, to allow for a path from the command line
-# Purpose : Look for excessive NULLS in the gateway logs
-
-USAGE="check_null <# mins to check> "
-# <# mins to check> : Check for errors in the last # minutes
-# : The number of NULLS in the logs at which to present a warning condition.
-# : The number of NULLS on the logs at which to present a critical condition.
-# : The path to the log location.
-# : This plugin assumes that the logs will be dropped into folders for
-# : year (4 char), month (2 char), and day (2 char).
-# : If no path is entered, it will default to "/var/log/evergreen/prod/"
-
-if [ -n "$1" ]; then
-  if [[ $1 == *help* ]]; then
-  else
-
-PERIOD=$1
-WARNLIMIT=$2
-CRITLIMIT=$3
-  if [ -n "$4" ]; then
-    LOGPATH="$4/$(date +%Y/%m/%d)"
-  else
-    LOGPATH="/var/log/evergreen/prod/$(date +%Y/%m/%d)"
-  fi
-
-PREVTOT=0
-LOGFILE="$LOGPATH/gateway.$(date +%H).log"
-
-if [ $(date +%H | cut -b1) = 0 ]; then
-  CURRHOUR=$(date +%H | cut -b2)
-else
-  CURRHOUR=$(date +%H)
-fi
-
-if [ $(date +%M | cut -b1) = 0 ]; then
-  CURRMIN=$(date +%M | cut -b2 )
-else
-  CURRMIN=$(date +%M)
-fi
-
-if [ $CURRMIN -lt $PERIOD ]; then
-  # How many minutes of the last hour do we need to check?
-  TMPDIFFM2=$((60 - $(($PERIOD - $CURRMIN))))
-
-  # This logic will mean that "Returning NULL"'s logged at the late 2300 hour will not be counted during the early Midnight hour check.
-  # This is acceptable for now.
-  if [ $CURRHOUR -gt 0 ]; then
-    # define LOGFILE2 (last hours' log)
-    if [ $CURRHOUR -gt 11 ]; then
-      LOGFILE2="$LOGPATH/gateway.$(($CURRHOUR - 1)).log"
-    else
-      LOGFILE2="$LOGPATH/gateway.0$(($CURRHOUR - 1)).log"
-    fi
-
-    while [ $TMPDIFFM2 -lt 60 ]; do
-      PREVTOT=$(($PREVTOT + $(grep "Returning NULL" $LOGFILE2 | cut -d":" -f2 | grep -c $TMPDIFFM2)))
-      TMPDIFFM2=$(($TMPDIFFM2 + 1))
-    done
-  fi
-  while [ $TMPDIFF1 -le $CURRMIN ]; do
-    PREVTOT=$(($PREVTOT + $(grep "Returning NULL" $LOGFILE | cut -d":" -f2 | grep -c $TMPDIFF1)))
-    TMPDIFF1=$(($TMPDIFF1 + 1))
-  done
-else
-  TMPDIFF1=$(($CURRMIN-$PERIOD))
-  while [ $TMPDIFF1 -le $CURRMIN ]; do
-    PREVTOT=$(($PREVTOT + $(grep "Returning NULL" $LOGFILE | cut -d":" -f2 | grep -c $TMPDIFF1)))
-    TMPDIFF1=$(($TMPDIFF1 + 1))
-  done
-
-fi
-
-
-TOPSERVER=$(grep "Returning NULL" $LOGFILE | cut -d" " -f3 | sort | uniq -c | sort -nr | head -1)
-
-if [ "$TOPSERVER" != null ]; then
-  SVRMSG=" (Top server this hour: $TOPSERVER)"
-else
-  SVRMSG="."
-fi
-
-if [ $PREVTOT -ge $CRITLIMIT ]; then
-  echo "CRIT: $PREVTOT NULLs returned in past $PERIOD minutes$SVRMSG"
-  exit 2
-elif [ $PREVTOT -ge $WARNLIMIT ]; then
-  echo "WARN: $PREVTOT NULLs returned in the past $PERIOD minutes$SVRMSG"
-  exit 1
-elif [ $PREVTOT -lt $WARNLIMIT ]; then
-  echo "OK: $PREVTOT NULLs returned in the past $PERIOD minutes$SVRMSG"
-  exit 0
-else
-  echo "WARN: An error has occurred $PREVTOT $PERIOD"
-  exit 1
-fi
-
-
-fi
-
+#!/bin/sh
+# Copyright (C) 2008-2010 Equinox Software, Inc.
+# Written by Don McMorris
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+#
+# Author : Don McMorris, Sys Admin, ESI
+# Purpose : Look for excessive NULLS in the gateway logs
+# Usage : check_null <# mins to check> <warn limit> <crit limit>
+
+
+WARNLIMIT=$2
+CRITLIMIT=$3
+PERIOD=$1
+# Note: These should really be checked to ensure they are defined and within range...
+
+PREVTOT=0
+LOGFILE="/var/log/evergreen/prod/$(date +%Y/%m/%d)/gateway.$(date +%H).log"
+
+if [ $(date +%H | cut -b1) = 0 ]; then
+  CURRHOUR=$(date +%H | cut -b2)
+else
+  CURRHOUR=$(date +%H)
+fi
+
+if [ $(date +%M | cut -b1) = 0 ]; then
+  CURRMIN=$(date +%M | cut -b2 )
+else
+  CURRMIN=$(date +%M)
+fi
+
+if [ $CURRMIN -lt $PERIOD ]; then
+  # How many minutes of the last hour do we need to check?
+  TMPDIFFM2=$((60 - $(($PERIOD - $CURRMIN))))
+  TMPDIFF1=0   # the window wraps the hour, so the current hour is scanned from minute 0
+  # This logic will mean that "Returning NULL"'s logged at the late 2300 hour will not be counted during the early Midnight hour check.
+  # This is acceptable for now.
+  if [ $CURRHOUR -gt 0 ]; then
+    # define LOGFILE2 (last hours' log)
+    if [ $CURRHOUR -gt 10 ]; then   # previous hour is two digits from 11:xx onwards
+      LOGFILE2="/var/log/evergreen/prod/$(date +%Y/%m/%d)/gateway.$(($CURRHOUR - 1)).log"
+    else
+      LOGFILE2="/var/log/evergreen/prod/$(date +%Y/%m/%d)/gateway.0$(($CURRHOUR - 1)).log"
+    fi
+
+    while [ $TMPDIFFM2 -lt 60 ]; do
+      PREVTOT=$(($PREVTOT + $(grep "Returning NULL" $LOGFILE2 | cut -d":" -f2 | grep -c "^$(printf '%02d' "$TMPDIFFM2")")))
+      TMPDIFFM2=$(($TMPDIFFM2 + 1))
+    done
+  fi
+  while [ $TMPDIFF1 -le $CURRMIN ]; do
+    PREVTOT=$(($PREVTOT + $(grep "Returning NULL" $LOGFILE | cut -d":" -f2 | grep -c "^$(printf '%02d' "$TMPDIFF1")")))
+    TMPDIFF1=$(($TMPDIFF1 + 1))
+  done
+else
+  TMPDIFF1=$(($CURRMIN-$PERIOD))
+  while [ $TMPDIFF1 -le $CURRMIN ]; do
+    PREVTOT=$(($PREVTOT + $(grep "Returning NULL" $LOGFILE | cut -d":" -f2 | grep -c "^$(printf '%02d' "$TMPDIFF1")")))
+    TMPDIFF1=$(($TMPDIFF1 + 1))
+  done
+
+fi
+
+
+TOPSERVER=$(grep "Returning NULL" $LOGFILE | cut -d" " -f3 | sort | uniq -c | sort -nr | head -1)
+
+if [ -n "$TOPSERVER" ]; then   # the pipeline yields an empty string, never the word "null"
+  SVRMSG=" (Top server this hour: $TOPSERVER)"
+else
+  SVRMSG="."
+fi
+
+if [ $PREVTOT -ge $CRITLIMIT ]; then
+  echo "CRIT: $PREVTOT NULLs returned in past $PERIOD minutes$SVRMSG"
+  exit 2
+elif [ $PREVTOT -ge $WARNLIMIT ]; then
+  echo "WARN: $PREVTOT NULLs returned in the past $PERIOD minutes$SVRMSG"
+  exit 1
+elif [ $PREVTOT -lt $WARNLIMIT ]; then
+  echo "OK: $PREVTOT NULLs returned in the past $PERIOD minutes$SVRMSG"
+  exit 0
+else
+  echo "WARN: An error has occurred $PREVTOT $PERIOD"
+  exit 1
+fi
+
diff --git a/monitoring/nagios/check_pound_rotation b/monitoring/nagios/check_pound_rotation
new file mode 100644
index 0000000..d0d4b06
--- /dev/null
+++ b/monitoring/nagios/check_pound_rotation
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Copyright (C) 2012 Equinox Software, Inc.
+# Written by Michael Tate
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+#
+# Author : Michael Tate, Sys Admin, ESI
+# Purpose : Check number of DISABLED pound forwarders
+USAGE="check_pound_rotation (WARN if 1 brick, CRIT if 2 or more)"
+if [[ $1 == *help* ]]; then
+  echo "Usage: $USAGE"
+  exit 0
+fi
+
+
+PCOUNT=`sudo poundctl -c /var/run/pound/poundctl.socket | grep -c DISABLED`
+PDETAIL=`sudo poundctl -c /var/run/pound/poundctl.socket | grep DISABLED |cut -d":" -f1|cut -c18-|sort|uniq`
+
+if [[ $1 == *help* ]]; then
+  echo "Usage: $USAGE"
+else
+  if [ "$PCOUNT" -ge 2 ]; then   # CRIT is tested first so it is reachable (2 or more, per USAGE)
+    EXITSTATUS="CRIT: $PCOUNT Services disabled; "
+    EXITCODE=2
+  elif [ "$PCOUNT" -gt 0 ]; then
+    EXITSTATUS="WARN: $PCOUNT Services disabled; "
+    EXITCODE=1
+  elif [ "$PCOUNT" -lt 1 ]; then
+    EXITSTATUS="OK: $PCOUNT Services disabled."
+    EXITCODE=0
+  fi
+fi
+
+
+echo -n "$EXITSTATUS"   # quoted so the trailing separator before the detail is kept
+echo $PDETAIL
+exit $EXITCODE
+
+
diff --git a/monitoring/nagios/check_slony b/monitoring/nagios/check_slony
new file mode 100644
index 0000000..efad379
--- /dev/null
+++ b/monitoring/nagios/check_slony
@@ -0,0 +1,28 @@
+#!/bin/sh
+# Copyright (C) 2008-2010 Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# Slony replication status nagios check
+# Written by: Equinox Software, April 19, 2010 - Lee Dickens
+. /etc/profile
+count=`PGUSER=postgres psql -U evergreen evergreen -c "select st_lag_num_events from _replication.sl_status;"|sed -n 3p|sed 's/^[ \t]*//'`
+[ "$count" -ge 0 ] 2>/dev/null || { echo "UNKNOWN: could not read st_lag_num_events (got '$count')"; exit 3; }
+if [ "$count" -gt 200 ]; then
+  echo "CRITICAL: Slony Replication Lag: st_lag_num_events = $count"
+  exit 2
+fi
+if [ "$count" -gt 150 ]; then
+  echo "WARNING: Slony Replication Lag: st_lag_num_events = $count"
+  exit 1
+fi
+echo "OK: Slony Replication In Sync: st_lag_num_events = $count"
+exit 0
-- 
2.11.0