mirror of https://github.com/sean-m/sdf-mon.git
223 lines
4.9 KiB
Bash
Executable File
223 lines
4.9 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
## Checks health and heartbeat for other server
|
|
### Sean McArdle Nov 2013
|
|
|
|
|
|
# functions
|
|
|
|
function show
|
|
{
|
|
if $verbose ; then
|
|
echo $1
|
|
fi
|
|
}
|
|
|
|
function show_help
|
|
{
|
|
echo "Usage: check.sh [options]"
|
|
echo "Options:"
|
|
echo " -h : show this help"
|
|
echo ""
|
|
echo " -v : show verbose output"
|
|
echo ""
|
|
echo " -r : show report when completed"
|
|
echo ""
|
|
exit
|
|
}
|
|
|
|
|
|
# parse options, set global variables
|
|
verbose=false
|
|
report=false
|
|
|
|
notify_link=false
|
|
notify_cpu=false
|
|
notify_disk=false
|
|
notify_service=false
|
|
|
|
while getopts ":vhr" arg; do
|
|
case "${arg}" in
|
|
v)
|
|
verbose=true
|
|
;;
|
|
h)
|
|
show_help
|
|
;;
|
|
r)
|
|
report=true
|
|
;;
|
|
*)
|
|
;;
|
|
esac
|
|
done
|
|
|
|
# Load config
|
|
if [ -f check.conf ]; then
|
|
. check.conf
|
|
else
|
|
echo "config.sh file not found, exiting"
|
|
exit 1
|
|
fi
|
|
|
|
|
|
# setup email distribution group
|
|
if [ ! -f ~/.mailrc ]; then
|
|
echo "alias alertees $email_recipients" > ~/.mailrc
|
|
else
|
|
grep -qi "alias alertees" ~/.mailrc
|
|
if [ $? -eq 0 ]; then
|
|
sed -i "/alias alertees/c\alias alertees $email_recipients" ~/.mailrc
|
|
else
|
|
echo "alias alertees $email_recipients" > ~/.mailrc
|
|
fi
|
|
fi
|
|
|
|
|
|
# test network link
|
|
ping -c 2 $target > /dev/null
|
|
|
|
if [ $? -eq 0 ]; then
|
|
# target is alive, check cpu
|
|
show "host $target is up"
|
|
cpu=`ssh $ssh_user@$target sleep 2s && top -d0.5 -bn 4 | grep Cpu\(s\) | awk '{print $5}' | cut -d % -f 1 | tail -n +2 | awk '{sum+=100-$1}END{print sum/NR}'`
|
|
if [[ $(echo "if (${cpu} > ${cpu_thresh}) 1 else 0" | bc) -eq 1 ]]; then
|
|
show "cpu usage high: $cpu"
|
|
notify_cpu=true
|
|
else
|
|
show "cpu usage low: $cpu"
|
|
fi
|
|
|
|
|
|
# check % used disk space per partition
|
|
disk=`ssh $ssh_user@$target df -h | tail -n +2 | awk '{print $5 ":" $6}'`
|
|
for partition in ${disk}; do
|
|
percent=`echo $partition | cut -d % -f 1`
|
|
|
|
if [[ $(echo "if (${percent} > ${disk_thresh}) 1 else 0" | bc) -eq 1 ]]; then
|
|
show "disk usage above $disk_thresh%: $partition"
|
|
notify_disk=true
|
|
else
|
|
show "disk is fine: $partition"
|
|
fi
|
|
done
|
|
|
|
|
|
# check memory
|
|
mem_reading=`ssh $ssh_user@$target free -m | tail -n +2`
|
|
mem_total=`echo $mem_reading | awk '{print $2}'`
|
|
mem_used=`echo $mem_reading | awk '{print $10}'`
|
|
mem_percent_used=`echo "$mem_used $mem_total"| awk '{print $1/$2*100}' | cut -d . -f1`
|
|
|
|
show "mem_total: $mem_total"
|
|
show "mem_used: $mem_used"
|
|
show "mem_percent_used: $mem_percent_used"
|
|
|
|
if [[ $(echo "if (${mem_percent_used} > ${mem_thresh}) 1 else 0" | bc) -eq 1 ]]; then
|
|
show "mem usage high: $mem_percent_used"
|
|
else
|
|
show "mem usage low: $mem_percent_used"
|
|
fi
|
|
|
|
|
|
# check for required services
|
|
processes=`ssh $ssh_user@$target ps -A | tail -n +2 | awk '{print $4}'`
|
|
services=""
|
|
|
|
for proc in ${service_list}; do
|
|
show $proc
|
|
echo $processes | grep -q -e $proc
|
|
val=$?
|
|
show $val
|
|
if [ $val -ne 0 ]; then
|
|
notify_service=true
|
|
services="$services $proc:off"
|
|
else
|
|
services="$services $proc:on"
|
|
fi
|
|
done
|
|
|
|
else
|
|
show "cannot reach host $target!"
|
|
notify_link=true
|
|
fi
|
|
|
|
|
|
# send notifications if needed
|
|
dir=/tmp/.notifications
|
|
filename=$(date +%Y%m%d%H%M%S)-alert.log
|
|
send_alert=false
|
|
|
|
if [ ! -d /tmp/.notifications ]; then
|
|
mkdir /tmp/.notifications
|
|
fi
|
|
|
|
alert_sub="alert: "
|
|
|
|
|
|
if $notify_link ; then
|
|
alert_sub="$alert_sub link"
|
|
echo "Cannot resolve $target" > $dir/$filename
|
|
send_alert=true
|
|
else
|
|
if $notify_cpu ; then
|
|
alert_sub="$alert_sub cpu"
|
|
send_alert=true
|
|
fi
|
|
|
|
if $notify_cpu ; then
|
|
alert_sub="$alert_sub disk"
|
|
send_alert=true
|
|
fi
|
|
|
|
if $notify_service ; then
|
|
alert_sub="$alert_sub service"
|
|
send_alert=true
|
|
fi
|
|
|
|
# write alert message
|
|
echo "-- CPU Usage --" > $dir/$filename
|
|
echo $cpu >> $dir/$filename
|
|
echo "" >> $dir/$filename
|
|
echo "-- Disk Usage --" >> $dir/$filename
|
|
echo "$disk" >> $dir/$filename
|
|
echo "" >> $dir/$filename
|
|
echo "-- Mem Usage --" >> $dir/$filename
|
|
echo "total: $mem_total used: $mem_used percentage: $mem_percent_used" >> $dir/$filename
|
|
echo "" >> $dir/$filename
|
|
echo "-- Process Check" >> $dir/$filename
|
|
echo "$services" >> $dir/$filename
|
|
echo "" >> $dir/$filename
|
|
fi
|
|
|
|
if $send_alert ; then
|
|
if [ -s $dir/$filename ]; then
|
|
mail -S smtp=$smtp_server -s "${alert_sub} on ${target}" alertees < $dir/$filename
|
|
show "send $alert_sub email to $email_recipients"
|
|
else
|
|
for addr in ${email_recipient}; do
|
|
mail -S smtp=$smtp_server -s "${alert_sub} on ${target}" alertees < $dir/$filename
|
|
show "send link error to $email_recipients"
|
|
done
|
|
fi
|
|
fi
|
|
|
|
|
|
# if -r passed, write sensor report
|
|
if $report ; then
|
|
echo "-- CPU Usage --"
|
|
echo "${cpu}"
|
|
echo ""
|
|
echo "-- Disk Usage --"
|
|
echo "${disk}"
|
|
echo ""
|
|
echo "-- Mem Usage --"
|
|
echo "total: $mem_total used: $mem_used percentage: $mem_percent_used"
|
|
echo ""
|
|
echo "-- Process Check"
|
|
echo "$services"
|
|
echo ""
|
|
fi
|
|
|
|
|