blob: a5fc6d5136607aebdc1e8c7bc8db08bcb4252ace [file] [log] [blame]
#!/bin/sh
# This is a "repair binary" for watchdog that allows the tests to fail N times
# within a given period before a reboot is called. Note that this "grace
# period" should really be a functionality of watchdog itself, IMHO.
#
# Erik Rossen <rossen@prolibre.com>
# If one does not change the default watchdog loop time of 10 seconds, N=12
# will allow two minutes of failures before a reboot is signaled.
N=12
# CMAXAGE is the age in seconds that the counter file may have before it is
# considered too old and is wiped out.
CMAXAGE=20
# ERR is the first argument given to this script; it is handed back as the
# exit status once N failures have been counted. Default to 1 so that a
# missing argument can never turn the final "reboot" exit into a plain
# `exit`, which would return the (successful) status of the preceding rm.
ERR=${1:-1}
# COUNTER holds the number of consecutive failures seen so far.
COUNTER=/var/run/watchdog.counter
# Syslog tag shared by every message emitted below.
TAG="watchdog[$$]"

I=0
if test -f "$COUNTER"; then
  COUNTERAGE=$(stat -c %Y "$COUNTER")
  NOW=$(date +%s)
  # A counter whose mtime is more than CMAXAGE seconds old belongs to an
  # earlier run of failures: drop it and start counting from scratch.
  # If stat produced nothing, the age is treated as 0, i.e. stale.
  if test "$((${COUNTERAGE:-0} + $CMAXAGE))" -lt "$NOW"; then
    rm -f "$COUNTER"
  else
    I=$(cat "$COUNTER")
  fi
fi

# Treat an empty or non-numeric counter as 0 instead of letting the
# arithmetic expansion below choke on it.
case "$I" in
  '' | *[!0-9]*) I=0 ;;
esac
I=$(($I + 1))

logger -t "$TAG" "Failure $I of $N"
logger -t "$TAG" "PROCESS LIST:"
ps auxww | logger -t "$TAG"

if test "$I" -ge "$N"; then
  logger -t "$TAG" "Too many failures. Signalling reboot."
  # -f: the counter may legitimately be absent here (e.g. N=1, or it was
  # just removed as stale); that must not produce an error.
  rm -f "$COUNTER"
  exit "$ERR"
fi

# Remember the new count for the next invocation. A failed write is logged
# but does not change the exit status.
printf '%s\n' "$I" > "$COUNTER" ||
  logger -t "$TAG" "Cannot write counter file $COUNTER"
exit 0