examples/another-chance.sh - manifest_repos/watchdog - Git at Google

 #!/bin/sh
 # This is a "repair binary" for watchdog that allows the tests to fail N times
 # within a given period before a reboot is called.  Note that this "grace
 # period" should really be a functionality of watchdog itself, IMHO.
 #
 # Erik Rossen <rossen@prolibre.com>
 #
 # Usage: another-chance.sh [ERR]
 #   ERR  status to exit with once N failures have been counted.  Presumably
 #        watchdog passes the error code of the failed test here (confirm
 #        against watchdog(8)); it defaults to 1 so that an exhausted counter
 #        can never be reported as success.
 #
 # Exit status: 0 while the failure count is below N, ERR otherwise.

 # If one does not change the default watchdog loop time of 10 seconds, N=12
 # will allow two minutes of failures before a reboot is signaled.
 N=12

 # CMAXAGE is the age in seconds that the counter file may have before it is
 # considered too old and is wiped out.
 CMAXAGE=20

 # An "exit" with an empty operand returns the status of the preceding command
 # (the rm below, normally 0), so fall back to a non-zero value when no
 # argument is given.
 ERR=${1:-1}

 COUNTER=/var/run/watchdog.counter

 # Start from zero explicitly so a stray I inherited from the environment is
 # never mistaken for a stored failure count.
 I=0

 if test -f "$COUNTER"; then
 	# %Y is the file's modification time in seconds since the epoch (GNU stat).
 	COUNTERAGE=$(stat -c %Y "$COUNTER")
 	NOW=$(date +%s)
 	# If stat failed, COUNTERAGE is empty; count that as age 0, i.e. stale.
 	if test "$((${COUNTERAGE:-0} + CMAXAGE))" -lt "$NOW"; then
 		rm -f -- "$COUNTER"
 	else
 		I=$(cat "$COUNTER")
 	fi
 fi

 # A truncated or corrupted counter file must not abort the script with an
 # arithmetic error (the resulting non-zero exit would presumably be taken by
 # watchdog as a failed repair), so anything that is not a plain decimal
 # number restarts the count.
 case $I in
 	'' | *[!0-9]* | 0[0-9]*) I=0 ;;
 esac

 I=$((I + 1))

 logger -t "watchdog[$$]" "Failure $I of $N"
 logger -t "watchdog[$$]" "PROCESS LIST:"
 ps auxww | logger -t "watchdog[$$]"

 if test "$I" -ge "$N"; then
 	logger -t "watchdog[$$]" "Too many failures. Signalling reboot."
 	# Drop the counter so the next run starts a fresh count.
 	rm -f -- "$COUNTER"
 	exit "$ERR"
 fi

 # Persist the count for the next invocation; a failed write is logged but is
 # not itself treated as a reason to signal a reboot.
 if ! printf '%s\n' "$I" > "$COUNTER"; then
 	logger -t "watchdog[$$]" "Cannot write $COUNTER; failure count not saved"
 fi
 exit 0
	#!/bin/sh
	# This is a "repair binary" for watchdog that allows the tests to fail N times
	# within a given period before a reboot is called. Note that this "grace
	# period" should really be a functionality of watchdog itself, IMHO.
	#
	# Erik Rossen <rossen@prolibre.com>
	#
	# Usage: another-chance.sh [ERR]
	#   ERR  status to exit with once N failures have been counted.  Presumably
	#        watchdog passes the error code of the failed test here (confirm
	#        against watchdog(8)); it defaults to 1 so that an exhausted counter
	#        can never be reported as success.
	#
	# Exit status: 0 while the failure count is below N, ERR otherwise.

	# If one does not change the default watchdog loop time of 10 seconds, N=12
	# will allow two minutes of failures before a reboot is signaled.
	N=12

	# CMAXAGE is the age in seconds that the counter file may have before it is
	# considered too old and is wiped out.
	CMAXAGE=20

	# An "exit" with an empty operand returns the status of the preceding command
	# (the rm below, normally 0), so fall back to a non-zero value when no
	# argument is given.
	ERR=${1:-1}

	COUNTER=/var/run/watchdog.counter

	# Start from zero explicitly so a stray I inherited from the environment is
	# never mistaken for a stored failure count.
	I=0

	if test -f "$COUNTER"; then
		# %Y is the file's modification time in seconds since the epoch (GNU stat).
		COUNTERAGE=$(stat -c %Y "$COUNTER")
		NOW=$(date +%s)
		# If stat failed, COUNTERAGE is empty; count that as age 0, i.e. stale.
		if test "$((${COUNTERAGE:-0} + CMAXAGE))" -lt "$NOW"; then
			rm -f -- "$COUNTER"
		else
			I=$(cat "$COUNTER")
		fi
	fi

	# A truncated or corrupted counter file must not abort the script with an
	# arithmetic error (the resulting non-zero exit would presumably be taken by
	# watchdog as a failed repair), so anything that is not a plain decimal
	# number restarts the count.
	case $I in
		'' | *[!0-9]* | 0[0-9]*) I=0 ;;
	esac

	I=$((I + 1))

	logger -t "watchdog[$$]" "Failure $I of $N"
	logger -t "watchdog[$$]" "PROCESS LIST:"
	# A real pipe: the process list goes to syslog through logger's stdin.
	ps auxww | logger -t "watchdog[$$]"

	if test "$I" -ge "$N"; then
		logger -t "watchdog[$$]" "Too many failures. Signalling reboot."
		# Drop the counter so the next run starts a fresh count.
		rm -f -- "$COUNTER"
		exit "$ERR"
	fi

	# Persist the count for the next invocation; a failed write is logged but is
	# not itself treated as a reason to signal a reboot.
	if ! printf '%s\n' "$I" > "$COUNTER"; then
		logger -t "watchdog[$$]" "Cannot write $COUNTER; failure count not saved"
	fi
	exit 0