#! /bin/ksh
#
# Hourlystuff - local commands to be run once an hour (from cron).
#
# Version: 1995/05/03.
#
# Usage: Hourlystuff
#
# Problems found by this script should be mailed to a system
# administrator.
#
# Recipient of all problem reports, and the name of this node as used
# in the subjects and bodies of those reports.
admin=system
host=$(hostname)
#
# Registry master node name; compared with $host before the registry
# sync near the end of the script.
#
rgymaster=aurum
#
# Default creation mask: no write permission for group and other.
#
umask 022
#
# Get a process listing.
# ----------------------
#
# Snapshot 'ps ax' into a per-run temporary file; the checks below all
# grep this one snapshot. If /etc/init does not appear in it the listing
# is treated as unusable: mail the listing to $admin and stop.
#
pslisting=/tmp/pslisting.$$
ps ax >$pslisting 2>&1
grep -s /etc/init $pslisting >/dev/null 2>&1
if [ $? != 0 ]; then
   mail -s "Trouble: 'ps ax' failed on $host" $admin <<EOF1
$0 'ps ax' failed on $host.

ps listing:
-----------

`cat $pslisting`
EOF1
#
# The normal clean-up at the end of the script is never reached from
# here, so remove the snapshot now instead of leaving a stale
# /tmp/pslisting.<pid> file behind on every failed run.
#
   /bin/rm -f $pslisting
   exit 0
fi
#
# Flag file used to notice a run that happens during a boot/reboot.
# It is tested by the Xapollo check and touched at the very end of
# this script.
#
bootdone=/tmp/bootdone
#
# Check that required daemons are still running.
# ----------------------------------------------
#
# No /usr/lib/lpd in the process snapshot: launch one in the background.
#
if grep -s /usr/lib/lpd $pslisting >/dev/null 2>&1; then
   :
else
   /usr/lib/lpd &
fi
#
# Check if /etc/Xapollo is running - if not, start one.
# Also kill /sys/dm if Xapollo has died.
# Be careful that the system is not in the process of booting,
# so wait one pass of Hourlystuff before starting another Xapollo.
#
# The restart only happens when both $bootdone and /tmp/Hourly.Xapollo
# exist; the latter is created by the /etc/daemons loop further down
# the first time Xapollo is found missing. Otherwise a report is mailed
# and nothing is changed.
#
grep -s /etc/Xapollo $pslisting >/dev/null 2>&1
if [ $? != 0 -a -f /etc/daemons/X ]; then
   if [ -f $bootdone -a -f /tmp/Hourly.Xapollo ]; then
      (nice --5 /etc/Xapollo -D1 s+r+ ) &
#
# Split the 'dm' line of the ps snapshot into positional parameters.
# The '--' matters: with no matching line a bare 'set' would print every
# shell variable to stdout and leave the previous parameters in place,
# so the test on $# and the "$*" in the mail below would see stale data.
#
      set -- `grep 'dm$' $pslisting`
#
# Only kill when exactly one line of six words was found; $1 is then
# taken to be the process id.
#
      if [ "$#" = "6" ]; then
         kill $1
      fi
      /bin/rm -f /tmp/Hourly.Xapollo
      mail -s "Trouble: Xapollo died on $host" $admin <<EOF2
$0 restarted Xapollo on $host, and killed
DM (process status "$*").

ps listing:
-----------

`cat $pslisting`
EOF2
   else
      mail -s "Trouble: Xapollo missing on $host" $admin <<EOF3
$0 can't find Xapollo on $host, and the file
$bootdone does not exist (indicating this is the first Hourlystuff run
after a reboot), or /tmp/Hourly.Xapollo does not exist. No action taken.

/tmp contents:
--------------

`cd /tmp; ls -lg`

ps listing:
-----------

`cat $pslisting`
EOF3
   fi
fi
#
# Check that all the /etc/daemons are still running, with some
# exceptions.
#
# Each entry in /etc/daemons names a daemon to look for in the ps
# snapshot. The first time a daemon is missing a mail is sent and
# /tmp/Hourly.<name> is created so that later runs stay quiet; the flag
# is removed again as soon as the daemon shows up in the snapshot.
#
for f in /etc/daemons/*
do
#
# An unmatched glob leaves the literal pattern in $f. Without this guard
# the name "*" would be expanded again by the inner 'for' and every file
# in the current directory would be treated as a daemon name.
#
   if [ "$f" = "/etc/daemons/*" ]; then
      continue
   fi
   name=`basename $f`
   case "$name" in
#
# Entries that are deliberately not checked.
#
      README|nmconfig|rtsvc|uucico)
         continue
         ;;
#
# Entries whose process name differs from the file name.
#
      X)
         namelist="Xapollo"
         ;;
      xpager2|xpager4)
         namelist="xpager"
         ;;
      *)
         namelist=$name
         ;;
   esac
   for nn in $namelist
   do
      grep -s "$nn" $pslisting >/dev/null 2>&1
      if [ "$?" != "0" ]; then
         if [ ! -f /tmp/Hourly.$nn ]; then
            touch /tmp/Hourly.$nn
            mail -s "Trouble: $nn died on $host" $admin <<EOF4
$0 could not find the $nn daemon on $host.

ps listing:
-----------

`cat $pslisting`
EOF4
         fi
      else
         /bin/rm -f /tmp/Hourly.$nn
      fi
   done
done
#
# Check for NFS servers that have crashed or come back online.
# ------------------------------------------------------------
#
# Check for NFS servers that appear to be down. This may cause
# problems with scripts that scan the entire disk.
# Just move the link to a temporary directory (/tmp/.NFSdown) while
# the remote NFS server is unavailable.
#
# Pick the mount table. With neither file present fall back to /dev/null
# so that the later 'grep ... $fstab' sees an empty table instead of
# running with no file operand and reading standard input.
#
if [ -f /etc/fstab ]; then
   fstab=/etc/fstab
elif [ -f /etc/checklist ]; then
   fstab=/etc/checklist
else
   fstab=/dev/null
fi
#
# Create the holding directory on first use.
# NOTE(review): 'chacl -B' is a platform ACL command whose exact effect
# is not visible from this script - confirm before porting.
#
NFSdowndir=/tmp/.NFSdown
if [ ! -d $NFSdowndir ]; then
   mkdir $NFSdowndir
   chown root $NFSdowndir
   chgrp staff $NFSdowndir
   chmod 755 $NFSdowndir
   chacl -B $NFSdowndir
fi
#
# The two loops that follow use names relative to $NFSdowndir.
# NOTE(review): this cd is not checked; if it fails they run in the
# previous working directory.
#
cd $NFSdowndir
#
# One 'ruptime -a' snapshot is used for both the "up" and "down" tests.
#
ruptimetmp=/tmp/ruptimetmp.$$
ruptime -a >$ruptimetmp
#
# Walk over everything parked in $NFSdowndir (the current directory):
#   - anything that is not a symbolic link is reported and left alone;
#   - a name that is also readable in / is reported and left alone;
#   - otherwise, if ruptime shows the host as up, the link is moved
#     back to / and a notice is mailed.
# Don't use 'for x in *' because that will hang.
#
for nfshost in `ls`
do
   if [ ! -h $nfshost ]; then
      mail -s "Bad file in $NFSdowndir on $host" $admin <<EOF5
Hourlystuff on $host has found a problem in $NFSdowndir:
   $nfshost is not a link.

$NFSdowndir directory contents:

`ll`
EOF5
   elif [ -r /$nfshost ]; then
      mail -s "Duplicate link in $NFSdowndir on $host" $admin <<EOF6
Hourlystuff on $host has found a problem in $NFSdowndir:
   $nfshost also exists in /.

$NFSdowndir directory contents:

`ll`

/$nfshost:

`ll -d /$nfshost`
EOF6
   elif grep -s "^$nfshost *up " $ruptimetmp >/dev/null; then
      mv $nfshost /
      mail -s "NFS server $nfshost now online for $host" $admin <<EOF7
Hourlystuff on $host has found that NFS server $nfshost is
back online - link /$nfshost restored.
EOF7
   fi
done
#
# For every host that ruptime reports as down and that appears as a
# "host:/" entry in the mount table: park its link from / in
# $NFSdowndir (the current directory), or complain when neither /
# nor $NFSdowndir holds a link of that name.
#
for nfshost in `awk '/ *down /{print $1}' $ruptimetmp`
do
   if grep -s "^$nfshost:/" $fstab >/dev/null; then
      if [ -h /$nfshost ]; then
         mv /$nfshost .
         mail -s "NFS server $nfshost down for $host" $admin <<EOF8
Hourlystuff on $host has found that NFS server $nfshost appears
to be down - link /$nfshost moved to $NFSdowndir.
EOF8
      elif [ ! -h $nfshost ]; then
         mail -s "Lost NFS link on $host" $admin <<EOF9
Hourlystuff on $host has found a problem in $NFSdowndir:
   /$nfshost does not exist, but $NFSdowndir/$nfshost is not a link.

$NFSdowndir directory contents:

`ll`
EOF9
      fi
   fi
done
#
# Leave the holding directory and drop the ruptime snapshot.
#
cd /
/bin/rm -f $ruptimetmp
#
# Gaussian 88 checks.
# -------------------
#
# Remove Gaussian 88 scratch files whose process is gone, but only on
# the second consecutive sighting: a file seen for the first time just
# gets a /tmp/tmpg88.<pid> flag. This protects jobs started between the
# 'ps' snapshot and this check.
#
g88scratch=/g88/scratch
tmpg88=/tmp/tmpg88
g88pidlist=
if [ -d $g88scratch ]; then
   cd $g88scratch
   oldfilelist=/tmp/oldfilelist.$$
   for f in *
   do
#
# Empty scratch directory: no flags are needed any more.
#
      if [ "$f" = "*" ]; then
         /bin/rm -f $tmpg88.*
         continue
      fi
#
# The pid is what remains of the file name after the "g88-" prefix,
# four more characters and any leading zeros.
#
      g88pid=`echo $f | sed -e 's/^g88-....0*//'`
      if grep -s "^ *$g88pid " $pslisting >/dev/null 2>&1; then
         continue
      fi
#
# Left-over file. Flag already present: seen last time too, so delete
# it. No flag yet: remember the pid and wait for the next pass.
#
      if [ -f $tmpg88.$g88pid ]; then
         ls -l $f >>$oldfilelist
         /bin/rm -f $f
      else
         g88pidlist="$g88pidlist $g88pid"
      fi
   done
#
# Mailing the list of deleted files is currently disabled:
#  if [ -s $oldfilelist ]; then
#     mail -s "G88 scratch files deleted on $host" $admin <$oldfilelist
#  fi
   /bin/rm -f $oldfilelist
   cd /
#
# Replace the old set of flags by one flag per pid collected above
# (the loop body does not run when the list is empty).
#
   /bin/rm -f $tmpg88.*
   for g88pid in $g88pidlist
   do
      touch $tmpg88.$g88pid
   done
fi
#
# News checks - first on LOCKinput for newsrun.
# ---------------------------------------------
#
# Check that there aren't any news LOCKinput files lying around that
# don't have an active process associated with them.
# If the LOCKinput file exists, first create a copy of it, then wait
# for the next "Hourlystuff" check before doing anything. This avoids
# removing the news lock files when 'newsrun' got started between the
# time the 'ps' command was run and the following check is made.
#
# Files involved:
#   $lockfile1    the LOCKinput file; its first line is read as a pid
#   $tmplockfile  copy of LOCKinput made by an earlier run
#   $tblisting    output of 'tb <pid>', included in the mails below
#                 (NOTE(review): 'tb' is presumably a traceback tool -
#                 confirm)
#
newsdir=/usr/spool/news
newslib=/usr/local/lib/news
lockfile1=$newslib/LOCKinput
tmplockfile=/tmp/tmpLOCKinput
tblisting=/tmp/tblisting.$$
#
# Both LOCKinput and the temporary copy exist - time to check things out.
# The temporary copy is deleted at the end of this branch unless
# $deletetmplock is switched to false below.
#
if [ -f $lockfile1 -a -f $tmplockfile ]; then
   deletetmplock=true
   cd $newslib
# The pid is the first line of LOCKinput with leading zeros removed.
   newspid=`head -1 $lockfile1 | sed -e 's/^0*//'`
   grep -s "^ *$newspid " $pslisting >/dev/null 2>&1
#
# If the PID is not in the ps listing, try to clean up since
# news processing has failed somehow.
# (The test below reads the exit status of the grep above; only
# comments may sit between the two.)
#
   if [ "$?" != 0 ]; then
      lockfile2=$newslib/L.$newspid
      lockfile3=$newsdir/in.coming/nruntmp.$newspid
      tb $newspid >$tblisting 2>&1
#
# If everything is consistent (all lockfiles exist, the real lock file
# and the temporary lock file contain the same PID, but the PID was not
# found in the ps listing), just clean up the mess.
# NOTE(review): the comparisons use the raw first line, while $newspid
# has had leading zeros stripped - a zero-padded pid never matches.
#
      if [ -f $lockfile2 -a -f $lockfile3 \
       -a "`head -1 $lockfile1`" = "$newspid" \
       -a "`head -1 $tmplockfile`" = "$newspid" ]; then
         /bin/rm -f $lockfile1 $lockfile2 $lockfile3
         mail -s "News lock files deleted on $host" $admin <<EOF10
The following news lock files were deleted on $host:

$lockfile1
$lockfile2
$lockfile3

$tmplockfile contains "`head -1 $tmplockfile`".

tb listing:
-----------

`cat $tblisting`

ps listing:
-----------

`cat $pslisting`
EOF10
#
# If the temporary lock file contents don't match the real lock
# file contents, just copy the lock file over and wait for the next
# "Hourlystuff" check. The fresh copy must survive this run, hence
# deletetmplock=false.
#
      elif [ "`head -1 $lockfile1`" = "$newspid" \
       -a "`head -1 $tmplockfile`" != "$newspid" ]; then
         cp $lockfile1 $tmplockfile
	 deletetmplock=false
#
# Things are not consistent - try to log the mess but touch nothing.
#
      else
         mail -s "Possible news lock file problem on $host" $admin <<EOF11
The following news lock files should be investigated/deleted on $host:

$lockfile1
$lockfile2
$lockfile3

but either one or more files does not exist, or the contents of
$lockfile1 (which is "`head -1 $lockfile1`") does not match
the news process id "$newspid". Also, the temporary lock file
$tmplockfile contains "`head -1 $tmplockfile`".
(Note: there is a small window where news can be trying to create the
lock files while this check is being done, but every thing is OK.)

ls listings:
------------

`ls -lg $lockfile1`
`ls -lg $lockfile2`
`ls -lg $lockfile3`

tb listing:
-----------

`cat $tblisting`

ps listing:
-----------

`cat $pslisting`
EOF11
      fi
   fi
# Drop the temporary copy unless it was refreshed above.
   if [ "$deletetmplock" = "true" ]; then
      /bin/rm -f $tmplockfile
   fi
   cd /
#
# Only the LOCKinput file found: copy it and wait for next check.
#
elif [ -f $lockfile1 ]; then
   cp $lockfile1 $tmplockfile
#
# No LOCKinput file found: delete any temporary copy.
#
else
   /bin/rm -f $tmplockfile
fi
#
# News checks - second on LOCK for general news stuff.
# ----------------------------------------------------
#
# Check that there isn't a news LOCK file lying around that
# doesn't have an active process associated with it. This is complicated
# since the 'locknews' program doesn't put a process id into the file,
# just links LOCK to the 'sys' file; so if LOCK exists, and contains
# more than 1 word, just mail to the system manager and give up.
# If the LOCK file exists, first create a copy of it, then wait
# for the next "Hourlystuff" check before doing anything. This avoids
# removing the news LOCK file when a process got started between the
# time the 'ps' command was run and the following check is made.
#
# $lockfile1 and $tmplockfile are re-pointed here at the LOCK file and
# its temporary copy; $newslib, $tblisting and $pslisting are the values
# set earlier in the script.
#
lockfile1=$newslib/LOCK
tmplockfile=/tmp/tmpLOCK
#
# If the news system LOCK file contains anything other than a single
# word (taken to be a process id), give up after alerting the sysadmin.
# 'wc -w file' prints "<count> <file>", so exactly two words with a
# count of 1 means the LOCK file holds one word.
#
newscheck=no
if [ -f $lockfile1 ]; then
   set `wc -w $lockfile1`
   if [ "$#" = "2" -a "$1" = "1" ]; then
      newscheck=yes
   fi
fi
# The alert is only sent once a temporary copy exists, i.e. not on the
# first run that sees the LOCK file.
if [ "$newscheck" = "no" -a -f $lockfile1 -a -f $tmplockfile ]; then
   mail -s "Strange news LOCK file on $host" $admin <<EOF12
The following strange news LOCK file was found on $host:

$lockfile1 contains:
"`head -1 $lockfile1`"

ps listing:
-----------

`cat $pslisting`
EOF12
fi
#
# Both LOCK and the temporary copy exist - time to check things out.
# The temporary copy is deleted at the end of this branch unless
# $deletetmplock is switched to false below.
#
if [ "$newscheck" = "yes" -a -f $lockfile1 -a -f $tmplockfile ]; then
   deletetmplock=true
   cd $newslib
# The pid is the first line of LOCK with leading zeros removed.
   newspid=`head -1 $lockfile1 | sed -e 's/^0*//'`
   grep -s "^ *$newspid " $pslisting >/dev/null 2>&1
#
# Try to clean up - news processing has failed somehow.
# (The test below reads the exit status of the grep above; only
# comments may sit between the two.)
#
   if [ "$?" != 0 ]; then
      tb $newspid >$tblisting 2>&1
#
# If everything is consistent, just clean up the mess.
#
      if [ "`head -1 $lockfile1`" = "$newspid" \
       -a "`head -1 $tmplockfile`" = "$newspid" ]; then
         /bin/rm -f $lockfile1
         mail -s "News LOCK file deleted on $host" $admin <<EOF20
The following news LOCK file was deleted on $host:

$lockfile1

$tmplockfile contains "`head -1 $tmplockfile`".

tb listing:
-----------

`cat $tblisting`

ps listing:
-----------

`cat $pslisting`
EOF20
#
# If everything is consistent except that the temporary lock file
# contents don't match the real lock file contents, just copy the lock
# file over and wait for the next "Hourlystuff" check. The fresh copy
# must survive this run, hence deletetmplock=false.
#
      elif [ "`head -1 $lockfile1`" = "$newspid" ]; then
         cp $lockfile1 $tmplockfile
	 deletetmplock=false
#
# Things are not consistent - try to log the mess but touch nothing.
#
      else
         mail -s "News LOCK file problem on $host" $admin <<EOF21
The following news lock files should be deleted on $host:

$lockfile1

but the contents of
$lockfile1 (which is "`head -1 $lockfile1`") does not match
the news process id "$newspid". Also, the temporary lock file
$tmplockfile contains "`head -1 $tmplockfile`".

ls listings:
------------

`ls -lg $lockfile1`

tb listing:
-----------

`cat $tblisting`

ps listing:
-----------

`cat $pslisting`
EOF21
      fi
   fi
# Drop the temporary copy unless it was refreshed above.
   if [ "$deletetmplock" = "true" ]; then
      /bin/rm -f $tmplockfile
   fi
   cd /
#
# Only the LOCK file found: copy it and wait for next check.
# (This branch is also reached when the LOCK file was reported as
# strange above.)
#
elif [ -f $lockfile1 ]; then
   cp $lockfile1 $tmplockfile
#
# No LOCK file found: delete any temporary copy.
#
else
   /bin/rm -f $tmplockfile
fi
#
# Batchd checks - look for batch jobs that have disappeared.
# ----------------------------------------------------------
#
# Check for jobs being run by 'batchd' that don't have an active
# process associated with them. If such jobs exist, first create
# a list of them, then wait for the next "Hourlystuff" check before
# doing anything. This avoids removing the job when a process got
# started between the time the 'ps' command was run and the following
# check is made.
#
batchdir=/usr/spool/batch
tmpbatchfile=/tmp/tmpbatch
oldtmpbatchfile=$tmpbatchfile.old
#
# Loop over all the running batch jobs, which are indicated by the
# $batchdir/*/ef* files (which contain the pid of the process),
# making sure each one is still running. If not, look in the list of
# disappeared processes from last time: if it was there, delete the
# input and output files; if not, just add it to the list
# of disappeared processes (it will be deleted next time around
# if it is still hanging around). Batchd will clean up once it discovers
# that the files associated with the job have been deleted.
#
if [ -d $batchdir ]; then
#
# Last run's list becomes the "old" list; a new list is built below.
#
   /bin/rm -f $oldtmpbatchfile
   if [ -f $tmpbatchfile ]; then
      mv $tmpbatchfile $oldtmpbatchfile
   fi
   cd $batchdir
   for queue in *
   do
      if [ "$queue" = "*" ]; then
         continue
      fi
#
# Only directories are queues. Skipping anything else (or a directory
# that cannot be entered) keeps the ef* scan from running in whatever
# directory the previous iteration left behind.
#
      if [ ! -d $batchdir/$queue ]; then
         continue
      fi
      cd $batchdir/$queue || continue
      for file in ef*
      do
         if [ "$file" = "ef*" ]; then
            continue
         fi
#
# The ef file holds "pgrp <id>"; strip the keyword to get the id.
#
         batchpid=`sed -e 's/^pgrp //' $file`
         grep -s "^ *$batchpid " $pslisting >/dev/null 2>&1
#
# If this batch pid is not in the current ps listing, keep checking.
#
         if [ "$?" != 0 ]; then
            grep -s "^pgrp $batchpid$" $oldtmpbatchfile >/dev/null 2>&1
#
# If the batch pid was in the old list, delete the job files;
# batchd will clean up and re-sync itself sooner or later.
# The mail is sent before the delete so that the 'ls -l' lines in it
# still show the files.
#
            if [ "$?" = "0" ]; then
               jobid=`echo $file | sed -e 's/^ef//'`
               mail -s "Batch files deleted on $host" $admin <<EOF30
The following batch files in queue $queue were deleted on $host:

`ls -l ef$jobid`
`ls -l of$jobid`
`ls -l CFDIR/cf$jobid`

The batch job process group is $batchpid.

ps listing:
-----------

`cat $pslisting`
EOF30
               /bin/rm -f ef$jobid of$jobid CFDIR/cf$jobid
#
# If the batch pid was not in the old list, add it to the current
# list of lost jobs and wait for Hourlystuff to run again before
# doing anything.
#
            else
               cat $file >>$tmpbatchfile
            fi
         fi
      done
   done
   cd /
   /bin/rm -f $oldtmpbatchfile
fi
#
# Check for processes stuck in the CPU.
# -------------------------------------
#
# Check for processes running at nice 0 (or negative nice values)
# that appear stuck in the cpu. If such a job exists, create
# a record of it, then wait for the next "Hourlystuff" check before
# doing anything. This avoids removing a process that is temporarily
# hogging the cpu.
# This is somewhat complicated by the version of 'top' for Domain,
# which has to run at least 2 passes to reflect the current situation
# (the first pass uses the total cpu time since process start).
#
# Variables:
#   stuckprocfile  record of the suspect seen by the previous run
#   topout         per-run capture of the 'top' output
#   ntopargs       minimum number of words expected on a 'top' line
#   maxtoploops    limit on the number of 5 second waits for 'top'
#   savetopdata    1 = write a new record at the end, 0 = remove it
#
stuckprocfile=/tmp/tmpstuckproc
topout=/tmp/top.out.$$
ntopargs=12
ntoploops=0
maxtoploops=5
savetopdata=1
#
# Integer part of the first load figure printed by 'uptime'.
#
load=`uptime | sed -e 's/^.*: *//' -e 's/\..*$//'`
if [ "$load" -eq "0" ]; then
   savetopdata=0
else
#
# Run 'top' in the background and poll its output until two "Pid"
# header lines (two passes) have appeared or the loop limit is hit.
#
   top -i 3 >$topout &
   npasses=0
   while [ "$npasses" -lt "2" -a "$ntoploops" -lt "$maxtoploops" ]; do
      sleep 5
      npasses=`grep '^  Pid ' $topout | wc -l`
      let ntoploops="$ntoploops+1"
   done
   kill $!
#
# Delete everything from line 1 through each "Pid" header, so that the
# file starts with the first process line of the last pass.
#
   ex $topout <<'EXINPUT' >/dev/null
%g/^  Pid /1,.d
wq
EXINPUT
#
# Split that first line into positional parameters. The '--' matters:
# with an empty file a bare 'set' would print every shell variable and
# leave stale parameters from an earlier 'set' in $# and $*.
#
   set -- `head -1 $topout`
   if [ "$#" -ge "$ntopargs" -a "$ntoploops" -lt "$maxtoploops" ]; then
      newpid=$1
      newusername=$3
      newniceval=$6
      newtopline="$*"
# Drop the first 11 words; what is left is the command part of the line.
      shift 11
# If top process has NICE>0, all is OK.
      if [ "$newniceval" -gt "0" ]; then
         savetopdata=0
# If top process is a "Null Process", all is OK.
      elif [ "$#" -eq "4" -a "$1 $2 $3 $4" = "[ Null Process ]" ]; then
         savetopdata=0
# Top process has NICE<=0, this may indicate a stuck process.
      else
# There was a stuck process last time - check if it was the same one.
         if [ -f $stuckprocfile ]; then
#
# Read the two recorded values instead of sourcing the file: it lives
# under a fixed name in /tmp, and sourcing it would execute whatever
# it happens to contain.
#
            oldpid=`sed -n 's/^oldpid=//p' $stuckprocfile`
            oldusername=`sed -n 's/^oldusername=//p' $stuckprocfile`
            if [ "$oldpid" = "$newpid" -a \
             "$oldusername" = "$newusername" ]; then
               savetopdata=0
               renice 19 $newpid
               mail -s "Possible stuck process on $host" $admin <<EOF40
Hourlystuff on $host has found a possible stuck process:

$newtopline

The process has been reniced to 19.
EOF40
# Tell the owner too, unless the owner is root or the administrator.
               if [ "$newusername" != "root" -a \
                "$newusername" != "$admin" ]; then
                  mail -s "Possible stuck process on $host" $newusername <<EOF41
An automatic system check on $host has found a process that has been
running at priority (nice value) $newniceval for quite a while:

$newtopline

The process has been reniced to 19. Jobs longer than 5 minutes
should be submitted to the 'batch' queue(s).
EOF41
               fi
            fi
         fi
      fi
   else
      mail -s "'Top' failure in Hourlystuff on $host" $admin <<EOF42
'Top' on $host returned $# arguments ($ntopargs expected):

`echo $*`

and number of 'top' loops was $ntoploops ($maxtoploops is the limit).
Stuck process check abandoned.
EOF42
      savetopdata=0
   fi
fi
# Save data for the current "top" process, if necessary.
# The name=value lines are what the next run reads back.
if [ "$savetopdata" = "0" ]; then
   /bin/rm -f $stuckprocfile
else
   echo "# Previous 'stuck' process info:" >$stuckprocfile
   echo "# $newtopline" >>$stuckprocfile
   echo "oldpid=$newpid" >>$stuckprocfile
   echo "oldusername=$newusername" >>$stuckprocfile
   echo "oldniceval=$newniceval" >>$stuckprocfile
fi
/bin/rm -f $topout
#
# Nodes that have no /etc/daemons/sendmail entry get a background
# mail queue run from here instead.
#
if test -f /etc/daemons/sendmail; then
   :
else
   /usr/lib/sendmail -q &
fi
#
# Sync the master registry with the NIS passwd map,
# but only on the registry master node.
# Note that information on missing/extra accounts is mailed
# directly from the 'ypxfr.passwd' script, so the output captured
# here (which is currently not mailed) should only contain modified
# accounts.
#
if [ "$rgymaster" = "$host" ]; then
   yplisting=/tmp/ypxfr.out.$$
   /bin/rm -f $yplisting
   if [ -f /usr/local/bin/ypxfr.passwd ]; then
#
# Redirections are processed left to right, so stdout has to be pointed
# at the listing first for '2>&1' to send stderr there as well.
#
      /usr/local/bin/ypxfr.passwd >>$yplisting 2>&1
   else
      mail -s "Can't sync registry with NIS passwd map on $host" $admin <<EOF98
$0 can't find /usr/local/bin/ypxfr.passwd
to sync the registry with NIS passwd map.
EOF98
   fi
#
# Count the lines that are not '#### ypxfr' banners. The listing does
# not exist when ypxfr.passwd was not run; the count is then 0 without
# grep complaining about a missing file.
#
   if [ -f $yplisting ]; then
      let i="`grep -v '^#### ypxfr' $yplisting | wc -l`"
   else
      i=0
   fi
#
# Mailing the listing is currently disabled:
#   if [ $i -ne 0 ]; then
#      mail -s "Registry sync with NIS passwd/group maps on $host" $admin <<EOF99
#$0 registry sync gave:
#
#`cat $yplisting`
#EOF99
#   fi
fi
#
# Clean up.
#
# Remove the per-run temporary listings. $tblisting and $yplisting name
# files that are only created on some paths; 'rm -f' ignores the rest.
#
for tmpf in $pslisting $tblisting $yplisting
do
   /bin/rm -f $tmpf
done
#
# Prepare for next time: the presence of $bootdone tells the next run
# that a full pass has completed.
#
touch $bootdone
#
exit 0
#
# End of Hourlystuff.
#
