#!/usr/bin/perl

# mysql_watchdog.pl     Monitor Mysql Daemon for possible lockups.
# ---------------------------------------------------------------------------
# Author: Yermo Lamers yml@yml.com
#
# Copyright (c) 1997 Yermo Lamers. All rights reserved.
#
# Redistribution and use, with or without modification, are permitted 
# provided that redistributions must retain the above copyright notice
# and the following disclaimer:
#
# This  software  is provided by Yermo Lamers ``as is'' and any express
# or  implied  warranties,  including,  but  not  limited  to, the implied
# warranties  of  merchantability and fitness for a particular purpose are
# disclaimed.  In  no  event  shall  Yermo Lamers  be liable
# for   any   direct,   indirect,   incidental,   special,  exemplary,  or
# consequential  damages  (including,  but  not limited to, procurement of
# substitute  goods  or  services;  loss  of  use,  data,  or  profits; or
# business  interruption)  however  caused and on any theory of liability,
# whether  in  contract,  strict  liability, or tort (including negligence
# or  otherwise)  arising in any way out of the use of this software, even
# if advised of the possibility of such damage.
#
# ---------------------------------------------------------------------------
# PACKAGE:       Mysqld Watchdog Script
# ---------------------------------------------------------------------------
# DESCRIPTION:			      
# 
#   The mysqld daemon is prone to lockups causing queries to hang
#   indefinitely.
#
#   This script polls mysqld at regular intervals. If the server
#   does not respond promptly, mysql_watchdog will kill mysqld.
#   safe_mysqld should then notice this event and restart it.
#
# USAGE:
#
#   ./mysql_watchdog.pl | tee --append /tmp/watchdog.log
#
#   This will allow you to see the output in a window and keep it logged
#   to a file.
#
# ASSUMPTIONS:
# COMMENTS:
#
#   1. The script itself writes only to STDOUT. A log of events is kept
#      only if the output is redirected, e.g. through tee as shown
#      under USAGE above.
#
#   2. If you are having a lot of problems with mysqld I find it useful
#      to leave this script running in a window.
#
#   3. This script generates A LOT of output over time.
#
# SEE ALSO:
# REVISION: 0.01.2dev
#
# REVISION HISTORY/COMMENTS:
# 
# 7 Oct 1997 YmL : 
#    . initial revision.
#
# 10 Oct 1997 YmL:
#    . now works correctly on systems that implement POSIX restartable
#      system calls.
#    . reduced restart wait down to 15 seconds.
#
# 20 Oct 1997 YmL:
#    . removed sleep from signal handler, now interrupts system call
#      correctly.
#
# --------------------------------------------------------------------------
#

# SETTINGS:
#
# Set this to reflect your site. YOU MUST EDIT THIS LINE FOR THIS TO WORK!

# Path to the mysqld pid file. kill_mysqld() reads the pid of the server
# process from the first line of this file.

$mysqld_pidfilename = "/usr/local/var/YOURPIDFILEHERE";

# Some globals - don't edit.
#
# $reconnect is "yes" when the connection to the server has to be
# (re)established before the next poll, and "no" otherwise.

$reconnect = "no";

# -------------------------------------------------------------------------

use Mysql;

# kill_mysqld()
# -----------------------------------------------------------------------
#
# DESCRIPTION:
#
#   Reads the pid of mysqld from the file named by $mysqld_pidfilename
#   and sends that process a SIGKILL (signal 9).
#
#   This subroutine is installed as the SIGALRM handler around every
#   call into the server, and is also called directly when the server
#   reports an error.
#
# PARAMETERS:
#
#   $signame - name of the signal that triggered the call (not used).
#
# RETURNS:
#
#   Never returns normally. It always die()s with "Restart\n" so that
#   the eval() wrapped around the hung call is left and the caller can
#   test $@ against /Restart/.
#
# COMMENTS:
#
#   The pid read from the file is validated before it is used. An empty
#   or malformed value would numify to 0, and kill( 9, 0 ) signals every
#   process in the watchdog's own process group.

sub kill_mysqld
   {

   my( $signame ) = shift;

   my( $mysqld_pid, $count, $pidfile );

   # log file event

   print "handler : Mysqld stopped responding at " . localtime() . "\n";

   # get the pid of mysqld

   unless ( open( $pidfile, '<', $mysqld_pidfilename ) )
      {

      print  "Unable to open pid file '$mysqld_pidfilename' - $!\n";

      die "Restart\n";
      }

   print "handler : Reading PID file\n";

   $mysqld_pid = <$pidfile>;

   close( $pidfile );

   # strip the trailing newline (and any stray whitespace) so the pid
   # logs cleanly and can be validated.

   $mysqld_pid = "" unless defined( $mysqld_pid );
   $mysqld_pid =~ s/^\s+|\s+$//g;

   # refuse to signal anything but a plain positive pid - see COMMENTS.

   unless ( $mysqld_pid =~ /^[1-9][0-9]*$/ )
      {

      print "handler : pid file '$mysqld_pidfilename' does not contain a valid pid ('$mysqld_pid') - nothing killed\n";

      die "Restart\n";
      }

   print  "handler : Killing mysqld process '$mysqld_pid'\n";

   # site specific, try killing httpd also. - you may want to comment
   # this out.

   # system( "killall httpd" );

   # kill off the server - the safe_mysqld script should restart the
   # server once it's terminated.

   $count = kill( 9, $mysqld_pid );

   print "handler : kill returned '$count'\n";

   # kill() returns the number of processes signalled; 0 means the
   # signal could not be delivered (stale pid, no permission, ...).

   print "handler : kill failed - $!\n" if ( $count == 0 );

   # This subroutine gets called inside an eval() so that it
   # can interrupt the hung system call. Note that the value here
   # will be returned to the caller in $@.

   print "handler : calling die\n";

   die "Restart\n";

   }  # end of kill_mysqld()

# ------------------------------------------------------------------------
# helpers for main

# connect_error()
# -----------------------------------------------------------------------
#
# DESCRIPTION:
#
#   Returns the text of the last connection error. When connect() fails
#   there is no database handle to call errmsg() on, so the message is
#   taken from the module-level $Mysql::db_errstr instead.
#
# RETURNS:
#
#   The error text, or "unknown error" if the module recorded none.

sub connect_error
   {

   return ( defined( $Mysql::db_errstr ) && $Mysql::db_errstr ne "" )
      ? $Mysql::db_errstr
      : "unknown error";

   }  # end of connect_error()

# guarded_call()
# -----------------------------------------------------------------------
#
# DESCRIPTION:
#
#   Runs $code with a 15 second alarm whose handler is kill_mysqld().
#   From the Perl 5 book, the call is wrapped inside an eval so that a
#   hung (restartable) system call is left when the signal handler
#   calls die().
#
# PARAMETERS:
#
#   $code - code reference to run under the alarm.
#
# RETURNS:
#
#   1 if $code ran to completion, 0 if it died for any reason (the
#   alarm fired, kill_mysqld() was called directly, or an unexpected
#   error occurred).
#
# COMMENTS:
#
#   The alarm is always cancelled while the local SIGALRM handler is
#   still installed. Leaving it pending after the eval would deliver
#   SIGALRM with the default action and terminate the watchdog.

sub guarded_call
   {

   my( $code ) = @_;

   my( $failure, $error );

   eval
      {

      local $SIG{ ALRM } = \&kill_mysqld;

      alarm( 15 );

      # inner eval: whatever happens in $code, we get the chance to
      # cancel the alarm before the handler is uninstalled.

      eval { $code->(); };

      $failure = $@;

      alarm( 0 );

      die $failure if ( $failure ne "" );

      };

   $error = $@;

   return 1 if ( $error eq "" );

   # anything that is not a "Restart" from kill_mysqld() is unexpected;
   # log it rather than dropping it silently.

   unless ( $error =~ /Restart/ )
      {

      $error .= "\n" unless ( $error =~ /\n$/ );

      print "Unexpected error while talking to mysqld - $error";

      }

   return 0;

   }  # end of guarded_call()

# connect_to_mysqld()
# -----------------------------------------------------------------------
#
# DESCRIPTION:
#
#   Connects to the server on localhost and selects the "mysql"
#   database, storing the handle in the global $dbh. Both steps run
#   under the alarm since the server may already be hanging. If the
#   connection is refused, mysqld is killed via kill_mysqld().
#
# PARAMETERS:
#
#   $initial - true for the very first connection, false for a
#              reconnect; only selects the wording of the log message.
#
# RETURNS:
#
#   1 if the connection was established, 0 otherwise.

sub connect_to_mysqld
   {

   my( $initial ) = @_;

   return guarded_call( sub
      {

      $dbh = Mysql->connect( "localhost" );

      unless ( $dbh )
         {

         # cancel the alarm so it can't fire in the middle of the kill.

         alarm( 0 );

         if ( $initial )
            {
            print "Unable to make initial connection to mysql server at " . localtime() . " due to error '" . connect_error() . "'\n";
            }
         else
            {
            print "Unable to reconnect to Mysql - '" . connect_error() . "'\n";
            }

         # if it's hung kill it off.

         kill_mysqld( "ALRM" );

         }

      $dbh->SelectDB( "mysql" );

      } );

   }  # end of connect_to_mysqld()

# ------------------------------------------------------------------------
# main

# make sure we can see what's going on by flushing STDOUT

$| = 1;

print  "\nmysql_watchdog.pl started on " . localtime() . "\n";

# wait 30 seconds to let the server come up (assuming we're being
# called from safe_mysqld )

sleep( 30 );

# establish a connection to the server - the server may already be
# hanging at this point, which is why connect_to_mysqld() runs the
# call under an alarm. This is one case where system V signal semantics
# are nicer IMHO.

print "Attempting initial connection at " . localtime() . "\n";

if ( connect_to_mysqld( 1 ) )
   {

   print "Successful initial connection established at " . localtime() . "\n";

   }
else
   {

   print "Need to attempt a reconnect\n";

   $reconnect = "yes";

   }

# loop polling the server every 30 seconds

while ( 1 )
   {

   if ( $reconnect eq "yes" )
      {

      # the signal handler has run, so we need to reconnect to the
      # server - I believe it may be able to hang here.

      print "Attempting to reconnect at " . localtime() . "\n";

      if ( connect_to_mysqld( 0 ) )
         {

         # the connection has been established.

         $reconnect = "no";

         print "Successful reconnection to mysqld at " . localtime() . "\n";

         # site specific - restart httpd - you may want to comment this
         # out

         # print "Restarting httpd\n";

         # system( "/usr/local/sbin/httpd" );

         }
      else
         {

         print "Need to reconnect\n";

         # there is no usable connection, so don't poll with a stale
         # handle (that would kill the restarting server again). Wait
         # and retry the connection instead.

         sleep( 30 );

         next;

         }

      }  # end of if we had to reconnect to the server

   # execute a getserverstats() every 30 seconds - if
   # we don't get a response the server needs to be restarted.

   my $polled = guarded_call( sub
      {

      $status = $dbh->getserverstats();

      if ( $dbh->errmsg ne "" )
         {

         alarm( 0 );

         print "getserverstats() produced error '" . $dbh->errmsg . "'\n";

         # the only errors here would require a restart (I believe)

         kill_mysqld( "ALRM" );

         }

      } );

   if ( $polled )
      {

      # only report a status that was actually fetched by this poll.

      print $status . "\n";

      }
   else
      {

      print "getservstats() was locked. Need to do a reconnect\n";

      $reconnect = "yes";

      }

   # now wait to run another query

   sleep( 30 );

   }
