#!/usr/bin/perl
# 
# Author: Christopher Browne
# Copyright 2004-2009 Afilias Canada

use Getopt::Long;

# Defaults.  These are deliberately left as package globals, as in the
# original script.  NOTE(review): the files loaded later via require may
# read them -- confirm before turning any of them into lexicals.
$CONFIG_FILE = '/etc/slony1/slon_tools.conf';
$SHOW_USAGE  = 0;
$WATCHDOG_VERBOSE = 1;

# Usage text.  Defined before the options are parsed so that it is available
# if option parsing itself fails.
my $USAGE =
"Usage: slon_watchdog2 [--config file] node# sleep_seconds

    --config file  Location of the slon_tools.conf file

    sleep_seconds  Number of seconds for the watchdog process to sleep
                   between checks

";

# Read command-line options.  GetOptions returns false when it meets an
# option it cannot parse; stop with the usage text instead of silently
# carrying on with the defaults.
GetOptions("config=s"  => \$CONFIG_FILE,
           "help"      => \$SHOW_USAGE)
  or die $USAGE;

# Exactly two positional arguments (node and sleep interval) must remain.
if ($SHOW_USAGE or scalar(@ARGV) != 2) {
  die $USAGE;
}

require '/usr/share/slony1/slon-tools.pm';
require $CONFIG_FILE;

# Positional arguments: the node (a bare number, or "node" followed by a
# number) and the base sleep interval in seconds.
$node  = $ARGV[0];
$sleep = $ARGV[1];

# Normalise the node argument: $nodenum gets the bare number and $node the
# "node<N>" form.  Anything else is a usage error.
if ($node =~ /^(?:node)?(\d+)$/) {
  $nodenum = $1;
  $node    = "node$nodenum";
} else {
  die $USAGE;
}

# The interval must be a positive whole number.  A non-numeric or zero value
# would make every sleep() in the main loop return immediately, turning the
# watchdog into a busy loop.
unless (defined $sleep and $sleep =~ /^\d+$/ and $sleep > 0) {
  die $USAGE;
}

# Every watchdog message is appended to this one file under $LOGDIR.
# NOTE(review): $LOGDIR and $CLUSTER_NAME are not set in this script;
# presumably they come from the required configuration -- confirm.
my $logfile = "$LOGDIR/slon-watchdog.log";

# Record the start-up, including the sleep interval and the spread of the
# random variation applied to it on each pass of the main loop.
my $startup_msg = "Invoking watchdog for $CLUSTER_NAME node $nodenum, sleep time = $sleep +/- " . int($sleep/2) . " seconds";
log_to_file($logfile, $startup_msg);

# A slon daemon that has only just been started may not have had time to
# begin synchronizing yet, and the watchdog would kill it without mercy.
# Wait one full interval before the first check to give it a chance.
sleep($sleep);

# Main watchdog loop.  It never exits on its own: each pass checks the node's
# event status and the slon process, kicks/restarts as needed, then sleeps.
while (1) {
  # See where the node stands.  A reply beginning with "t |" (optional
  # whitespace around the "t") counts as healthy.
  my $status   = query_slony_status($nodenum);
  my $eventsOK = "NO";
  if ($status =~ /^\s*t\s*\|/) {
    $eventsOK = "YES";
    log_to_file( $logfile , "Query_slony_status returns true for $CLUSTER_NAME node $nodenum" )
      if $WATCHDOG_VERBOSE;
  } else {
    # Not healthy by the first test; a node whose subscription status
    # mentions SUBSCRIBE_SET is still counted as OK.
    $status   = node_is_subscribing($nodenum);
    $eventsOK = "YES" if $status =~ /SUBSCRIBE_SET/;
    log_to_file( $logfile , "Query_slony_status returns false for $CLUSTER_NAME node $nodenum, node_is_subscribing : $eventsOK" )
      if $WATCHDOG_VERBOSE;
  }

  # See if the slon process is alive.
  my $slon_pid = get_pid($node);

  # A "restart node" command is submitted to slonik only when no slon
  # process was found; this always-kick behaviour follows slon_watchdog.pl.
  #
  # TODO: the earlier, more selective check below no longer works because
  # the message it searched for is not generated in this form any more.
  # There may be a better way to tell whether the node needs restarting.
  #
  #    # See if the slon log ends with "FATAL  localListenThread: Another slon daemon is serving this node already"
  #    my $lastlog=`/bin/ls -t $LOGDIR/node$nodenum/$dbname*log | head -1`;
  #    my $lastline=`tail -1 $lastlog`;
  #    if ($lastline =~ /Another slon daemon is serving this node already/) {
  #      $kick = "YES";   # Yup, need to tell slonik to reset this node
  #    }
  my $kick = $slon_pid ? "NO" : "YES";

  # slon is left alone only when it is running AND events look OK; in every
  # other case it gets (re)started.
  my $restart = ($slon_pid && $eventsOK eq "YES") ? "NO" : "YES";

  # If the node needs a swift kick in the "RESTART", submit that to slonik.
  if ($kick eq "YES") {
    log_to_file($logfile,"submit slonik to restart $CLUSTER_NAME node $nodenum");
    my $kick_cmd = ($CONFIG_FILE ne "")
      ? "(/usr/bin/slonik_restart_node --config=${CONFIG_FILE} $node | /usr/bin/slonik) >> $logfile 2>> $logfile"
      : "(/usr/bin/slonik_restart_node $node | /usr/bin/slonik) >> $logfile 2>> $logfile";
    system $kick_cmd;
  }

  if ($restart eq "YES") {
    if ($slon_pid) {
      log_to_file($logfile,"terminate slon daemon for $CLUSTER_NAME node $nodenum, PID $slon_pid");
      # Ask slon to stop: signal 2 first, then signal 15, with a three
      # second pause after each.
      kill 2, $slon_pid;
      sleep 3;
      kill 15, $slon_pid;
      sleep 3;
      # Signal 9 is deliberately not used: when slon is killed that way the
      # pid file isn't deleted and the service doesn't restart.
      # kill 9, $slon_pid;
    }
    log_to_file($logfile,"restart slon for $CLUSTER_NAME node $nodenum");
    start_slon($nodenum);
  }

  # Sleep for the base interval plus a random offset in [-$sleep/2, +$sleep/2).
  sleep($sleep + (rand($sleep) - $sleep/2));
}

# Append one timestamped line to a log file.
#
# Parameters:
#   $logfile - path of the file to append to (created if it does not exist)
#   $message - text to record; one trailing newline, if present, is removed
#
# The entry is written as "<output of the date command>|<message>\n".
#
# Returns 1 when the entry was written and the file closed cleanly, and
# nothing otherwise.  Failures are reported with warn() rather than die():
# a logging problem should not stop the watchdog itself.
sub log_to_file {
  my ($logfile, $message) = @_;
  chomp $message;

  # Timestamp comes from the external date command, as before, so the format
  # of existing log files is unchanged.
  my $date = `date`;
  chomp $date;

  # Three-argument open with a lexical handle: the path is never parsed for
  # mode characters, and the handle cannot clash with a global bareword.
  my $fh;
  unless (open($fh, '>>', $logfile)) {
    warn "log_to_file: cannot append to $logfile: $!\n";
    return;
  }

  print {$fh} $date, "|", $message, "\n";

  # Buffered write errors only surface when the handle is closed.
  unless (close($fh)) {
    warn "log_to_file: error closing $logfile: $!\n";
    return;
  }
  return 1;
}
