Monday, December 17, 2007

Simple Sun Cluster Monitoring Script

Hey There,

Today, I've included a script I wrote to monitor a small SunCluster 3.1 environment (running Oracle Parallel Server - OPS) that we have set up at our shop. It basically runs through the output of several "scstat" command variations and reports on every combination of errors it encounters. I built some functionality into it to be as specific as possible about the error states, and to make sure that it sends an "All Clear" once the error condition no longer exists.

I also wrote this to run from cron; I run it every 5 minutes. Any interval is acceptable, depending upon what you need. Of course, you could always put this script in a wrapper so that it runs constantly, although that generally necessitates writing another script to make sure that this script is running (restarting it as necessary), and vice versa. For Veritas Cluster Server fans out there, this is a cheap and quick way to imitate the relationship between had and hashadow.

This was also written for a small environment (2 machines with 2 "live" network connections each). I didn't include monitoring of the heartbeats, since the script is meant to be run locally (with "exactly" the same values in the customizable section) on all nodes in a cluster, and scstat's indication of failure on any of these tests is, in and of itself, a guarantee that loss of a heartbeat connection is the very least of your problems ;)

Enjoy!


Creative Commons License


This work is licensed under a
Creative Commons Attribution-Noncommercial-Share Alike 3.0 United States License

#!/usr/bin/perl

###################################
# suncluster_mon - check cluster health
# 2007 - Mike Golvach - eggi@comcast.net
#
# Creative Commons Attribution-Noncommercial-Share Alike 3.0 United States License
#
# simple parser for:
# 1. scstat -n
# 2. scstat -i
# 3. scstat -g
# 4. scstat -D
#
###################################

###################################
# CUSTOMIZE SETTINGS HERE
# $debug can be set with the -d
# switch when invoking the script
###################################
$primary_node="";
$secondary_node="";
$primary_eth="";
$secondary_eth="";
$date=`/usr/bin/date "+%A, %B %d - %H:%M:%S"`;
$hostname=`/usr/bin/hostname`;
$debug=0;
###################################
# PLEASE DO NOT EDIT BELOW
# EXCEPT MAIL SETTINGS AT END
###################################

# $date and $hostname are interpolated into the mail Subject: line, so
# strip the trailing newline that backticks leave on each of them.
chomp($date);
chomp($hostname);

# Option handling: "-d" (or "--debug") switches debug output on. Anything
# else is collected in @nogo, reported once, and otherwise ignored. The
# pattern is anchored so that an argument which merely contains "-d"
# somewhere inside it is not mistaken for the debug switch.
foreach $arg (@ARGV) {
    if ( $arg =~ /^(?:-d|--debug)$/ ) {
        $debug=1;
        print "Debug Output Set\n";
    } else {
        push(@nogo, $arg);
    }
}
if ( @nogo > 0 ) {
    print "Unrecognized options: @nogo\n";
    print "Ignoring and continuing\n";
}

# Node check ("scstat -n").
#
# check_cluster_nodes(@lines)
#   @lines  - output lines of "scstat -n" that mention "node:". Each useful
#             line has the form "Cluster node: <node> <state>".
#   Reads   - $primary_node, $secondary_node, $debug
#   Writes  - @alert   one entry per configured node found Offline, plus a
#                      "Cluster Failed" entry once both are Offline
#             @debug   informational lines (only when $debug is set)
#             $nodefailure   set to 1 once any configured node is Offline
#   Returns - the number of lines that were considered healthy.
sub check_cluster_nodes {
    my @lines = @_;
    my %offline;        # configured nodes seen Offline during this pass
    my $healthy = 0;    # lines that did not raise an alert

    for my $line (@lines) {
        if ( $line !~ /Cluster node:/ ) {
            # egrep let through something that is not a node status line.
            print "WTF ---- SCSTAT -N\n";
            next;
        }

        (my $fields = $line) =~ s/Cluster node://;
        my ($node, $state) = split(" ", $fields);

        if ( $state eq "Offline" && ( $node eq $primary_node || $node eq $secondary_node ) ) {
            push(@alert, "\nCluster Alert:\n$node has switched to $state Status - Cluster Crippled - Checking For Dual Failure\n");
            $offline{$node} = 1;
            $nodefailure = 1;

            # Counting distinct node names (rather than testing a flag)
            # keeps a repeated line for the same node from being reported
            # as a failure of the whole cluster.
            if ( keys(%offline) == 2 ) {
                push(@alert, "\nCluster Alert:\n$primary_node and $secondary_node are both in Offline state. Cluster Failed\n");
            }
        } else {
            push(@debug, "Cluster Info: $node and $state OK\n") if $debug;
            $healthy++;
        }
    }

    if ( $healthy == 2 && $debug ) {
        push(@debug, "Cluster Info: Both nodes OK\n");
    }
    return $healthy;
}

@scstatn=`/usr/cluster/bin/scstat -n|/usr/bin/egrep 'node:'`;
check_cluster_nodes(@scstatn);

# IPMP check ("scstat -i").
#
# check_ipmp_groups(@lines)
#   @lines  - output lines of "scstat -i" that mention "Group:". After the
#             "IPMP Group:" label is removed, field 0 is used as the node
#             name, field 3 as the adapter and field 4 as the adapter state.
#   Reads   - $primary_node, $secondary_node, $primary_eth, $secondary_eth,
#             $debug
#   Writes  - @alert, @debug, $ethprimaryfailure (set to 1 once the primary
#             adapter of any configured node is not Online)
#
# Lines for nodes other than the two configured ones are ignored. The
# primary adapter's line must precede the secondary adapter's line for the
# same node, because the secondary's verdict depends on what was seen for
# the primary.
sub check_ipmp_groups {
    my @lines = @_;
    my %primary_eth_down;   # node name => 1 once its primary adapter failed

    for my $line (@lines) {
        if ( $line !~ /IPMP Group:/ ) {
            print "WTF ---- SCSTAT -I\n";
            next;
        }

        (my $fields = $line) =~ s/IPMP Group://;
        my ($node, $adapter, $state) = (split(" ", $fields))[0, 3, 4];

        # Node names are strings, so they must be compared with "eq";
        # a numeric "==" would treat every pair of hostnames as equal.
        next unless $node eq $primary_node || $node eq $secondary_node;

        if ( $adapter eq $primary_eth && $state ne "Online" ) {
            push(@alert, "\nIPMP Alert:\n$adapter is in $state mode on $node - Checking for Online of $node $secondary_eth\n");
            # Remember the failure per node so that a healthy Standby
            # adapter on the other node is not blamed for it.
            $primary_eth_down{$node} = 1;
            $ethprimaryfailure = 1;
        } elsif ( $adapter eq $secondary_eth && $state ne "Online" && $primary_eth_down{$node} ) {
            push(@alert, "\nIPMP Alert:\n$node $secondary_eth is not in an Online state on $node - IPMP Group Down on both Interfaces!\n");
        } elsif ( $adapter eq $secondary_eth && $state ne "Standby" && ! $primary_eth_down{$node} ) {
            push(@alert, "\nIPMP Alert:\n$adapter is in $state mode on $node - Failover interface is not UP - IPMP Group Crippled on $node!\n");
        } elsif ( $debug ) {
            push(@debug, "IPMP Info: IPMP Group $node $adapter OK\n");
        }
    }
    return;
}

@scstati=`/usr/cluster/bin/scstat -i|/usr/bin/egrep 'Group:'`;
check_ipmp_groups(@scstati);

# Resource group check ("scstat -g").
#
# check_resource_groups(@lines)
#   @lines  - output lines of "scstat -g" that mention "Resources:",
#             "Group:" or "Resource:". After the label is removed:
#               Resources:  <group> <resource> <resource> ...
#               Group:      <group>    <node> <state>
#               Resource:   <resource> <node> <state> ...
#   Reads   - $primary_node, $secondary_node, $debug and, optionally,
#             $expected_resources (number of resources every group should
#             list; 4 when it has not been set)
#   Writes  - @alert, @debug, $rgprimaryfailure (set to 1 once any group or
#             resource is not Online on the primary node)
#
# A group or resource is only reported as "down on both nodes" when that
# same group or resource was already seen not Online on the primary node,
# so its primary-node line has to come first.
sub check_resource_groups {
    my @lines = @_;
    my $wanted = defined $expected_resources ? $expected_resources : 4;
    my %primary_down;   # "<label>/<name>" => 1 once down on the primary node

    for my $line (@lines) {
        # "Resources:" has to be tested before "Resource:" and "Group:".
        if ( $line =~ /Resources:/ ) {
            (my $fields = $line) =~ s/Resources://;
            my ($group, @resources) = split(" ", $fields);
            if ( @resources != $wanted ) {
                push(@alert, "\nOracle Resource Alert:\nOracle resource group $group is not running all resources on the cluster\n--Only running -- @resources\n");
            } elsif ( $debug ) {
                push(@debug, "Oracle Resource Info: Oracle resource group $group OK -- $group @resources OK\n");
            }
            next;
        }

        my $label;
        my $fields = $line;
        if ( $line =~ /Group:/ ) {
            $label = "Resource Group";
            $fields =~ s/Group://;
        } elsif ( $line =~ /Resource:/ ) {
            $label = "Resource";
            $fields =~ s/Resource://;
        } else {
            print "WTF ---- SCSTAT -G\n";
            next;
        }

        my ($name, $node, $state) = split(" ", $fields);
        my $key = "$label/$name";

        if ( $state ne "Online" && $node eq $primary_node ) {
            push(@alert, "\nOracle Resource Alert:\n$label $name is in $state state on primary node $node - Checking for failover\n");
            $primary_down{$key} = 1;
            $rgprimaryfailure = 1;
        } elsif ( $state ne "Online" && $node eq $secondary_node && $primary_down{$key} ) {
            push(@alert, "\nOracle Resource Alert:\n$label $name is in $state state on cluster - Resource Group Down on both Nodes!\n");
        } elsif ( $debug ) {
            push(@debug, "Oracle Resource Info: $node $label $name OK\n");
        }
    }
    return;
}

@scstatg=`/usr/cluster/bin/scstat -g|/usr/bin/egrep 'Resource:|Resources:|Group:'`;
check_resource_groups(@scstatg);

# Device group check ("scstat -D").
#
# check_device_groups(@lines)
#   @lines  - output lines of "scstat -D" that mention "servers" or
#             "status". After the label is removed:
#               Device group servers:  <group> <primary> <secondary>
#               Device group status:   <group> <state>
#   Reads   - $primary_node, $secondary_node, $debug
#   Writes  - @alert, @debug
sub check_device_groups {
    my @lines = @_;

    for my $line (@lines) {
        if ( $line =~ /servers/ ) {
            (my $fields = $line) =~ s/Device group servers://;
            my ($group, $primary, $secondary) = split(" ", $fields);

            # The "neither node" case is tested first; testing "not the
            # primary node" first would swallow it.
            if ( $primary ne $primary_node && $primary ne $secondary_node ) {
                push(@alert, "\nDisk Resource Alert:\n$group has failed on all nodes!\n");
            } elsif ( $primary ne $primary_node ) {
                push(@alert, "\nDisk Resource Alert:\n$group has switched primary node to $primary from $secondary\n");
            } elsif ( $debug ) {
                push(@debug, "Disk Resource Info: $primary primary - $group OK\n");
            }
        } elsif ( $line =~ /status/ ) {
            (my $fields = $line) =~ s/Device group status://;
            my ($group, $state) = split(" ", $fields);

            if ( $state ne "Online" ) {
                push(@alert, "\nDisk Resource Group Alert:\n$group has switched to $state state on the cluster\n");
            } elsif ( $debug ) {
                push(@debug, "Disk Resource Group Info: $group OK\n");
            }
        } else {
            print "WTF ---- SCSTAT -D\n";
        }
    }
    return;
}

@scstatD=`/usr/cluster/bin/scstat -D|/usr/bin/egrep 'servers|status'`;
check_device_groups(@scstatD);

###################################
# EDIT To: Reply-To: and From:
# if you want mail to go somewhere
# useful and-or helpful!
###################################

# Mail settings and the marker file that remembers an outstanding failure
# between runs.
$mail_from     = "you\@yourdomain.com";
$mail_reply_to = "you\@yourdomain.com";
$mail_to       = "recipients\@yourdomain.com";
$state_file    = "/tmp/cfail_ihot_stat";

# send_cluster_mail($subject, @body)
#   Pipes one message to "/usr/lib/sendmail -t", using the addresses set
#   above.
#   $subject - text for the Subject: header (without a trailing newline)
#   @body    - body text, printed as given
#   Returns 1 when sendmail was started and closed cleanly, 0 otherwise;
#   failures are reported on STDERR instead of being ignored.
sub send_cluster_mail {
    my ($subject, @body) = @_;

    # A sendmail that dies early must not kill the monitor with SIGPIPE.
    local $SIG{PIPE} = 'IGNORE';

    my $mail;
    unless ( open($mail, "|-", "/usr/lib/sendmail -t") ) {
        print STDERR "suncluster_mon: cannot start /usr/lib/sendmail: $!\n";
        return 0;
    }
    print {$mail} "Subject: $subject\n";
    print {$mail} "From: $mail_from\n";
    print {$mail} "Reply-To: $mail_reply_to\n";
    print {$mail} "To: $mail_to\n";
    print {$mail} "\n\n";
    print {$mail} @body;
    unless ( close($mail) ) {
        print STDERR "suncluster_mon: sendmail did not finish cleanly (status $?): $!\n";
        return 0;
    }
    return 1;
}

# A newline left on the host name would split the Subject: header.
my $mail_host = defined $hostname ? $hostname : "";
$mail_host =~ s/\s+$//;

if ( @alert > 0 || @debug > 0 ) {
    my @body = @alert;
    push(@body, @debug) if $debug > 0;
    send_cluster_mail("CLUSTER ALERT - $mail_host - $date", @body);

    # Only a real alert marks the cluster as failed. A run that merely
    # produced debug output must not cause a later "RESTORED" mail.
    if ( @alert > 0 ) {
        if ( open(my $marker, ">>", $state_file) ) {
            close($marker);
        } else {
            print STDERR "suncluster_mon: cannot create $state_file: $!\n";
        }
    }
} elsif ( -f $state_file ) {
    # No alerts now, but an earlier run recorded a failure: announce recovery.
    my $sent = send_cluster_mail(
        "CLUSTER RESTORED - $mail_host - $date",
        "All Cluster Services Back To Good State\n",
        "All Cluster Nodes: OK\n",
        "All IPMP Groups: OK\n",
        "All Oracle Resources: OK\n",
        "All Storage Groups: OK\n",
    );
    # Keep the marker when the mail could not be sent, so that the next
    # run tries again.
    unlink($state_file) if $sent;
}


, Mike