#!/usr/bin/perl -w # See copyright, etc in below POD section. ###################################################################### require 5.005; use IO::File; use Pod::Usage; use Getopt::Long; use Schedule::Load::Hosts; use Time::localtime; use strict; ###################################################################### use vars qw($Debug $Debug_User $Nice19 $Sendmail $Opt_Renice_Min $Opt_Cpu_Min $Opt_Reserved_Min %Complaints @Opt_Cmds ); $Nice19 = "renice19 -only_if_at"; $Sendmail = "/usr/lib/sendmail"; ###################################################################### my %opt_server_params = (); $Debug_User = $ENV{USER}; # Who to send --debug mail to Getopt::Long::config ("no_auto_abbrev"); if (!GetOptions ( "help" => \&usage, "debug!" => \$Debug, "debug-user=s" => \$Debug_User, "cpu_min=i" => \$Opt_Cpu_Min, # Old _'s "cpu-min=i" => \$Opt_Cpu_Min, "renice_min=i" => \$Opt_Renice_Min, # Old _'s "renice-min=i" => \$Opt_Renice_Min, "reserved_min=i"=> \$Opt_Reserved_Min, # Old _'s "reserved-min=i"=> \$Opt_Reserved_Min, "port=i" => sub {shift; $opt_server_params{port} = shift;}, "dhost=s" => sub {shift; push @{$opt_server_params{dhost}}, split(':',shift);}, "<>" => \¶meter, )) { die "%Error: Bad usage, try 'slpolice --help'\n"; } my $scheduler = Schedule::Load::Hosts->fetch(%opt_server_params); gather_loads($scheduler); gather_reserveds($scheduler); complain(); #---------------------------------------------------------------------- sub usage { print "Version: $Schedule::Load::VERSION\n"; pod2usage(-verbose=>2, -exitval => 2); exit(1); } sub parameter { my $param = shift; if ($param =~ /^(.*)=([0-9]+)$/) { my $re = $1; my $minutes = $2; if ($re !~ /[.?*^\$]/) { $re = '^'.$re.'$'; } print "Regexp for cmd: $re $minutes\n" if $Debug; push @Opt_Cmds, [qr/$re/, $minutes]; } else { die "%Error: Unknown argument (missing=): $param\n"; } } ###################################################################### sub gather_loads { my $scheduler = shift; (my $FORMAT = "%-12s %6s %-8s %4s %6s %-5s %6s %5s%% %s\n") =~ s/\s\s+/ /g; foreach my $host ($scheduler->hosts_sorted) { foreach my $p ( sort {$b->pctcpu <=> $a->pctcpu} @{$host->top_processes} ) { my $mach = $host->hostname; my $name = $p->uname; #print "ck $line\n"; next if !$p->time; next if $name eq "root"; my $pid = $p->pid; my $cmd = $p->fname||""; my $min = $p->time/60.0; $min = 0 if ($Debug); $min = 9999 if ($Debug && $p->uid eq $<); # Test... Every one of runner's violates my $line = sprintf ($FORMAT, $host->hostname, $p->pid, $p->uname, $p->nice0, int(($p->size||0)/1024/1024)."M", $p->state, $p->time_hhmm, sprintf("%3.1f", $p->pctcpu), $p->fname); print "Min $min Name $name Cmd $cmd Pid $pid\n" if $Debug; my $renice_limit = $Opt_Renice_Min; my $cpu_limit = $Opt_Cpu_Min; foreach my $cmdminref (@Opt_Cmds) { my $re = $cmdminref->[0]; my $minutes = $cmdminref->[1]; if ($cmd =~ /$re/ || $p->uname =~ /$re/) { print " Command_line_regexp match\n" if $Debug; if ($Opt_Renice_Min) { $renice_limit = $minutes; } if ($Opt_Cpu_Min) { $cpu_limit = $minutes; } } } if ($renice_limit && $min >= $renice_limit) { my $lowered = 0; my $succ = `ssh $mach $Nice19 $pid 2>&1`; print "Lowering $mach $pid $succ\n" if $Debug; if ($succ !~ /%/) { $Complaints{$name}{niced}{$pid} = {one_subject => sprintf("Reniced Process %5d on $mach", $pid), many_subject => sprintf("Reniced Processes"), body_header => ("The following processes were reniced to 19\n" ."Use `renice10 ` to prevent this.\n"), body_line => " ".$line, }; } } if ($cpu_limit && $min >= $cpu_limit) { $Complaints{$name}{cpu}{$pid} = {one_subject => sprintf("High CPU Time Process %5d on $mach", $pid), many_subject => sprintf("CPU Consuming Processes"), body_header => ("The following processes have large CPU times\n" ."Please consider killing them.\n"), body_line => " ".$line, }; } } } } sub gather_reserveds { my $scheduler = shift; (my $FORMAT = "%-12s %-25s %s\n") =~ s/\s\s+/ /g; foreach my $host ($scheduler->hosts_sorted) { if ($host->reserved) { my $ostype = $host->archname ." ". $host->osvers; $ostype =~ s/enterprise//; $ostype .= " (on ".$host->slreportd_hostname.")" if $host->slreportd_hostname ne $host->hostname; # Really the scheduler should provide preparsed information.... if ($host->reserved !~ /(\S+) at (\d\d)-(\S+) (\d\d):(\d\d)(;?.*)$/) { print "Res Parse Failed: ".$host->reserved."\n" if $Debug; next; } my ($name,$mday,$mon, $hr,$dmin,$cmt) = ($1, $2,$3, $4,$5, $6); print "Compare then $mday $hr:$dmin now ",localtime->mday," ",localtime->hour,":",localtime->min,"\n" if $Debug; my $min = localtime->min - $dmin; $min += 60 * (localtime->hour - $hr); $min += 24 * 60 * (localtime->mday - $mday); $min = 0 if $min < 0; # Too lazy to check months. Let user get away with it. #$min = 9999 if ($Debug); # Test... Every one of runner's violates my $res = "Reserved: ".$host->reserved; my $line = sprintf ($FORMAT, $host->hostname, $ostype, $res); print "Res: $min: $line\n" if $Debug; if ($name ne "root" && $Opt_Reserved_Min && !$cmt && $min >= $Opt_Reserved_Min) { $Complaints{$name}{reserved}{$host->hostname} = {one_subject => sprintf("Long Reservation of %s", $host->hostname), many_subject => sprintf("Long Reservations"), body_header => ("The following reservations have been around for a long time\n" ."Please consider releasing them. Or, use a --comment with\n" ."the reservation explaining your reservation reason.\n"), body_line => " ".$line, }; } } } } sub complain { for my $to (sort (keys %Complaints)) { my $body = ""; $body .= "DEBUGGING. Really-To: $to\n\n" if $Debug; my $mailto = $to; $mailto = $Debug_User if $Debug; my $subj = undef; for my $topic (sort (keys %{$Complaints{$to}})) { my $newtopic = 1; for my $proc (sort (keys %{$Complaints{$to}{$topic}})) { my $procref = $Complaints{$to}{$topic}{$proc}; if (!defined $subj) { $subj = $procref->{one_subject}; } else { $subj = $procref->{many_subject}; } if ($newtopic) { $newtopic = 0; $body .= "\n"; $body .= $procref->{body_header}; $body .= "\n"; } $body .= $procref->{body_line}; } } # Some cleanups $body =~ s/\n\n+/\n\n/mg; print "To: $to\nSubject: $subj\n$body\n" if $Debug; # Send the mail my $cmd = "$Sendmail -F 'Rschedule Police' -f root $mailto"; my $fh = IO::File->new("|$cmd") or die "%Error: $! $cmd"; print $fh "To: $mailto\n"; print $fh "From: Rschedule Police \n"; print $fh "Subject: Rschedule Police: $subj\n"; print $fh "\n"; print $fh "To see latest status, use: rloads or rhosts\n"; print $fh "\n"; print $fh "$body\n"; $fh->close; } } ###################################################################### ###################################################################### __END__ =pod =head1 NAME slpolice - Warn and renice top CPU hogs =head1 SYNOPSIS B [ B<--help> ] [ B<--port=>I ] [ B<--dhost=>I ] [ B<--cpu-hours> ] [ B<--version> ] [ B<--version> ] =head1 DESCRIPTION slpolice will determine the top cpu users across a cluster of hosts. It will send mail if a process has over a specified amount of cpu time. It will also mail if a user has a reservation for a long period of time. Usually slpolice is run with a crontab entry similar to: 5 8-21 * * * /usr/local/bin/slpolice --cpu_min 120 --reserved_min 120 long=999 >/dev/null 2>&1 This sends warnings each hour after 2 hours of CPU time. It does not check at night so that long overnight jobs will not receive warnings. Additional non-parameter arguments specify specific command regular expressions. When a process' command matches that regexp, the specified number of minutes will be used to determine when to send mail instead of the default. This program is most valuable when used with the L program, or a operating system where nice 19 processes get only leftover cpu resources. It requires a program called L which is a version of nice that is setgid root and renices a job to 19. This comes with L. =head1 ARGUMENTS =over 4 =item --help Displays this message and program version and exits. =item --debug-user With --debug, who to send the mail to instead of the process owner. =item --port Specifies the port number that slchoosed uses. =item --dhost Specifies the host name that slchoosed uses. May be specified multiple times to specify backup hosts. Defaults to SLCHOOSED_HOST environment variable, which contains colon separated host names. =item --cpu-min Number of cpu minutes the job should have before being reported to the user. Defaults to 0, which is off. =item --renice-min Number of minutes after which the nice value of a high cpu using process that is not at 1 or 10 is reniced to 19. Defaults to 0, which is off. =item --reserved-min Number of minutes a host may be reserved before reporting it to the user. Defaults to 0, which is off. =item --version Displays program version and exits. =back =head1 DISTRIBUTION The latest version is available from CPAN and from L. L is available from L. Copyright 1998-2009 by Wilson Snyder. This package is free software; you can redistribute it and/or modify it under the terms of either the GNU Lesser General Public License Version 3 or the Perl Artistic License Version 2.0. =head1 AUTHORS Wilson Snyder =head1 SEE ALSO L, L, L, =cut ###################################################################### ### Local Variables: ### compile-command: "./slpolice --debug --cpu_min 120 --reserved_min 120 " ### End: