#!/usr/bin/perl -w ## FIXME : remove the -w above when you find the code is fixed ! # #This is spamstats.pl v0.5 # # #Changelog #0.5b 5 March 2004 # Fixes a typo which lead to confusion in the volume counting. # Thanks to Matthew McGehrin for the bugreport # #0.5a 25 February 2004 # Two patches from Radko Keves, with these : # * Support for BSDs' sendmail daemons (sm-mta) # * Added the -firstdate option, useful for multiple files parsing # #0.5 30 January 2004 # Many thanks to Cyril Chaboisseau (http://www.obs.coe.int) who did : # * -minmax option that display min and max values for each displayed value # * some cleanups I removed :-p (renaming "clean message" into "ham" would # break some existing configs :( ) # * a lot of cleanup in the displaying code, that was very awful # * tidied and reordered the code # Few other changes: # * Fixed the bug where the -number and the -html options were leading to a # poor output # #0.4b5 11 August 2003 #Fixed the bug when a month starting with a zero is entered as start/enddate. # #0.4b4 10 June 2003 # Fixed the infile == 0 bug, thanks to Yen-Ming Lee # Fixes sendmail parsing when email is delivered through procmail, raised by Dirk Kuypers # # #0.4b3 2 June 2003 #Applied patches from Bob Apthorpe for : # * more elegant fix of the two digits month intput problem # * better input handling, now files to process can be specified in @ARGV without the --file switch # * Added documentation and scripts to graph spamstats output with cricket. # #0.4b2 30 May 2003 #Regexp bugfix in exim mailer_in handling #Regexp bugfix in spamd ("processing message" seems to have changed to "checking message") on some setups. #Updated README into a more english (and less french) syntax # #0.4b1 19 May 2003 #This is a very tiny bugfix. #Fixes parsing mistakes on sendmail setups that relay emails as outputs. #Emails were undetected on those setups. # #0.4b 10 Mar 2003 #WARNING : this release changes the default behaviour of spamstats calculations !! 
#From this version on spamstats counts spams and non-spams per recipient, not per mailer ID. #(Until this version, a multirecipient message sent to both "foo@yourdomain.com" and "bar@yourdomain.com" #counted only as one spam. From now on it counts as two. #New option : -agglo-recipients uses spamstats "old" mode : one count per mailer ID, not per recipient. #WARNING : FOR NOW EXIM USERS PROBABLY WANT TO USE THIS OPTION, ON SOME EXIM CONFIGS # THERE ARE RISKS LOG ANALYSIS BE BROKEN IF NOT USED! #Applied patch from Jim Breton for a better display. # #0.4 25 Feb 2003 #[Probably very incomplete] sendmail support #Only sendmail regexp were added, no code modification ! #This is not a very important release in terms of work. Hopefully it is in terms #of capabilities :-) # #0.3b2 30 Jan 2003 #Fix a problem where script will issue warnings when parsed log file is empty or #contains no reference to used mailer (only contains spamd messages). # #0.3b 04 Jan 2003 #Added a (hopefully) useful time filter specification to be used : duration specification. # #0.3a 29 Dec 2002 #Date/Time filter now works. #Some tiny code cleanup. #HTML output support. # #0.3alpha 17 Dec 2002 #Exim support #Some work on date/time filtering support, far from complete. These options are useless for now. # #0.2f 26 Nov 2002 #If one input file does not exist, mentions which! # #0.2e 26 Nov 2002 #Option "-noabsolute" makes spamstats not complain if argument log file names are not absolute. #Now reports total Volume of Spam and Volume of ham in general statistics. # #0.2d #Local recipients were not counted, only relayed ones. #Regexp was modified to just match both. 
#Thanks to Jon Gabrielson for bug report # #0.2c #No more lower/upper case distinction in top recipients classification #Thanks to Kenneth Nerhood for bug report # #0.2b #Fixes stupid bug from 0.2 where spamd process had to run as user "spamd" #Thanks to Kenneth Nerhood for bug report # # #Parses Postfix a spamd log file (or several) and extract top Spam receivers. #Also displays spam statistics. # #Author : Vincent Deffontaines #Script Basis, Postfix support Copyright : Vincent Deffontaines # KDX (www.kdx.fr) # Council of Europe (www.coe.int) # #Exim support Copyright : Vincent Deffontaines. #Sendmail support Copyright : Vincent Deffontaines. # #Please send me contributions/ modifications/ comments that could be useful to this script! #Others mailers than Postfix/Exim support shouldn't be hard to implement. #Author will help and include modifications to this script as long as mailers are free software. # #This program is free software; you can redistribute it and/or #modify it under the terms of the GNU General Public License #as published by the Free Software Foundation; version 2 #of the License. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program; if not, write to the Free Software #Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
#
#
#
use strict;
use warnings;
use Time::Local;
use Getopt::Long;

#Only useful for Debugging, useless if you don't hack through this code :-)
#use Data::Dumper;
use Compress::Zlib;

# Date::Manip is only needed by the -firstdate option. It is loaded on demand
# (see convert_d_t_e and main) so every other mode runs without it.

######################################################################
# Settings filled from the command line (see main)
######################################################################
my $number     = 0;         # -number : how many top spam receivers to list
my $help       = 0;         # -help
my $nogeneral  = 0;         # -nogeneral : skip the general statistics
my $debug      = 0;         # -debug : report suspicious log sequences
my $noabsolute = 0;         # -noabsolute : accept relative log file names
my $starttime  = "none";    # -starttime hh:mm:ss
my $endtime    = "none";    # -endtime hh:mm:ss
my $startdate  = "none";    # -startdate dd-mm
my $enddate    = "none";    # -enddate dd-mm
my $firstdate  = 0;         # -firstdate : report one global first/last date
my $html       = 0;         # -html
my $duration   = 0;         # -duration seconds
my $agglo_rcpt = 0;         # -agglo-recipients : one count per mail ID
my $minmax     = undef;     # -minmax : defined when min/max must be shown

######################################################################
# Run state
######################################################################
my $error         = 0;      # number of command line errors found
my $startdate_d   = 1;      # false when -startdate was not given
my $firstdate_d   = "Dec 31 23:59:59";    # earliest date seen (-firstdate)
my $lastdate_d    = "Jan 1 00:00:00";     # latest date seen (-firstdate)
my $skipstarttest = 0;      # true when there is no lower time bound
my $skipendtest   = 0;      # true when there is no upper time bound
my $epoch_start   = 0;      # pseudo epoch of the lower time bound
my $epoch_end     = 0;      # pseudo epoch of the upper time bound
my $mailerlogtype = undef;  # exim|postfix|sendmail|sm-mta, detected per file

# Pieces used to build markup. All empty in text mode, so that
# starttag . br prints nothing; in HTML mode it prints "<br>".
my %html_tags = (
    'br'       => '',
    'b'        => '',
    'i'        => '',
    'html'     => '',
    'body'     => '',
    'endtag'   => '',
    'starttag' => '',
    'vspace'   => ''
);

my %spam         = ();      # recipient (lower case) => number of spams
my %clean        = ();      # recipient (lower case) => number of clean mails
my %mailer_table = ();      # mailer queue ID => message-id
my %spamd_table  = ();      # message-id => "spam" or "clean"
my %spamd_pid    = ();      # spamd pid => message-id currently being scanned

# Totals over the spamd verdict lines (one per scanned message, whatever the
# number of recipients).
my %tally = (
    'spam'  => { 'count' => 0, 'score' => 0, 'time' => 0, 'volume' => 0 },
    'clean' => { 'count' => 0, 'score' => 0, 'time' => 0, 'volume' => 0 },
);

# Min/max trackers for -minmax. The initial min/max values are only what gets
# displayed when no sample was seen; the first sample replaces both.
my %range = (
    'clean_score'   => { 'min' => 99,    'max' => 0, 'seen' => 0 },
    'clean_time'    => { 'min' => 99,    'max' => 0, 'seen' => 0 },
    'clean_msgsize' => { 'min' => 99999, 'max' => 0, 'seen' => 0 },
    'spam_score'    => { 'min' => 99,    'max' => 0, 'seen' => 0 },
    'spam_time'     => { 'min' => 99,    'max' => 0, 'seen' => 0 },
    'spam_msgsize'  => { 'min' => 99999, 'max' => 0, 'seen' => 0 },
);

my $incorrect_lines = 0;    #Count unparsable lines
my $correct_lines   = 0;    #Count parsable lines

######################################################################
# Constants
######################################################################
my %month_number = (
    "Jan" => 1, "Feb" => 2,  "Mar" => 3,  "Apr" => 4,
    "May" => 5, "Jun" => 6,  "Jul" => 7,  "Aug" => 8,
    "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12
);

# Non leap year: log lines carry no year, so Feb 29 cannot be told apart.
my @DAYS_IN_MONTH = ( 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 );

# Placeholder stored for Exim messages logged without an "id=" field.
my $EXIM_NO_ID = "I_have_no_id_за:-(";    #Hope this will never be a real id ...

#Parse regexp definitions for each mailer and for spamd
# Every pattern starts with the same syslog header (date, time, host) and
# sendmail / sm-mta only differ by the daemon name, so the patterns are
# assembled from shared pieces and compiled once.
my $LOG_HEAD     = '^\s*[a-zA-Z]{3}\s+\d+\s+\d+:\d+:\d+\s+\S+\s+';
my $EXIM_STAMP   = '\s+\d{4}-\d{2}-\d{2}\s+\d+:\d+:\d+\s+';
my $SENDMAIL_IN  =
    '\[(\d+)\]:\s+(\S+):\s+from=<[^>]*>,\s+size=\d+,\s+class=\S+,'
  . '\s+nrcpts=\d+,\s+msgid=<([^>]+)>.*,\s+proto=\S+,\s+daemon=\S+,'
  . '\s+relay=.*$';
my $SENDMAIL_OUT =
    '\[\d+\]:\s+(\S+):\s+to=(?:\|.*ctladdr=<|<)([^>]+)>.*,\s+delay=\S+,'
  . '\s+xdelay=\S+,\s+mailer=\S+,\s+pri=\d+.*,\s+dsn=\S+,\s+stat=\S+';
my $SPAMD_CHECK  = 'spamd\[(\d+)\]:\s+(?:processing|checking)\s+message\s*';
my $SPAMD_RESULT =
    '\s*\(([^\/]+)\/[^\)]+\)\s+for\s+\S+\d+\s+in\s+(\S+)\s+seconds,'
  . '\s+(\d+)\s+bytes\.';

# Compiles one log pattern: shared syslog header followed by $tail.
sub _log_pattern
{
    my ($tail) = @_;
    my $pattern = $LOG_HEAD . $tail;
    return qr/$pattern/;
}

my %Defs = ();

# Message accepted by the mailer. Captures: pid, queue ID, message-id
# (for Exim the third capture is the rest of the line, which may hold "id=").
$Defs{'mailer_in'}{'postfix'} =
  _log_pattern('postfix\/cleanup\[(\d*)\]:\s+([^:]+):\s*message-id=(.*)$');
$Defs{'mailer_in'}{'exim'} =
  _log_pattern( 'exim\[(\d+)\]:' . $EXIM_STAMP
      . '(\S+)\s+<=\s+.*\@\S+\s+H=.*\s+P=\S+.*\s+S=\S+(.*)$' );
$Defs{'mailer_in'}{'sm-mta'}   = _log_pattern( 'sm-mta' . $SENDMAIL_IN );
$Defs{'mailer_in'}{'sendmail'} = _log_pattern( 'sendmail' . $SENDMAIL_IN );

# spamd starts scanning a message. Captures: spamd pid, message-id.
$Defs{'spamd_in'}{'postfix'}  = _log_pattern( $SPAMD_CHECK . '(.*)\s+for\s+\S+' );
$Defs{'spamd_in'}{'exim'}     = _log_pattern( $SPAMD_CHECK . '<(.*)>\s+for\s+\S+' );
$Defs{'spamd_in'}{'sm-mta'}   = _log_pattern( $SPAMD_CHECK . '<(.*)>\s+for\s+' );
$Defs{'spamd_in'}{'sendmail'} = _log_pattern( $SPAMD_CHECK . '<(.*)>\s+for\s+' );

# spamd verdicts. Captures: spamd pid, score, scan time (seconds), size (bytes).
$Defs{'spamd_clean'} =
  _log_pattern( 'spamd\[(\d+)\]:\s+clean\s+message' . $SPAMD_RESULT );
$Defs{'spamd_spam'} =
  _log_pattern( 'spamd\[(\d+)\]:\s+identified\s+spam' . $SPAMD_RESULT );

# Message delivered by the mailer. Captures: queue ID, recipient.
# Sample postfix line:
#'Mar 10 02:11:24 barrel postfix/smtp[20611]: 5A9BF22E04: to=<...>, relay=127.0.0.1[127.0.0.1], delay=2, status=sent (250 ok 1047280284 qp 20787)'
$Defs{'mailer_out'}{'postfix'} =
  _log_pattern('postfix\/(?:pipe|local)\[\d+\]:\s+([^:]+):\s+to=<([^>]+)>');
$Defs{'mailer_out'}{'exim'} =
  _log_pattern( 'exim\[\d+\]:' . $EXIM_STAMP . '(\S+)\s+=>\s+(.*\@\S+)\s+' );
$Defs{'mailer_out'}{'sm-mta'}   = _log_pattern( 'sm-mta' . $SENDMAIL_OUT );
$Defs{'mailer_out'}{'sendmail'} = _log_pattern( 'sendmail' . $SENDMAIL_OUT );

######################################################################
# Public helpers (names and calling conventions unchanged)
######################################################################

sub Print_Usage()
#Prints the command line help on standard output.
{
    print join( '',
        "{Exim/Postfix/Sendmail} & spamd logfile analyser. Extracts top N Spam receivers\n",
        "$0 [-help] [-debug][-file=/path/to/filename] [-file=...] [-number=...] [-nogeneral]\n",
        " [-startdate=dd-mm] [-starttime=hh:mm:ss] [-enddate=dd-mm] [-endtime=hh:mm:ss]\n",
        " [-duration=number of seconds] /path/to/file1 [/path/to/file2] [/path/to/file3.gz]\n",
        " [-firstdate]\n",
        "GENERAL OPTIONS\n",
        "\t-debug\t\t\t: Displays informations that _might_ indicate problems while parsing.\n",
        "\t-help\t\t\t: Displays this help and exits.\n",
        "\t-file /path/file\t: Analyses mail log file for spam results (as logged by spamd) :\n",
        "\t\t\t\t Several files can be asked for parsing at a time, including .gz files\n",
        "\t\t\t\t Default /var/log/mail.log\n",
        "\t\t\t\t This switch is DEPRECATED, simply specify filenames after all options,\n",
        "\t\t\t\t without any switch.\n",
        "\t-number number\t\t: specifies number of top spam receivers to display (default : 0).\n",
        "\t-nogeneral\t\t: do not display general stats.\n",
        "\t-noabsolute\t\t: lets non-absolute named logfiles be processed.\n",
        "\t-html\t\t\t: HTML output\n",
        "\t-minmax\t\t\t: Display minimum and maximum values\n",
        "TIME FILTER OPTIONS (no time filter used if no option specified)\n",
        "\t-startdate dd-mm\t: Process only data logged from that date\n",
        "\t\t\t\t Default : today if starttime specified, else unused\n",
        "\t-enddate dd-mm\t: Process only data logged until that date\n",
        "\t\t\t\t Default : today if endtime specified, else unused\n",
        "\t-starttime hh:mm:ss\t: Process only data logged from that time (default time : 0:00:00)\n",
        "\t-endtime hh:mm:ss\t: Process only data logged until that time (default time : current time)\n",
        "\t-firstdate\t\t: Displays only first and last date of log messages\n",
        "\t\t\t\t (useful for multiple files parsing).\n",
        "\t-duration seconds\t: Work only on specified duration.\n",
        "\t\t\t\t To be used with start XOR end{time/date}, obviously not with both.\n",
        "\t\t\t\t Default : unused\n",
        "\t\t\t\t Default if no other time switch : process n seconds until current time.\n",
        "\tWhy no year in dates input? Just because there is no year reported in postfix mail logs\n",
        "\tThis will obviously cause time filter problems around new year!\n",
        "\t-agglo-recipients\t: Old spamstats counting. One count by mail ID, not by actual recipient.\n",
        "\t\t\t\t EXIM users WANT to set this for now!\n" );
}

sub unify($$)
#Converts (value, unit) into a more human readable expression.
#Input  : a volume and its unit ("bytes", "kbytes", "Mbytes" or "Gbytes")
#Output : (volume, unit) where the volume was divided by 1024 as long as the
#         result stays above 5 and a bigger unit exists.
#An unknown unit is returned untouched.
{
    my ( $volume, $unit ) = @_;
    my @units = ( "bytes", "kbytes", "Mbytes", "Gbytes" );
    my ($rank) = grep { $units[$_] eq $unit } ( 0 .. $#units );
    defined $rank or return ( $volume, $unit );
    while ( ( $rank < $#units ) and ( $volume / 1024 > 5 ) )
    {
        $volume = $volume / 1024;
        $rank++;
    }
    return ( $volume, $units[$rank] );
}

sub c_d($)
#Tells whether a date/time option was given.
#Returns 0 when the argument is "none", 1 otherwise.
{
    my $value = shift @_;
    return ( $value eq "none" ) ? 0 : 1;
}

sub check_date($)
#Checks given date is correct (expected format string: "d[d]-m[m]" or "none").
#Returns 0 if correct, 1 if not.
{
    my $date = shift @_;
    $date eq 'none' and return 0;
    unless ( $date =~ /^(\d{1,2})-(\d{1,2})$/ )
    {
        return 1;
    }
    my $day   = int($1);
    my $month = int($2);
    ( ( $month >= 1 ) and ( $month <= 12 ) ) or return 1;

    #No year is known, so Feb 29 has to be accepted.
    my $last_day = $DAYS_IN_MONTH[ $month - 1 ] + ( $month == 2 ? 1 : 0 );
    ( ( $day >= 1 ) and ( $day <= $last_day ) ) or return 1;
    return 0;
}

sub check_time($)
#Checks given time is correct (expected format string: "h[h]:mm:ss" or "none").
#Returns 0 if correct, 1 if not.
{
    my $time = shift @_;
    $time eq 'none' and return 0;
    unless ( $time =~ /^(\d{1,2}):(\d{2}):(\d{2})$/ )
    {
        return 1;
    }
    my ( $hour, $minute, $second ) = ( $1, $2, $3 );
    unless ( ( $hour < 24 ) and ( $minute < 60 ) and ( $second < 60 ) )
    {
        return 1;
    }
    return 0;
}

sub convert_d_t_e($)
#Input  : a date string as found in a syslog line ("Mar 10 02:11:24"); a
#         trailing "(...)" group is removed first.
#Output : whatever Date::Manip's UnixDate(..., "%s") returns for it, or undef
#         for an undefined or empty string.
#Loads Date::Manip on first use.
{
    my $string = shift @_;
    ( defined $string and $string ne "" ) or return undef;
    $string =~ s/\s+\(.*\)\s*$//;
    require Date::Manip;
    my $date = Date::Manip::ParseDate($string);
    return Date::Manip::UnixDate( $date, "%s" );
}

sub convert_date_time_to_epoch($$)
#Input : date "d[d]-m[m]", time "h[h]:mm:ss"
#Output : Pseudo epoch (no year included in input) : seconds counted from the
#         start of a non leap year, days being numbered from 1.
#Returns -1 in case of trouble.
#This function will ALWAYS be BUGGY around new year days
#It is also wrong after Feb 29 of a leap year.
#This is due to year not being logged, which indeed confuses things on such days.
{
    my ( $date, $time ) = @_;
    unless ( $date =~ /^(\d{1,2})-(\d{1,2})$/ )
    {
        return -1;
    }
    my $day   = int($1);
    my $month = int($2);    #int() also removes a leading 0
    unless ( $time =~ /^(\d{1,2}):(\d{2}):(\d{2})$/ )
    {
        return -1;
    }
    my $hour   = int($1);
    my $minute = int($2);
    my $second = int($3);
    ( ( $month >= 1 ) and ( $month <= 12 ) ) or return -1;

    #Days elapsed in the months before this one. Using the length of the
    #month itself would make Feb 1 come before Jan 31.
    my $days_before = 0;
    $days_before += $DAYS_IN_MONTH[$_] foreach ( 0 .. $month - 2 );

    return ( $days_before + $day ) * 24 * 3600
      + $hour * 3600
      + $minute * 60
      + $second;
}

sub fill_zeros($)
#Input : reference to a list of numbers
#Output : same numbers list, each preceded by a zero if originally one digit long
#The referenced list is left untouched.
{
    my $list = shift @_;
    return map { /^\d$/ ? "0" . $_ : $_ } @$list;
}

######################################################################
# Private helpers
######################################################################

# Returns the opening markup for $name ("<br>" in HTML mode, "" otherwise).
sub _open_tag
{
    my ($name) = @_;
    return $html_tags{'starttag'} . $html_tags{$name};
}

# Returns the closing markup for $name ("</b>" in HTML mode, "" otherwise).
sub _close_tag
{
    my ($name) = @_;
    return $html_tags{'endtag'} . $html_tags{$name};
}

# Feeds one sample into the min/max tracker called $name.
sub _track_range
{
    my ( $name, $value ) = @_;
    my $slot = $range{$name};
    unless ( $slot->{'seen'} )
    {
        $slot->{'min'}  = $value;
        $slot->{'max'}  = $value;
        $slot->{'seen'} = 1;
        return;
    }
    $slot->{'min'} = $value if ( $value < $slot->{'min'} );
    $slot->{'max'} = $value if ( $value > $slot->{'max'} );
    return;
}

# Opens a plain or gzip'ed log file. Returns two code references: one giving
# the next line (undef at end of file), one closing the file.
sub _open_log
{
    my ($file) = @_;
    if ( $file =~ /\.gz$/ )    #We have a gz file
    {
        #$gzerrno is the variable exported by Compress::Zlib
        my $gz = gzopen( $file, "r" )
          or die "Cannot open $file : $gzerrno\n";
        return (
            sub {
                my $line;
                return ( $gz->gzreadline($line) > 0 ) ? $line : undef;
            },
            sub { $gz->gzclose(); }
        );
    }
    open( my $handle, '<', $file )
      or die "Unable to open file $file : $!\n";
    return (
        sub { return scalar readline($handle); },
        sub { close $handle; }
    );
}

# Called when a file is exhausted: prints its date range or, with -firstdate,
# widens the global range.
sub _note_file_range
{
    my ( $file, $first_date, $last_date ) = @_;
    if ( $firstdate == 0 )
    {
        print _open_tag('br');
        print "File $file : from $first_date to $last_date\n";
        return;
    }

    #An explicit -startdate overrides whatever the logs contain.
    #NOTE(review): the end of the range is driven by the same flag, so it
    #shows -enddate (today when not given) as soon as -startdate is used.
    if ($startdate_d)
    {
        $firstdate_d = $startdate;
        $lastdate_d  = $enddate;
        return;
    }
    $first_date eq "" and return;    #no parsable line in that file

    #find firstdate
    my $first_epoch = convert_d_t_e($first_date);
    my $known_first = convert_d_t_e($firstdate_d);
    if ( defined $first_epoch
        and ( !defined $known_first or $first_epoch < $known_first ) )
    {
        $firstdate_d = $first_date;
    }

    #find lastdate
    my $last_epoch = convert_d_t_e($last_date);
    my $known_last = convert_d_t_e($lastdate_d);
    if ( defined $last_epoch
        and ( !defined $known_last or $last_epoch >= $known_last ) )
    {
        $lastdate_d = $last_date;
    }
    return;
}

# The mailer accepted a message: remember which message-id its queue ID has.
sub _record_arrival
{
    my ( $queue_id, $tail ) = @_;
    if ( defined $mailer_table{$queue_id} )
    {
        delete $mailer_table{$queue_id};
        if ($debug)
        {
            print _open_tag('br');
            print "INFO: A message \"id\" already existed as $queue_id. Deleted it from mailer_table before renew.\n";
        }
    }

    #Exim specific :-( the message-id is an optional "id=" field
    if ( $mailerlogtype eq 'exim' )
    {
        $mailer_table{$queue_id} =
          ( $tail =~ /^\s*id=(.*)$/ ) ? $1 : $EXIM_NO_ID;
    }
    else
    {
        $mailer_table{$queue_id} = $tail;
    }
    return;
}

# Adds one message with the given spamd verdict to a recipient's counters.
sub _count_for
{
    my ( $recipient, $verdict ) = @_;
    if    ( $verdict eq "spam" )  { $spam{$recipient}++; }
    elsif ( $verdict eq "clean" ) { $clean{$recipient}++; }
    return;
}

# The mailer delivered a message: credit the recipient with the verdict spamd
# gave to that message, if any.
sub _record_delivery
{
    my ( $queue_id, $recipient ) = @_;
    my $message_id = $mailer_table{$queue_id};
    unless ( defined $message_id )
    {
        if ($debug)
        {
            print _open_tag('br') . _open_tag('b');
            print "CRITICAL : Warning : Mailer delivered a message it never received? id $queue_id";
            print _close_tag('b') . "\n";
        }
        return;
    }
    $recipient = lc($recipient);

    #Exim specific code :-(
    #No message-id was logged: fall back on spamd message-ids that contain
    #the Exim queue ID (plain substring test, no regexp interpolation).
    if ( ( $mailerlogtype eq "exim" ) and ( $message_id eq $EXIM_NO_ID ) )
    {
        foreach my $key ( keys %spamd_table )
        {
            index( $key, $queue_id ) >= 0
              and _count_for( $recipient, $spamd_table{$key} );
        }
    }

    #End exim specific code
    if ( defined $spamd_table{$message_id} )
    {
        _count_for( $recipient, $spamd_table{$message_id} );
        if ($agglo_rcpt)
        {
            #One count per mail ID: forget the message after its first delivery
            delete $spamd_table{$message_id};
            delete $mailer_table{$queue_id};
        }
    }
    return;
}

# spamd gave its verdict ("spam" or "clean") for the message it was scanning.
sub _record_verdict
{
    my ( $verdict, $pid, $score, $seconds, $bytes ) = @_;
    if ( defined $spamd_pid{$pid} )
    {
        $spamd_table{ $spamd_pid{$pid} } = $verdict;
        delete( $spamd_pid{$pid} );
    }
    elsif ($debug)
    {
        print _open_tag('br') . _open_tag('b');
        print "CRITICAL : spamd sent an answer for a message it did not receive? pid $pid";
        print _close_tag('b') . "\n";
    }
    my $sum = $tally{$verdict};
    $sum->{'count'}++;
    $sum->{'score'}  += $score;
    $sum->{'time'}   += $seconds;
    $sum->{'volume'} += $bytes;
    if ( defined $minmax )
    {
        _track_range( $verdict . "_score",   $score );
        _track_range( $verdict . "_time",    $seconds );
        _track_range( $verdict . "_msgsize", $bytes );
    }
    return;
}

# Dispatches one log line that passed the time filter. Mailer patterns are
# only tried once the mailer type of the file is known.
sub _process_line
{
    my ($line) = @_;
    my @hit;
    if ( defined $mailerlogtype )
    {
        #Email IN
        if ( @hit = $line =~ $Defs{'mailer_in'}{$mailerlogtype} )
        {
            _record_arrival( $hit[1], $hit[2] );
            return;
        }

        #EMAIL SENT
        if ( @hit = $line =~ $Defs{'mailer_out'}{$mailerlogtype} )
        {
            _record_delivery(@hit);
            return;
        }

        #spamd starts working on a message
        if ( @hit = $line =~ $Defs{'spamd_in'}{$mailerlogtype} )
        {
            $spamd_pid{ $hit[0] } = $hit[1];
            return;
        }
    }

    #Detected as NON spam
    if ( @hit = $line =~ $Defs{'spamd_clean'} )
    {
        _record_verdict( "clean", @hit );
        return;
    }

    #SPAM FOUND
    if ( @hit = $line =~ $Defs{'spamd_spam'} )
    {
        _record_verdict( "spam", @hit );
    }
    return;
}

# Reads one log file from start to end.
sub _process_file
{
    my ($file) = @_;
    undef $mailerlogtype;
    my $first_date = "";
    my $last_date  = "";
    my ( $next_line, $close_log ) = _open_log($file);

    while ( defined( my $line = $next_line->() ) )
    {
        #Syslog header, optionally followed by a known mailer name
        my ( $stamp, $mailer ) = $line =~
          /^\s*([a-zA-Z]{3}\s+\d+\s+\d+:\d+:\d+)\s+\S+\s+(exim|postfix|sendmail|sm-mta)?/;
        unless ( defined $stamp )
        {
            $incorrect_lines++;
            next;
        }
        if ( !defined $mailerlogtype and defined $mailer )
        {
            $mailerlogtype = $mailer;
        }
        $correct_lines++;

        #The date range of the file ignores the time filter
        $first_date eq "" and $first_date = $stamp;
        $last_date = $stamp;

        unless ( $skipstarttest and $skipendtest )
        {
            my $linetime = -1;
            if ( $stamp =~ /^([a-zA-Z]{3})\s+(\d+)\s+(\d+:\d+:\d+)$/ )
            {
                my ( $month_name, $day, $clock ) = ( $1, $2, $3 );
                my $month = $month_number{$month_name};
                defined $month
                  and $linetime =
                  convert_date_time_to_epoch( $day . "-" . $month, $clock );
            }
            next if ( !$skipstarttest and $linetime < $epoch_start );
            next if ( !$skipendtest   and $linetime > $epoch_end );
        }
        _process_line($line);
    }
    $close_log->();
    _note_file_range( $file, $first_date, $last_date );
    return;
}

# Formats $sum / $count on 10 columns, or "n/a" when nothing was counted.
sub _average
{
    my ( $sum, $count ) = @_;
    return ( $count > 0 ) ? sprintf( "%10.2f", $sum / $count ) : "n/a";
}

# Returns a statistics line start: line break, padded label, bold value.
sub _stat_line
{
    my ( $label, $value ) = @_;
    return _open_tag('br')
      . sprintf( "%-40s:", $label )
      . _open_tag('b')
      . $value
      . _close_tag('b');
}

# Returns the extra line carrying min/max details.
sub _minmax_note
{
    my ($text) = @_;
    return "\n" . _open_tag('br') . $html_tags{'vspace'} . $text;
}

# Prints the general statistics section.
sub _print_general_stats
{
    my $nb_spam  = 0;
    my $nb_clean = 0;
    $nb_spam  += $_ foreach ( values %spam );
    $nb_clean += $_ foreach ( values %clean );
    my $nb_total = $nb_spam + $nb_clean;
    my $scanned  = $tally{'spam'}{'count'} + $tally{'clean'}{'count'};
    my %wording  = ( 'spam' => "spam", 'clean' => "clean message" );

    #output if firstdate is called
    if ( $firstdate == 1 )
    {
        print _open_tag('br');
        print "Statistic from $firstdate_d to $lastdate_d\n";
    }

    #General stats (counted per recipient)
    print _open_tag('br') . _open_tag('br');
    print "Total number of emails processed by the spam filter : "
      . _open_tag('b')
      . $nb_total
      . _close_tag('b') . "\n";
    foreach my $row ( [ "Number of spams", $nb_spam ],
        [ "Number of clean messages", $nb_clean ] )
    {
        my ( $label, $count ) = @$row;
        print _open_tag('br');
        print _open_tag('b');
        if ( $nb_total > 0 )
        {
            printf( "%-40s:%10d (%6.2f%%)\n",
                $label, $count, 100 * $count / $nb_total );
        }
        else
        {
            printf( "%-40s:%10s\n", $label, "n/a" );
        }
        print _close_tag('b');
    }

    #Analysis time
    print _stat_line(
        "Average message analysis time",
        _average( $tally{'spam'}{'time'} + $tally{'clean'}{'time'}, $scanned )
      )
      . " seconds\n";
    foreach my $kind ( 'spam', 'clean' )
    {
        print _stat_line(
            "Average $wording{$kind} analysis time",
            _average( $tally{$kind}{'time'}, $tally{$kind}{'count'} )
          )
          . " seconds";
        if ( defined $minmax )
        {
            my $slot = $range{ $kind . "_time" };
            print _minmax_note(
                "\t(min time = $slot->{'min'}, max time = $slot->{'max'})");
        }
        print "\n";
    }

    #Scores
    #Spam with multiple recipients count only as one in the average...
    print _stat_line(
        "Average message score",
        _average( $tally{'spam'}{'score'} + $tally{'clean'}{'score'}, $scanned )
      )
      . "\n";
    foreach my $kind ( 'spam', 'clean' )
    {
        print _stat_line(
            "Average $wording{$kind} score",
            _average( $tally{$kind}{'score'}, $tally{$kind}{'count'} )
        );
        if ( defined $minmax )
        {
            my $slot = $range{ $kind . "_score" };
            print _minmax_note(
                "\t\t(min score = $slot->{'min'}, max score = $slot->{'max'})");
        }
        print "\n";
    }

    #Volumes
    foreach my $kind ( 'spam', 'clean' )
    {
        my ( $volume, $unit ) = unify( $tally{$kind}{'volume'}, "bytes" );
        print _stat_line( "Total $kind volume", sprintf( "%10d ", $volume ) )
          . $unit;
        if ( defined $minmax )
        {
            my $slot = $range{ $kind . "_msgsize" };
            my ( $min, $unitmin ) = unify( $slot->{'min'}, "bytes" );
            my ( $max, $unitmax ) = unify( $slot->{'max'}, "bytes" );
            print _minmax_note(
                sprintf(
                    "\t(min size = %2d %s, max size = %2d %s)",
                    $min, $unitmin, $max, $unitmax
                )
            );
        }
        print "\n";
    }
    return;
}

# Prints the recipients having received most spams. Recipients sharing a
# count are listed together, so more than $number lines may be printed.
sub _print_top_recipients
{
    my %stats = ();    # number of spams => list of recipients
    foreach my $key ( keys %spam )
    {
        push @{ $stats{ $spam{$key} } }, $key;
    }

    print _open_tag('br');
    print _open_tag('br');
    print "Recipients with highest number of spams : (top $number)\n";
    my $remaining = $number;
    foreach my $key ( sort { $b <=> $a } keys %stats )
    {
        $remaining <= 0 and last;
        print _open_tag('br');
        print $key . " spams : \n";
        foreach my $email ( sort @{ $stats{$key} } )
        {
            print _open_tag('br') . $html_tags{'vspace'};
            print "\t" . $email . "\n";
            $remaining--;
        }
    }
    return;
}

######################################################################
# Program
######################################################################
sub main
{
    my @infiles = ();
    GetOptions(
        "file=s"           => \@infiles,
        "number=i"         => \$number,
        "help"             => \$help,
        "debug"            => \$debug,
        "noabsolute"       => \$noabsolute,
        "nogeneral"        => \$nogeneral,
        "html"             => \$html,
        "minmax"           => \$minmax,
        "startdate=s"      => \$startdate,
        "enddate=s"        => \$enddate,
        "firstdate"        => \$firstdate,
        "starttime=s"      => \$starttime,
        "endtime=s"        => \$endtime,
        "duration=s"       => \$duration,
        "agglo-recipients" => \$agglo_rcpt
    );
    push @infiles, @ARGV if (@ARGV);
    my $defmaillog = '/var/log/mail.log';
    push @infiles, $defmaillog if ( $#infiles == -1 && -f $defmaillog );

    if ($help)
    {
        Print_Usage();
        exit 0;
    }

    #Sanity checks
    unless ( check_date($startdate) == 0 )
    {
        print STDERR "Bad input format start date was entered\n";
        $error++;
    }
    unless ( check_date($enddate) == 0 )
    {
        print STDERR "Bad input format end date was entered\n";
        $error++;
    }
    unless ( check_time($starttime) == 0 )
    {
        print STDERR "Bad input format start time was entered\n";
        $error++;
    }
    unless ( check_time($endtime) == 0 )
    {
        print STDERR "Bad input format end time was entered\n";
        $error++;
    }
    unless ( $duration =~ /^\d+$/ )
    {
        print STDERR "Bad input : duration is supposed to be numeric\n";
        $error++;
    }

    $html and %html_tags = (
        'br'       => 'br>',
        'b'        => 'b>',
        'i'        => 'i>',
        'html'     => 'html>',
        'body'     => 'body>',
        'endtag'   => '</',
        'starttag' => '<',
        'vspace'   => ''
    );

    #Files are kept in command line order (logs are normally given oldest
    #first), each one only once.
    my @logfiles = ();
    my %listed   = ();
    foreach my $file (@infiles)
    {
        $listed{$file}++ and next;

        #NOTE(review): this test is not anchored, so it only rejects names
        #made of none of these characters. Files are opened with the three
        #argument open, which does not interpret the name.
        unless ( $file =~ /[a-zA-Z\.\/ \\0-9]+/ )
        {
            die "Illegal characters read in parameter file name!\n";
        }
        unless ( -f $file )
        {
            #Missing files are skipped, but no longer silently
            print STDERR "$file : File does not exist!\n";
            next;
        }
        if ( ( $file !~ /^\// ) and ( $noabsolute == 0 ) )
        {
            print STDERR "$file : Path to file must be absolute, or you must specify the \"-noabsolute\" option\n";
            $error++;
        }
        push @logfiles, $file;
    }

    if ( ( $starttime eq "none" ) and ( $startdate eq "none" ) )
    {
        $skipstarttest = 1;
    }
    if ( ( $enddate eq "none" ) and ( $endtime eq "none" ) )
    {
        $skipendtest = 1;
    }
    if (    ( $duration > 0 )
        and ( $skipstarttest == 0 )
        and ( $skipendtest == 0 ) )
    {
        print STDERR "Input redundancy : You may not specify starttime, endtime and duration\n";
        $error++;
    }
    $error and exit 1;

    #Fail before reading any log if -firstdate cannot work
    $firstdate and require Date::Manip;

    #Defaults of the time filter
    if ( $startdate eq 'none' )
    {
        my ( $day, $month ) = (localtime)[ 3, 4 ];
        $month++;
        $startdate   = $day . "-" . $month;
        $startdate_d = 0;
    }
    if ( $enddate eq 'none' )
    {
        my ( $day, $month ) = (localtime)[ 3, 4 ];
        $month++;
        $enddate = $day . "-" . $month;
    }
    $starttime eq 'none' and $starttime = '00:00:00';
    if ( $endtime eq 'none' )
    {
        my @tab = (localtime)[ 0, 1, 2 ];
        $endtime = join( ':', reverse( fill_zeros( \@tab ) ) );
    }

    print _open_tag('html') . _open_tag('body');

    $epoch_start = convert_date_time_to_epoch( $startdate, $starttime );
    $epoch_end   = convert_date_time_to_epoch( $enddate,   $endtime );

    #A duration replaces the missing bound (the lower one when both miss)
    if ( ( $duration > 0 ) and ( $skipstarttest or $skipendtest ) )
    {
        if   ($skipstarttest) { $epoch_start = $epoch_end - $duration; }
        else                  { $epoch_end   = $epoch_start + $duration; }
        $duration      = 0;
        $skipstarttest = 0;
        $skipendtest   = 0;
    }
    $epoch_start > $epoch_end
      and print STDERR "WARNING : time filter seems incorrect : it starts after it ends! $epoch_start > $epoch_end\n";

    #Processing
    foreach my $file (@logfiles)
    {
        _process_file($file);
    }

    _print_general_stats() unless ($nogeneral);

    #Top spammed addresses
    _print_top_recipients() if ($number);

    if (   ( $correct_lines == 0 )
        or ( ( $incorrect_lines / $correct_lines ) > 0.1 ) )
    {
        print _open_tag('br');
        print _open_tag('br');
        print "INFO: It seems at least one input file contains other things that {exim/postfix} or spamd lines!\n";
    }
    print _close_tag('body');
    print _close_tag('html');
    return;
}

# Run only when started as a program, so the subs can be loaded on their own.
main() unless caller;

1;