#!/usr/bin/perl -Tw require 5.006_001; use strict; # This is a fork of Tom Anderson's spamitarium.pl by Jonathan Kamens # . The changes here have been submitted back to Tom Anderson # for incorporation into his master version. In the meantime, you may wish to # use this version instead. Here are the changes in it: # # * Command-line parsing has been updated to use the standard Perl Getopt::Long # syntax, while continuing to support the "bundled" options syntax supported # by previous versions of the script. # * Options have been added to specify information that may be missing from the # message header, specifically for when spamitarium is being called from a # Milter and therefore the message does not yet have the Return-Path or local # Received lines that will be inserted by the MTA before final delivery. See # the documentation below for the options --return-path, --no-local-received, # --remote-ip, --remote-name, --helo, --local-ip, --local-name, --rcpt, and # --add-local-received. # * A --timeout option has been added for specifying how long the script should # wait for input, or 0 to disable the timeout completely. This is primarily # useful for debugging the script. # * The header parsing code has been refactored to be cleaner and more robust. # * In particular, empty header fields are now handled correctly (previously, # they were appended to the previous header field!). # * Empty header fields are now included in the output, for more accurate # bogofilter'ing. # * Typos and such have been cleaned up in the documentation. # * A date-parsing bug which was causing the time zone to be ignored, thus # causing the X-Date-Check header to report an inaccurate delta, has been # fixed. # * A bug which could cause spamitarium to crash upon encountering a far into # the future date in the message header has been fixed. # * A bug which was causing some domain names to be truncated (e.g., # "omta01-mdp.westchester.pa.bo.comcast.net" became "omta01-mdp.west") has # been fixed. # * A couple of Perl uninitialized variable warnings have been fixed. # * Mail::SPF is used now instead of Mail::SPF::Query, for IPv6 support. # * Code has been added to work around the fact that some emails generated by # Constant Contact have CR rather than CRLF at the end of their Date: header # lines (this is a very specific workaround for a very specific problem # because technically the workaround is a violation of the SMTP RFC, though # nobody else seems to care about that. :-/). =head1 NAME Spamitarium - evaluates and repairs the sanity of email headers... =cut my $version = "0.5.2"; ################################################ ############### Copyleft Notice ################ ################################################ # Copyright © 2004 Order amid Chaos, Inc. # Author: Tom Anderson # neo+spamitarium@orderamidchaos.com # # This program is open-source software; you can redistribute it # and/or modify it under the terms of the GNU General Public # License, v2, as published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this program; if not, write to: # # Free Software Foundation # 59 Temple Place, Suite 330 # Boston, MA 02111-1307 USA # # http://www.gnu.org/ ################################################# ################# Documentation ################# ################################################# # use "perldoc spamitarium" or "spamitarium -h" to read this =head1 SYNOPSIS =head2 Command line usage: B [I] < [I] =head2 Procmail usage (recommended): Add to your .procmailrc the following recipe: :0 { :0 fhw | spamitarium -sreadxtp # filter through bogofilter, tagging as spam # or not and updating the word lists :0 fw | bogofilter -uep # add back the "From" header for proper delivery :0 fhw | formail -I "From " -a "From " } =head2 Command line options: =over 4 =item B<-h> display this help file =item B<-s> [I] Allow standard fields only (RFC 822/2822/1049/1341/1521/2183/1864) ... all others are stripped from the email. To exclude some headers from stripping, specify a comma-separated list as an argument to this option. =item B<-r> insert new received line containing verified received-line tokens =item B<-e> include helo string in received line =item B<-d> allow DNS lookups (forward and reverse) to help fill in all necessary received fields =item B<-f> force rDNS lookups even when provided already by the MTA =item B<-a> perform ASN lookups and include in received lines =item B<-p> perform SPF lookups and include in received lines =item B<-x> include custom x-headers for additional header validations: =item B<-t> validate that the date header is within close proximity to the received date (see $date_limit global variable to configure) =item B<-w> parse and display the body of the email in addition to the headers =item B<-b> display benchmarking info =item B<--return-path> I Specify the sender of the email for SPF lookups, overriding the "Return-Path" field. =item B<--no-local-received> Tell spamitarium that there is no Received line in the message from the local mailer, e.g., because spamitarium is being called from inside a Milter. The following options are only used when B<--no-local-received> is specified. =over 4 =item B<--remote-ip> I Specify the IP address of the remote server sending the email. =item B<--remote-name> I Specify the host name of the remote server sending the email, as determined by a reverse DNS lookup of its IP address. =item B<--helo> I Specify the host name sent by the remote server in the SMTP HELO (or EHLO) command. =item B<--local-ip> I Specify the IP address of the server receiving the email. =item B<--local-name> I Specify the host name of the server receiving the email. =item B<--rcpt> I Specify the envelope address of the recipient of the email. =item B<--add-local-received> Indicate that spamitarium should add an auto-generated local Received line. =back =item B<--timeout> I How long to wait for input before giving up. Specify 0 to disable the timeout completely. Primarily for debugging purposes. =back =head2 User-defined header field list: If using the B option, you may want to allow certain header fields other than those specified by RFCs. These might include fields set by your mailing list or proxy or other custom application. You may specify such a list of fields by appending them comma-delimited at the end of your command line. For example, if you wanted the I and I fields passed through, you would change your procmail recipe as follows: | spamitarium -readxtp -s list-id,encrypted =head1 REQUIRES =over 4 =item * Perl 5.6.1 Net::DNS::Resolver Mail::SPF Net::CIDR DB_File POSIX Getopt::Long =back =head1 DESCRIPTION Spamitarium helps to remove unnecessary noise from email headers and to highlight just the portions which contribute positively to spam filtering using statistical methods. The only non-spoofable, -forgable, or -tweakable part of an email is the received line, as it is generated by the receiving mail server which ought to have no reason to munge it. Every other part of an email can be influenced directly by the sender. Received line tokens, when verified authentic, are therefore highly indicative of whether or not a given message is spam. Spamitarium reads the received headers, determines which ones are authentic, and then prints tokens into the header which may be keyed on by statistical filters. This works much like a blacklist/whitelist, but when coupled with a statistical filter such as bogofilter, these lists are automatically generated and require no manual maintenance other than normal training. Moreover, headers which do not directly influence the email in any functional way, nor are visible to the end-user in a standard graphical MUA, are highly likely to contain information which spammers think will detract from normal statistical filtering. It is therefore desireable to remove these elements, specifically X-headers, prior to filtering. Spamitarium removes all invisible, non-functional header lines. Spamitarium also looks up any IP addresses or rDNS addresses which are not provided in order to provide the maximum tokens on which to filter. Moreover, it looks up the autonomous system number (ASN) associated with each "from" address in order to provide a small set of tokens representing the various major subnets of the internet. And it checks the Sender Policy Framework (SPF) records of the sender to ensure that the given MX has permission to send on their behalf. Finally, Spamitarium assesses the headers for missing required header lines, inserting keyable tokens or supplying the missing information. And it compares the date fields to determine if the email has been pre- or post-dated by a large margin in order to influence where it appears in your mail client and inserts an x-header with keyable range tokens to compensate for this. Together, all of these techniques help to remove the noise which accompanies, either incidentally or maliciously, most email messages. This results in a cleaner header consisting of more easily scored tokens. This permits better accuracy with statistical filters as well as quicker processing and a smaller token database. =head1 FAQ =head2 Ask a question Ye may receive an answer here if it is asked frequently =head1 BUGS =over 4 =item * timegm($sec,$min,$hour,$day,$mon,$year) aborts if Perl's time_t is 32 bits large and the year is too high (>2038). =back =head1 TODO =over 4 =item * Suggestions welcome. =back =head1 SEE ALSO =over 4 =item * L =item * L =back =head1 AUTHOR Tom Anderson Jonathan Kamens =cut ################################################# ############### User Variables ################# ################################################# # please edit according to your setup # default path our $path = "/bin:/usr/bin:/usr/local/bin"; # default shell our $shell = "/bin/sh"; # seconds before we bail waiting on input our $timeout = 3; # server to use for ASN lookups our $asn_server = "asn.routeviews.org"; # Whitelist any IP addresses or ranges from SPF lookups our @whitelist = ("127.0.0.1","192.168.0.1-192.168.0.255"); # If you want to whitelist any addresses which have authenticated # via poprelayd (i.e. remote workstations of users on your server) # set $dbfile to your popip.db location, else set it to undef #our $dbfile = "/etc/mail/popip.db"; our $dbfile = undef; # distance in seconds from right now to consider a reasonable (non-spam) range to date an email our $date_limit = 60*60*24*2; # 2 days # EMAIL HEADER FIELDS # # See RFC 2076 / "Common Internet Message Header Fields" for a synopsis of common mail headers # SPECIFIED FIELDS -- all of the fields specified in RFC 822/2822, case-insensitive, in the suggested order our $spec_fields = "return-path,received,resent-date,resent-from,resent-sender,resent-reply-to,". "resent-to,resent-cc,resent-bcc,resent-message-id,date,from,sender,reply-to,". "to,cc,bcc,message-id,in-reply-to,references,subject,comments,keywords,encrypted"; # MIME header fields (RFC 1049/1341/1521/2183) $spec_fields .= ",mime-version,content-type,content-transfer-encoding,content-id,content-description,content-disposition"; # security/checksum (RFC 1864) $spec_fields .= ",content-md5"; # mailing list headers (RFC 2369/2919) may be added if you like, but for now I'm choosing to leave them out #$spec_fields .= ",list-id,list-help,list-unsubscribe,list-subscribe,list-post,list-owner,list-archive"; # MASKED FIELDS -- unnecessary fields often used for spam will be expunged from the spec fields list # (if you know of a valid, necessary use for these, let me know) our $masked_fields = "keywords,comments,encrypted,content-id,content-description"; # controversial and not strictly necessary: #$masked_fields .= ",reply-to"; # message-id fields are only machine-readable and not visible to nor readable by the recipient # however, they can be useful if your client produces discussion threading # uncomment this line if you don't care about threading: #$masked_fields .= ",message-id,resent-message-id,in-reply-to,references"; # resent fields are strictly informational (and not generally user-visible), therefore allowing them through is optional: # MIME specifies a different way of resending messages with the "Message" content-type, so these may be considered deprecated: $masked_fields .= ",resent-date,resent-from,resent-sender,resent-reply-to,resent-to,resent-cc,resent-bcc,resent-message-id"; # USER FIELDS -- User fields are those that are neither specified nor masked that you want permitted. # These may include special fields for your particular mail server, filter, or mail user agent. our $user_fields = ""; # NEW FIELDS -- New custom x-headers added by Spamitarium (it is recommend that you don't change these). # These are disabled unless you pass the 'x' option. our $new_fields = "x-date-check,x-spf"; # REQUIRED FIELDS -- Any fields that should show up in an email even if they are not sent -- i.e. if the lack of # these fields may be useful for the filter, a no-req-field tag will be added. The only *required* fields according to # RFC 2822 are "from", "sender", "reply-to", and "date", others are just suggested. However, "sender" and "reply-to" are # commonly not supplied, and so should probably not be in this list. On the other hand, "subject" and a few others may # be desired in this list. our $req_fields = "received,from,to,date,subject"; # of course, modify the first line of this file, # the shebang, to point to your perl interpreter # do not edit below this line unless you really # know what you're doing ################################################# ############## Include Libraries ################ ################################################# use Benchmark; use Data::Dumper; use File::Basename; use Time::Local; use Net::DNS::Resolver; use Mail::SPF; use Net::CIDR; use DB_File; use POSIX; use Getopt::Long; use Sys::Syslog; ################################################# ############## Default Globals ################## ################################################# $> = $<; # set effective user ID to real UID $) = $(; # set effective group ID to real GID # Make %ENV safer delete @ENV{qw(IFS CDPATH ENV BASH_ENV PATH SHELL)}; # Set the environment explicitly $ENV{PATH} = $path; $ENV{SHELL} = $shell; # options flags our $options = ""; our $return_path = ""; our $opt_remote_ip = ""; our $opt_remote_name = ""; our $opt_local_ip = ""; our $opt_local_name = ""; our $opt_helo = ""; our $opt_rcpt = ""; our $no_local_received = undef; our $add_local_received = undef; # define the control-linefeed syntax for this system our $CRLF = "\n"; #($^O=~/VMS/i)? "\n": # VMS #("\t" ne "\011")? "\r\n": # EBCDIC # "\015\012"; # others # DNS query options our $res = Net::DNS::Resolver->new( nameservers => [qw(127.0.0.1)], udp_timeout => 2, retry => 1, #debug => 1 ); # convert whitelist into CIDR notation our @cidr_list = (); foreach my $IP (@whitelist) { if (not eval {@cidr_list = Net::CIDR::cidradd ($IP, @cidr_list)}) { error("warn","Error processing whitelist: \"$IP\" is not a valid IP address or range."); } } ################################################ ##################### Main ##################### ################################################ openlog(basename $0, 'mail'); # process options sub usage { my $spamitarium = $1 if $0 =~ /^([\w\/.\-~]*)$/; system("perldoc $spamitarium"); } # Allow bundling of first set of options, as well as optional hyphen. if (@ARGV and $ARGV[0] =~ /^-?[hrdfasebwxtp]+$/) { if ($ARGV[0] =~ s/s//) { splice(@ARGV, 1, 0, "-s"); } $ARGV[0] =~ s/^([^-])/-$1/; Getopt::Long::Configure("bundling"); } if (! GetOptions("help|h" => sub { &usage; exit(0); }, "r" => sub { $options .= "r"; }, # process received headers "d" => sub { $options .= "d"; }, # perform domain lookups where needed "f" => sub { $options .= "f"; }, # force RDNS lookups even where MTA provided "a" => sub { $options .= "a"; }, # perform ASN lookups "s:s" => sub { $options .= "s"; # standard fields only (strip others) $user_fields = $_[1]; }, "e" => sub { $options .= "e"; }, # include the helo received field in output "b" => sub { $options .= "b"; }, # output benchmarking info "w" => sub { $options .= "w"; }, # process whole email (including body) "x" => sub { $options .= "x"; }, # insert custom x-header fields "t" => sub { $options .= "t"; }, # perform date range checks "p" => sub { $options .= "p"; }, # perform SPF lookups "return-path|returnpath=s" => \$return_path, "remote-ip|remoteip=s" => \$opt_remote_ip, "remote-name|remotename=s" => \$opt_remote_name, "local-ip|localip=s" => \$opt_local_ip, "local-name|localname=s" => \$opt_local_name, "helo=s" => \$opt_helo, "rcpt=s" => \$opt_rcpt, "no-local-received|nolocalreceived" => \$no_local_received, "add-local-received|addlocalreceived" => \$add_local_received, "timeout=i" => \$timeout, # 0 to disable timeout )) { &usage; exit(1); } # open popip database for reading our %db; &opendb_read if $dbfile; # start timing the process my $start_time = new Benchmark if $options =~ /b/; my ($start_parse, $end_parse, $start_rcvd, $end_rcvd, $start_set, $end_set); # get STDIN and process the email eval { # set an alarm so that we don't hang on an empty STDIN local $SIG{ALRM} = sub { die "timeout" }; if ($timeout > 0) { alarm $timeout; } # parse the header $start_parse = new Benchmark if $options =~ /b/; my ($header,$parse_benchmark) = parse_header(); $end_parse = new Benchmark if $options =~ /b/; # cancel timeout if we got this far alarm 0; # default date if none provided unless (defined $header->{'date'}) { $header->{'date'}->[0]->{'name'} = "Date"; $header->{'date'}->[0]->{'value'} = gmtime time; } # process the received lines if ($options =~ /r/) { $start_rcvd = new Benchmark if $options =~ /b/; $header->{'received'} = process_rcvd($header->{'received'},$return_path || ($header->{'return-path'} && $header->{'return-path'}->[0]->{'value'})); $end_rcvd = new Benchmark if $options =~ /b/; } # add new custom header fields if ($options =~ /x/) { if ($options =~ /t/) { $header->{'x-date-check'}->[0]->{'name'} = "X-Date-Check"; $header->{'x-date-check'}->[0]->{'value'} = date_check($header->{'date'}->[0]->{'value'},$header->{'received'} && $header->{'received'}->[0]->{'date'}); } if ($options =~ /p/) { for (my $x = 0; $x < scalar @{$header->{'received'}}; $x++) { if (defined $header->{'received'}->[$x]->{'spf'} && $header->{'received'}->[$x]->{'spf'} =~ /\w/) { push(@{$header->{'x-spf'}}, {'name' => "X-SPF", 'value' => $header->{'received'}->[$x]->{'spf'}}); } } } } # This needs to be done after the x-spf headers are generated # above, so that an x-spf header is generated for the # auto-generated local Received line. if ($no_local_received and not $add_local_received) { splice(@{$header->{'received'}}, 0, 1); } # output the new header containing the changes $start_set = new Benchmark if $options =~ /b/; print set_header($header); $end_set = new Benchmark if $options =~ /b/; # add the body if desired print parse_body() if $options =~ /w/; }; # propagate errors die if $@ && $@ !~ /timeout/i; # print timeout message if ($@ =~ /timeout/i) { error("die","Timed out... make sure to supply an email for processing. Try 'spamitarium -h' for details.\n"); } # calculate total running time if ($options =~ /b/) { my $end_time = new Benchmark; my $td = timediff($end_time, $start_time); my $usr = $td->[1]+$td->[3]; my $sys = $td->[2]+$td->[4]; my $cpu = $usr+$sys; my $wall = $td->[0]; print "Total running time was $wall wallclock secs; $usr usr + $sys sys = $cpu CPU secs.$CRLF"; $td = timediff($end_parse, $start_parse); $usr = $td->[1]+$td->[3]; $sys = $td->[2]+$td->[4]; $cpu = $usr+$sys; $wall = $td->[0]; print "Input parsing time was $wall wallclock secs; $usr usr + $sys sys = $cpu CPU secs.$CRLF"; if ($options =~ /r/) { $td = timediff($end_rcvd, $start_rcvd); $usr = $td->[1]+$td->[3]; $sys = $td->[2]+$td->[4]; $cpu = $usr+$sys; $wall = $td->[0]; print "Received line processing time was $wall wallclock secs; $usr usr + $sys sys = $cpu CPU secs.$CRLF"; } $td = timediff($end_set, $start_set); $usr = $td->[1]+$td->[3]; $sys = $td->[2]+$td->[4]; $cpu = $usr+$sys; $wall = $td->[0]; print "Rebuilding email time was $wall wallclock secs; $usr usr + $sys sys = $cpu CPU secs.$CRLF"; } # close popip database &closedb if $dbfile; exit(0); ################################################ ################ Parse Header ################# ################################################ sub parse_header { local($_); my $header_text = ""; while () { last if (/^\r?\n$/); $header_text .= $_; } # This is really gross. There is a certain prominent email marketing # company whose software has a bug in it which causes Date: headers to # sometimes be terminated with just CR rather than CRLF. If we interpret # RFC 5321 section 2.3.8 strictly, then we're required to treat such a # Date: header and the one following it as a single header field, but # strict adherence to the RFC when that results in obviously broken # behavior is not the best approach. On the other hand, when we're straying # from the RFC, we want to do so as minimally as possible. Therefore, what # we are doing here is checking specifically for this exact problem -- # Date: headers ending with CR rather than CRLF -- and correcting for just # that one, limited case. # This handles input with both CRLF and LF line terminators. $header_text =~ s/^(Date:.*)\r([^\n].*(.)\n)/$1$3\n$2/m; my(@headers) = split(/\n\b/, $header_text); my $header = {}; my $last_header = undef; for (@headers) { s/\s+$//; s/\s+/ /g; # collapse whitespace if (s/^(\S+):\s*//) { my $name = $1; my $tag = $name; $tag =~ tr/A-Z/a-z/; # header names are case-insensitive if (! $_ and $header->{$tag} and grep(! $_->{'value'}, @{$header->{$tag}})) { # If an empty header field is repeated multiple times, we only # need to preserve one of them. next; } push(@{$header->{$tag}}, {'name' => $name, 'value' => $_}); $last_header = $header->{$tag}->[-1]; } else { # What's the right thing to do here? Either there's no colon or # there's whitespace before the colon, both of which are RFC # violations. Our best guess is to append this to the previous # header. if (! $last_header) { error("warn", "Bad initial header line '$_' ignored\n"); } else { error("warn", "Bad header line '$_' appended to preceding '" . $last_header->{'name'} . "' header field"); $last_header->{'value'} .= " " . $_; } } } return $header; } sub date_check { my ($date,$rcvd) = shift; my ($dow, $day, $mon, $year, $hour, $min, $sec, $rmdr) = "?"; if ($date =~ /\s*?(\w{1,9}),?\s+?(\d+?)\s+?(\w{3})\s+?(\d{4})\s+?(\d{1,2}):(\d{2}):(\d{2})(.*)/i) { $dow=$1; $day=$2; $mon=$3; $year=$4; $hour=$5; $min=$6; $sec=$7; $rmdr=$8; $mon = $mon=~/Dec/i?11:$mon=~/Nov/i?10:$mon=~/Oct/i?9:$mon=~/Sep/i?8:$mon=~/Aug/i?7:$mon=~/Jul/i?6:$mon=~/Jun/i?5:$mon=~/May/i?4:$mon=~/Apr/i?3:$mon=~/Mar/i?2:$mon=~/Feb/i?1:0; eval { $date = timegm($sec,$min,$hour,$day,$mon,$year); }; if ($@) { return "date-out-of-range (overflow)"; } # adjust for local time if ($rmdr =~ /\+\d(\d)\d\d/) { $date -= $1 * 60 * 60; } if ($rmdr =~ /\-\d(\d)\d\d/) { $date += $1 * 60 * 60; } } else { return "date-format-unknown"; } if ($rcvd && $rcvd =~ /\s*?(\w{1,9}),?\s+?(\d+?)\s+?(\w{3})\s+?(\d{4})\s+?(\d{1,2}):(\d{2}):(\d{2})(.*)/i) { $dow=$1; $day=$2; $mon=$3; $year=$4; $hour=$5; $min=$6; $sec=$7; $rmdr=$8; $mon = $mon=~/Dec/i?11:$mon=~/Nov/i?10:$mon=~/Oct/i?9:$mon=~/Sep/i?8:$mon=~/Aug/i?7:$mon=~/Jul/i?6:$mon=~/Jun/i?5:$mon=~/May/i?4:$mon=~/Apr/i?3:$mon=~/Mar/i?2:$mon=~/Feb/i?1:0; eval { $rcvd = timegm($sec,$min,$hour,$day,$mon,$year); }; if ($@) { return "date-out-of-range (overflow)"; } # adjust for local time if ($rmdr =~ /\+\d(\d)\d\d/) { $rcvd -= $1 * 60 * 60; } if ($rmdr =~ /\-\d(\d)\d\d/) { $rcvd += $1 * 60 * 60; } } else { $rcvd = time; } # check for range +/- my $diff = $rcvd - $date; my $diff_days = round($diff/(60*60*24)); if (($diff < $date_limit) and ($diff > $date_limit * -1)) { return "date-in-range ($diff_days days)"; } else { return "date-out-of-range ($diff_days days)"; } } sub round { my $num = shift; return int(($num*100)+0.5)/100; } ################################################ ################# Parse Body ################## ################################################ sub parse_body { # this function is really only used for # email-to-email comparisons, where processing # the entire email is required... usually # we'll just process the header my $body = ""; while () { $body .= $_; } return $body; } ################################################ ########### Process Received Lines ############ ################################################ sub process_rcvd { my $rcvd = shift; my $rtrn = shift; # heuristics my $LUSER = qr~(?:\w|-|\.)+?~; my $DOMAIN = qr~(?:\w|-|\.)+\.\w{2,4}~; my $IP = qr~(?:\d{1,3}\.){3}\d{1,3}~; my $EMAIL = qr~$LUSER\@$DOMAIN~; my $HELO = qr~[^\s\0\/\\\#]+?~; my $RDNS = qr~(?:$DOMAIN|\[?$IP\]?|unknown|unverified)~; my $edge_ip = ""; my $untrusted = 0; if ($no_local_received) { my $local_rcvd; $local_rcvd->{'value'} = "auto-generated"; $local_rcvd->{'name'} = "Received"; $local_rcvd->{'sane'} = "trusted"; $local_rcvd->{'rdns'} = $opt_remote_name || ""; $local_rcvd->{'ipad'} = $opt_remote_ip || ""; $local_rcvd->{'mtan'} = $opt_local_name || ""; $local_rcvd->{'mtai'} = $opt_local_ip || ""; $local_rcvd->{'helo'} = $opt_helo || ""; $local_rcvd->{'fore'} = $opt_rcpt || ""; $local_rcvd->{'from'} = $return_path || ""; splice(@{$rcvd}, 0, 0, $local_rcvd); } # check if we were passed a valid array of received lines unless ((defined $rcvd) && (ref($rcvd) eq "ARRAY") && $rcvd->[0]->{'value'}) { no strict 'refs'; my %rcvd_hash = ('value' => "from localhost; " . gmtime time, 'name' => "Received"); my @rcvd_array; $rcvd_array[0] = \%rcvd_hash; $rcvd = \@rcvd_array; } else { # iterate through each received header, parsing and validating the info for (my $x = 0; $x < scalar @$rcvd; $x++) { # skip processing if we already lost confidence in this trail of received lines #if ($untrusted) { $rcvd->[$x]->{'sane'} = "untrusted"; next; } my $helo = $rcvd->[$x]->{'helo'} || ""; my $ipad = $rcvd->[$x]->{'ipad'} || ""; my $rdns = $rcvd->[$x]->{'rdns'} || ""; my $from = $rcvd->[$x]->{'from'} || ""; my $mtan = $rcvd->[$x]->{'mtan'} || ""; my $mtai = $rcvd->[$x]->{'mtai'} || ""; my $fore = $rcvd->[$x]->{'fore'} || ""; my $idnt=""; my $mtav=""; my $with=""; my $date=""; # try to take into account all known MTA formats if ($rcvd->[$x]->{'value'} =~ s/\(envelope-(?:sender|from) <($EMAIL)>\)//gis) { $from=$1; }# print "X-$x-matched-01: from=$from, remaining=$rcvd->[$x]->{'value'} $CRLF"; } if ($rcvd->[$x]->{'value'} =~ s/;\s+?(\w{3}, \d{1,2} \w{3} \d{2,4}.*?)$//gis) { $date=$1; }# print "X-$x-matched-02: date=$date, remaining=$rcvd->[$x]->{'value'} $CRLF"; } if ($rcvd->[$x]->{'value'} =~ s/for\s+??(?: \(single-drop\))?//gis) { $fore=$1; }# print "X-$x-matched-03: fore=$fore, remaining=$rcvd->[$x]->{'value'} $CRLF"; } if ($rcvd->[$x]->{'value'} =~ s/by\s+?(\S+?) \(($IP)\) \((.*?)\)//gis) { $mtan=$1; $mtai=$2; $mtav=$3; }# print "X-$x-matched-04: mtan=$mtan, mtai=$mtai, mtav=$mtav, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/by\s+?(\S+?) \[($IP)\]//gis) { $mtan=$1; $mtai=$2; }# print "X-$x-matched-05: mtan=$mtan, mtai=$mtai, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/by\s+?(\S+?) \((.+?)\)//gis) { $mtan=$1; $mtav=$2; }# print "X-$x-matched-06: mtan=$mtan, mtav=$mtav, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/by\s+?($IP)(?=\W|;|$)//gis) { $mtai=$1; }# print "X-$x-matched-07: mtai=$mtai, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/by\s+?($DOMAIN)(?=\W|;|$)//gis) { $mtan=$1; }# print "X-$x-matched-08: mtan=$mtan, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/by\s+?(\S+?)(?=\W|;|$)//gis) { $mtan=$1; }# print "X-$x-matched-09: mtan=$mtan, remaining=$rcvd->[$x]->{'value'} $CRLF"; } if ($rcvd->[$x]->{'value'} =~ s/(?:with)\s+?(\S+?) \((.*?)\)//gis) { $with=$1; $mtav=$2 if !$mtav; }# print "X-$x-matched-10: with=$with, mtav=$mtav, remaining=$rcvd->[$x]->{'value'} $CRLF";} elsif ($rcvd->[$x]->{'value'} =~ s/(?:with)\s+?(\S+?)(?=\W|;|$)//gis) { $with=$1; }# print "X-$x-matched-11: with=$with, remaining=$rcvd->[$x]->{'value'} $CRLF"; } if ($rcvd->[$x]->{'value'} =~ s/^from\s+?($RDNS) \(HELO ($HELO)\) \(($LUSER)\@\[?($IP)\]?//gis) { $rdns=$1; $helo=$2; $idnt=$3; $ipad=$4; }# print "X-$x-matched-12: rdns=$rdns, helo=$helo, idnt=$idnt, ipad=$ipad, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($RDNS) \(HELO ($HELO)\) \(\[?($IP)\]?//gis) { $rdns=$1; $helo=$2; $ipad=$3; }# print "X-$x-matched-13: rdns=$rdns, helo=$helo, ipad=$ipad, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($RDNS) \(\[($IP)\] helo=($HELO)\)//gis) { $rdns=$1; $ipad=$2; $helo=$3; }# print "X-$x-matched-14: rdns=$rdns, ipad=$ipad, helo=$helo, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($RDNS) \(($LUSER)\@\[?($IP)\]?\)//gis) { $rdns=$1; $idnt=$2; $ipad=$3; }# print "X-$x-matched-15: rdns=$rdns, idnt=$idnt, ipad=$ipad, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($RDNS)\(($IP)\)//gis) { $rdns=$1; $ipad=$2; }# print "X-$x-matched-16: rdns=$rdns, ipad=$ipad, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?\[($IP)\] \(helo=($HELO) ident=($LUSER)\)//gis) { $ipad=$1; $helo=$2; $idnt=$3; }# print "X-$x-matched-17: ipad=$ipad, helo=$helo, idnt=$idnt, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?\[($IP)\] \(account ($LUSER) HELO ($HELO)\)//gis) { $ipad=$1; $idnt=$2; $helo=$3; }# print "X-$x-matched-18: ipad=$ipad, idnt=$idnt, helo=$helo, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?\[($IP)\] \(helo=($HELO)\)//gis) { $ipad=$1; $helo=$2; }# print "X-$x-matched-19: ipad=$ipad, helo=$helo, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?\[?($IP)\]?:?\d*? \(HELO ($HELO)\)//gis) { $ipad=$1; $helo=$2; }# print "X-$x-matched-20: ipad=$ipad, helo=$helo, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($HELO) \(IDENT:($LUSER)\@($RDNS) \[($IP)\]//gis) { $helo=$1; $idnt=$2; $rdns=$3; $ipad=$4; }# print "X-$x-matched-21: helo=$helo, idnt=$idnt, rdns=$rdns, ipad=$ipad, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($HELO) \(?\s?\[($IP)\]//gis) { $helo=$1; $rdns=$2; $ipad=$3; }# print "X-$x-matched-22: helo=$helo, rdns=$rdns, ipad=$ipad, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($HELO) \(\[($IP)\] ident=($LUSER)\)//gis) { $helo=$1; $ipad=$2; $idnt=$3; }# print "X-$x-matched-23: helo=$helo, ipad=$ipad, idnt=$idnt, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($HELO) \(proxying for ($IP)\) \(.*? user ($LUSER)\)//gis) { $helo=$1; $ipad=$2; $idnt=$3; }# print "X-$x-matched-24: helo=$helo, ipad=$ipad, idnt=$idnt, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($HELO) \(account ($LUSER) \[($IP)\] verified\)//gis) { $helo=$1; $idnt=$2; $ipad=$3; }# print "X-$x-matched-25: helo=$helo, idnt=$idnt, ipad=$ipad, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?\(?($HELO) \(?\[?($IP)\]?\)?//gis) { $helo=$1; $ipad=$2; }# print "X-$x-matched-26: helo=$helo, ipad=$ipad, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($HELO) \(localhost \[.*?:($IP)\]\)//gis) { $helo=$1; $ipad=$2; }# print "X-$x-matched-27: helo=$helo, ipad=$ipad, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($HELO) \(($LUSER)\@($RDNS)\)//gis) { $helo=$1; $idnt=$2; $rdns=$3; }# print "X-$x-matched-28: helo=$helo, idnt=$idnt, rdns=$rdns, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($HELO) \(($RDNS)\)//gis) { $helo=$1; $rdns=$2; }# print "X-$x-matched-29: helo=$helo, rdns=$rdns, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/\(from\s+?($LUSER)\@($RDNS)\)//gis) { $idnt=$1; $rdns=$2; }# print "X-$x-matched-30: idnt=$idnt, rdns=$rdns, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/\(from\s+?($LUSER)\@($HELO)\)//gis) { $idnt=$1; $helo=$2; }# print "X-$x-matched-31: idnt=$idnt, helo=$helo, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?\(?\[?($IP)\]?\)?//gis) { $ipad=$1; }# print "X-$x-matched-32: ipad=$ipad, remaining=$rcvd->[$x]->{'value'} $CRLF"; } elsif ($rcvd->[$x]->{'value'} =~ s/^from\s+?($HELO)(?=\W|;|$)//gis) { $helo=$1; }# print "X-$x-matched-33: helo=$helo, remaining=$rcvd->[$x]->{'value'} $CRLF"; } # lookup IP if not provided $ipad = host($rdns) if !$ipad && $rdns && $options =~ /d/; $ipad = host($helo) if !$ipad && !$rdns && $helo && $helo =~ /$DOMAIN/ && $options =~ /d/; # exclude lines with no IP #next if !$ipad && ((scalar @$rcvd) > 1); # ensure the local received line has a date stamp $date = gmtime time unless $date || $x; # save "from" info for comparison in next iteration $rcvd->[$x]->{'rdns'} = $rdns; $rcvd->[$x]->{'ipad'} = $ipad; $rcvd->[$x]->{'date'} = $date; # exclude lines from local, private (RFC 1918), and invalid IP address ranges my $reserved = qr~^((?:127\.)|(?:10\.)|(?:172\.(?:1[6-9]|2[0-9]|31)\.)|(?:192\.168\.)|(?:169\.254\.))~; my $valid = qr~^((?:0?0?\d|[01]?\d\d|2[0-4]\d|25[0-5])\.(?:0?0?\d|[01]?\d\d|2[0-4]\d|25[0-5])\.(?:0?0?\d|[01]?\d\d|2[0-4]\d|25[0-5])\.(?:0?0?\d|[01]?\d\d|2[0-4]\d|25[0-5]))$~; next if (($rdns and $rdns =~ /localhost/i) || ($ipad and ($ipad =~ /$reserved/ || $ipad !~ /$valid/))) && ((scalar @$rcvd) > 1); # lookup MTA IP/rDNS if not provided $mtai = host($mtan) if !$mtai && $mtan && $options =~ /d/; $mtan = host($mtai) if !$mtan && $mtai && $options =~ /d/; # exclude lines from within our class B (/16) network next if (($edge_ip && $ipad && is_same_class_B($edge_ip,$ipad))||(!$edge_ip && $mtai && $ipad && is_same_class_B($mtai,$ipad))) && ((scalar @$rcvd) > 1); # perform reverse DNS lookup if not provided by MTA $rdns = host($ipad) if !$rdns && $ipad && $options =~ /d/; # force a reverse DNS lookup on all IPs, even those with an RDNS set by the MTA $rdns = host($ipad) if $ipad && $options =~ /f/; # perform ASN lookup (RFC 1930/2270) my $asn = ""; $asn = asn($ipad) if $ipad && $options =~ /a/; # perform SPF lookup (RFC 4408) my $result = ""; my $received_spf = ""; if ($options =~ /p/) { if (scalar @cidr_list && eval{Net::CIDR::cidrlookup($ipad, @cidr_list)}) { $result = "pass"; $received_spf = "pass ($ipad is locally whitelisted)"; } if ($dbfile && !$result) { &retie unless tied %db; if ($db{$ipad}) { $result = "pass"; $received_spf = "pass ($ipad is authenticated via POP3)"; } } if ($rtrn && $ipad && !$result) { my $srvr = $rdns?$rdns:($helo?$helo:$ipad); eval { my $spf_server = Mail::SPF::Server->new(); my $request = Mail::SPF::Request->new( identity => $rtrn, ip_address => $ipad, helo_identity => $srvr ); my $response = $spf_server->process($request); $result = $response->code; $received_spf = $response->received_spf_header; }; if ($@) { syslog('warning', 'SPF query error (ip=%s, sender=%s, helo=%s): %s', $ipad, $rtrn, $srvr, $@); $@ = undef; $result = undef; } } if (!$result) { $result = "error"; $received_spf = "unable to determine sender info"; } } # we implicitely trust the received line set "by" our own server as valid (first untrusted "from") if (!$edge_ip) { $edge_ip = $mtai; $rcvd->[$x]->{'sane'} = set_rcvd($helo,$ipad,$idnt,$rdns,$from,$mtan,$mtai,$mtav,$fore,$with,$date,$asn,$result); $rcvd->[$x]->{'spf'} = $received_spf if $options =~ /p/; } # now we'll try to establish the validity of each received line by checking # for continuity and rejecting lines that don't fit the "from/by" chain else { #print " by " . $mtan . " / prev from " . $rcvd->[$x-1]->{'rdns'} . "$CRLF"; #print " by " . $mtai . " / prev from " . $rcvd->[$x-1]->{'ipad'} . "$CRLF"; if ( ( ($mtan && $rcvd->[$x-1]->{'rdns'} && $mtan =~ /$rcvd->[$x-1]->{'rdns'}/) || ($mtai && $rcvd->[$x-1]->{'ipad'} && $mtai =~ /$rcvd->[$x-1]->{'ipad'}/) ) && (!$untrusted) ) { $rcvd->[$x]->{'sane'} = set_rcvd($helo,$ipad,$idnt,$rdns,$from,$mtan,$mtai,$mtav,$fore,$with,$date,$asn,$result); $rcvd->[$x]->{'spf'} = $received_spf if $options =~ /p/; } else { $helo = "untrusted-".$helo if $helo; $ipad = "untrusted-".$ipad if $ipad; $idnt = "untrusted-".$idnt if $idnt; $rdns = "untrusted-".$rdns if $rdns; $from = "untrusted-".$from if $from; $mtan = "untrusted-".$mtan if $mtan; $mtai = "untrusted-".$mtai if $mtai; $mtav = "untrusted-".$mtav if $mtav; $fore = "untrusted-".$fore if $fore; $with = "untrusted-".$with if $with; $date = ""; $asn = ""; $result = ""; $rcvd->[$x]->{'sane'} = set_rcvd($helo,$ipad,$idnt,$rdns,$from,$mtan,$mtai,$mtav,$fore,$with,$date,$asn,$result); $untrusted = 1; } } }} return $rcvd; } sub is_same_class_B { my ($ip1,$ip2) = @_; $ip1 =~ s/^(\d{1,3}\.\d{1,3}\.).*?$/$1/gis; $ip2 =~ s/^(\d{1,3}\.\d{1,3}\.).*?$/$1/gis; return ($ip1 eq $ip2)? 1:0; } sub asn { my $target = shift; my $output = ""; my $IP = qr~(?:\d{1,3}\.){3}\d{1,3}~; my $DOMAIN = qr~(?:\w|-|\.)+?\.\w{2,4}~; if ( $target =~ s/(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})/$4.$3.$2.$1.$asn_server/ ) { # uncomment this code if you do not want to use Net::DNS::Resolver and you have 'host' on your system #open (HOST, "host -t txt $target 2>/dev/null |") or error("warn","Host lookup failed: $!"); #while () { $output = $1 if /\Q$target\E(?: descriptive)? text "(\d*?)".*/; } #close HOST; # find ASN info via Net::DNS::Resolver if (my $query = $res->send($target,"TXT")) { foreach ($query->answer) { $output = $1 if $_->string =~ /$DOMAIN\.\s+?\d+?\s+?IN\s+?TXT\s+?"(\d+?)"\s+?"$IP"\s+?"\d+?"/; }} #else { error("warn","ASN lookup failed: " . $res->errorstring); } } return $output; } sub host { my $target = shift; my $output = ""; my $IP = qr~(?:\d{1,3}\.){3}\d{1,3}~; my $DOMAIN = qr~(?:\w|-|\.)+?\.\w{2,4}~; if ($target =~ s/($IP|$DOMAIN)/$1/) { # uncomment this code if you do not want to use Net::DNS::Resolver and you have 'host' on your system #open (HOST, "host $target 2>/dev/null |") or error("warn","Host lookup failed: $!"); #while () { $output = $1 if /$DOMAIN (?:domain name pointer|has address) ($IP|$DOMAIN)\.?/; } #close HOST; # find DNS info via Net::DNS::Resolver if (my $query = $res->send($target)) { foreach ($query->answer) { $output = $1 if $_->string =~ /$DOMAIN\.\s+?\d+?\s+?IN\s+?(?:PTR|A)\s+?($IP|$DOMAIN)\.?/; }} #else { error("warn","DNS lookup failed: " . $res->errorstring); } } return $output; } sub set_rcvd { my ($helo,$ipad,$idnt,$rdns,$from,$mtan,$mtai,$mtav,$fore,$with,$date,$asn,$spf) = @_; my $output = "from"; if ($options =~ /e/) { $output .= ($helo)? " helo-$helo" : " no-helo";} # sender's salutation $output .= ($rdns)? " $rdns" : " no-rdns"; # sender's name $output .= ($ipad)? " $ipad" : " no-ipad"; # sender's IP if ($options =~ /p/) { $output .= ($spf)? " spf-$spf" : " no-spf";} # sender's policy result $output .= ($asn)? " as$asn" : " no-asn"; # sender's ASN $output .= ($mtan||$mtai)? " $CRLF\t by" : ""; $output .= ($mtan)? " $mtan" : " no-mta-name"; # receiving MTA's name $output .= ($mtai)? " $mtai" : " no-mta-ip"; # receiving MTA's IP $output .= ($fore)? " $CRLF\t for" : ""; $output .= ($fore)? " <$fore>" : " no-to-addr"; # envelope to address $output .= ($date)? "; $date" : " no-date"; # received date/time #print "outputting received: $output" . $CRLF; return $output; } sub opendb_read { tie(%db, "DB_File", $dbfile, O_RDONLY, 0, $DB_HASH) or error("warn","Can't open $dbfile: $!"); } sub closedb { untie %db; undef %db; } sub retie { &closedb; &opendb_read; } ################################################ ################ Output Header ################# ################################################ sub set_header { my $header = shift; my $output = ""; my $name = ""; # exclude the "masked fields" from display foreach $name (split(/,/,$masked_fields)) { $spec_fields =~ s/(?<=,)$name,?//; } # output the fields in the order specified by RFC 2822 - minus the masked fields foreach $name (split(/,/,$spec_fields)) { $output .= set_field($header,$name); delete $header->{$name}; } # set any user-specified fields foreach $name (split(/,/,$user_fields)) { $output .= set_field($header,$name); delete $header->{$name}; } # set new custom x-header fields if ($options =~ /x/) { foreach $name (split(/,/,$new_fields)) { $output .= set_field($header,$name); delete $header->{$name}; } } # then set any remaining fields (if allowed to set non-standard fields) if ($options !~ /s/) { foreach $name (keys %{$header}) { $output .= set_field($header,$name); } } $output .= $CRLF; return $output; } sub set_field { my($header, $name) = @_; my $output = ""; if ($header->{$name}) { foreach my $header (@{$header->{$name}}) { if ($name eq "received" and $options =~ /r/) { if ($header->{'sane'} and $header->{'sane'} =~ /\w/) { $output .= $header->{'name'} . ": " . $header->{'sane'} . $CRLF; } # else { # $output .= $header->{'name'} . ": sanity check failed" . # $CRLF; # } } elsif ($header->{'name'} and defined($header->{'value'})) { $output .= $header->{'name'} . ": " . $header->{'value'} . $CRLF; } else { my $dumped = Data::Dumper->new([$header], [qw(header)])-> Indent(0)->Dump(); error("warn", "Header for $name, $dumped, is missing name " . "and/or value?"); } } } elsif ($req_fields =~ /(?:^|,)$name(?:,|$)/) { $output .= ucfirst($name) . ": [no-$name] " . $CRLF; } return $output; } ################################################ ################ Error Handling ################ ################################################ sub error { my ($action,$msg) = @_; die $msg if $action eq "die"; warn $msg unless $action eq "die"; # add other actions if you like } sub sig_trap { my $sig = shift; my ($action,$more) = ("warn",""); sig: { $action = "die", last sig if $sig =~ /ALRM/; $action = "warn", last sig if $sig =~ /PIPE/; $action = "warn", last sig if $sig =~ /CHLD/; $action = "die" , last sig if $sig =~ /INT/; $action = "die" , last sig if $sig =~ /HUP/; $action = "warn"; } my $waitedpid = wait; $more = "; Reaped pid $waitedpid, exited with status " . ($? >> 8) if $waitedpid; $SIG{$sig} = \&sig_trap; error ($action,"Trapped signal SIG$sig$more"); } ################################################ ################################################ ################################################