misc/getcost/getcost - metacpan.org

: # use perl
	eval 'exec perl -S $0 ${1+"$@"}'
		if $running_under_some_shell;

###############################################################################
#
# File:         getcost
# RCS:          $Header: /home/ram/home/projects/mailagent/misc/getcost/RCS/getcost,v 3.0.1.2 1999/07/12 13:59:33 ram Exp $
# Description: SPAM scoring program
#		Version 3.0
# Authors:      Darryl Okahata (with additions from Raphael Manfredi)
# Created:      Wed Jan 15 10:44:56 1997
# Modified:     Mon May  5 14:06:09 1997 (Darryl Okahata) darrylo@sr.hp.com
# Modified:     Wed Sep 10 16:09:09 1997 Raphael_Manfredi@grenoble.hp.com
# Language:     Perl
# Package:      N/A
# Status:       Experimental (Do Not Distribute)
#
# (C) Copyright 1997, Hewlett-Packard, all rights reserved.
#
# Permission to redistribute this file as part as mailagent-3.0 kindly granted
# by Darryl, under the express condition that it cannot be spearated and
# redistributed separately.	-- RAM, 11/03/98
#
###############################################################################

$my_name = getlogin . "\\@";

$production = '';

$config_file = $ENV{'HOME'} . "/.spamconfig";

###############################################################################

require 'getopts.pl';
&Getopts('acBDLbdf:mokMT:') || warn "getcost: bad switch usage\n";
$debug = 1 if ($opt_d);
$dump_scanners = 1 if ($opt_D);
$config_file = $opt_f if ($opt_f);
$stop_suppressed_recipients_spam = 1 if ($opt_S);
$ding_binary_characters = 1 if ($opt_B);
$ding_long_lines = 1 if ($opt_L);
$allow_openmail = 1 if ($opt_o);
$allow_any_body_contents = 1 if ($opt_b);

# Added by RAM
$apply_all = $opt_a;
$collect_line = $opt_c;
$threshold = $opt_T;
$mailagent = $opt_M;
$keep_going = $opt_k;			# Don't stop when cost <= -100000
$TO_SIZE = 1000;				# To header longer than this is suspect
$match_all = $opt_m;			# Apply patterns with //g and count occurences

# If debugging, test whether we're emitting to a terminal or a file...
# Matches will be outlined with _ escapes when logging into a file, but
# with terminal escapes otherwise...
if ($debug) {
	$to_tty = -t STDERR;
	$tty_underline = `tput smul 2>/dev/null` || '[4m';
	$tty_normal = `tput rmul 2>/dev/null` || '[0m';
}

&read_config($config_file);

$number_of_uppercase_lines = 0;
$lines = 0;
$cost = 0;
$in_header = 1;
$from_processed = '';
$pending_score = 0;
$collecting_to = '';
$collecting_line = '';
$is_a_reply = 0;
loop: while (<>) {
    if ($cost <= -100000 || $cost >= 1000000) {
	last loop unless $keep_going;
    }
    if ($in_header) {
      HEADER_SWITCH: {
	  /^\s*$/ &&
	      do {
		  $in_header = '';
		  last HEADER_SWITCH;
	      };
	  if ($collecting_to) {
	      if (/^(\s+.+)$/) {
		  $token = $1;
		  $token =~ s/^\s+/ /;
		  $to_header = $to_header . $token;
		  last HEADER_SWITCH;
	      } else {
		  $collecting_to = '';
		  # fall through
	      }
	  }
	  (/^From:\s+(.+)$/ || /^Reply-To:\s+(.+)$/i
	   ) && !$from_processed &&
	       do {
		   $token = $1;
		   if ($token !~ /@.*@/ &&
		       $token !~ /mailer-daemon/i) {
		       #
		       # There can't be two "@"s in the address
		       #
		       foreach $item (@good_sites) {
			   if ($token =~ /$item/i) {
			       # It's a good site -- do no more.
			       &update($_, 100000000);
			       last loop;
			   }
		       }
		   }
		   foreach $sender (@senders) {
		       if ($token =~ /$sender/i) {
			   &update($_, $sender_scores{$sender});
			   $from_processed = 1;
			   last HEADER_SWITCH;
		       }
		   }
		   last HEADER_SWITCH;
	       };
	  (/^Sender:\s+(.+)$/) &&
	       do {
		   $token = $1;
		   foreach $sender (@senders) {
		       if ($token =~ /$sender/i) {
			   &update($_, $sender_scores{$sender});
			   last HEADER_SWITCH;
		       }
		   }
		   last HEADER_SWITCH;
	       };
	  (/^To:\s+(.+)$/ || /^Cc:\s+(.+)$/i
	   ) &&
	      do {
		  $to_header = $1;
		  $collecting_to = 1;
		  last HEADER_SWITCH;
	      };
	  (
	   /^Received:\s+from\s+[-a-z0-9_.]+\s+\(([-a-z0-9_.]+)\s+/i ||
	   /^Received:\s+from\s+([-a-z0-9_.@]+)\b/i ||
	   /^Received:\s+\(from\s+([-a-z0-9_.\@]+)\b/i ||
	   /^\s+\(peer\s+crosschecked\s+as:\s+([-a-z0-9_.]+)\s+/i) &&
	      do {
		  $token = $1;
		  last HEADER_SWITCH 
		      if ($token =~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ ||
			  $token =~ /\d{3}-\d{3}-\d{4}\b/ ||
			  $token !~ /[\.\@]/
			  );
		  foreach $relay (@relays) {
		      if ($token =~ /$relay/i) {
			  &update($_, $relay_scores{$relay});
			  last HEADER_SWITCH;
		      }
		  }
		  last HEADER_SWITCH;
	      };
	  (/^Newsgroups:\s(.+)$/ ||
	   /^Xref:\s(.+)$/
	   ) &&
	      do {
		  $token = $1;
		  if (length($token) > 200) {
		      &update($_, -1000000);
		      last HEADER_SWITCH;
		  }
		  # Fall through ...
	      };
	  (/^References:\s/i || /^In-Reply-To:\s/i) && do {
		$is_a_reply = 1;		# Probably a reply, don't scan quoted material
		# Fall through...
	  };
	  (/^References:\s/i || /^In-Reply-To:\s/i || /^Newsgroups:\s/i ||
	   /^X-Also-Posted-To:\s/i || /^X-URL:/i) &&
	       do {
		   &update($_, 500);
		   last HEADER_SWITCH;
	       };
	  /^X-Mailer:\s+(.+)$/ &&
	      do {
		  $token = $1;
		  print STDERR "Mailer: $token\n" if ($debug);
		  if ($token =~ /flood/i) {
		      &update($_, -1000000);
		      last loop;
		  } elsif ($token =~ /\bEudora\b/i) {
		      $is_eudora = 1;
		  }
		  last HEADER_SWITCH;
	      };
	  /^Subject:\s+(.+)$/ &&
	      do {
		  $token = $1;
		  foreach $subject (@subjects) {
		      if ($token =~ /$subject/i) {
			  &update($_, $subject_scores{$subject});
			  last HEADER_SWITCH;
		      }
		  }
		  # This is CASE SENSITIVE:
		  if ($token =~ /^[-A-Z_=!?<>,.]{5,}$/) {
		      &update($_, -10000);
		      last HEADER_SWITCH;
		  }
		  last HEADER_SWITCH;
	      };
	  /^X-Openmail-Hops:\s/i && $allow_openmail &&
	      do {
		  &update($_, 100000000);
		  last loop;
	      };
      }
    } else {
      BODY_SWITCH: {
		if (!$to_header_processed) {
			$debug = '' if ($production);
			if (length($to_header) > $TO_SIZE) {
				&update("Long To: $to_header\n", 
					$TO_SIZE - length($to_header));
			} elsif ($to_header =~ /\blist\s+suppressed\b/i &&
				$stop_suppressed_recipients_spam) {
				&update("<recipients suppressed>\n", -1000000);
				last loop;
			}
			$to_header_processed = 1;
			last loop if ($allow_any_body_contents);
		}
	  last loop if ($cost >= 1000000);

	  $is_a_reply && /^\s*[>:|]/ && next loop;		# Quoted body, hopefully
	  $lines++ unless /^\s*$/;						# Don't count blank lines
	  $ding_binary_characters && /[\r\200-\377]/ &&
	      do {
		  &update("Binary: $_", -100);
	      };
	  $ding_long_lines && length($_) >= 90 &&
	      do {
		  &update("Long line: $_", -(10 + length($_)));
	      };

	  $_ .= '. ' if $collect_line && eof();			# Finish last sentence
	  $result = &check_body_line($_);
	  last loop if ($result == 3);
	  last BODY_SWITCH if ($result != -1);

	  /^[-A-Z_ \d\t,.<>?!()]*[A-Z][-A-Z_ \d\t,.<>?!()]*$/ &&
	      do {
		  if (length($_) > 10) {
		      if ($debug) {
			  print STDERR "Upper:", $_;
		      }
		      ++$number_of_uppercase_lines;
		  }
	      };
      }
  }
}

# Flush input pipe, if in "mailagent" mode, to avoid "broken pipe"
# (exiting early when mailagent has not yet written all the message)
if ($mailagent && !eof) {
	while (<>) {}
}

if ($cost > -10000) {
    $cost += $pending_score;
    if ($debug && $pending_score != 0) {
	print STDERR "$pending_score: <pending score>\n";
    }
}

if ($number_of_uppercase_lines >= 10) {
    $cost += -10000;
    if ($debug) {
	print STDERR "-10000: <too many uppercase lines>\n";
    }
} elsif ($lines && $number_of_uppercase_lines / $lines > .15 ) {
	# Trap short, loud and to the point messages
    $cost += -10000;
    if ($debug) {
	print STDERR "-10000: <more than 15% of uppercase lines>\n";
    }
}

if ($debug) {
    print STDERR "Cost = $cost\n";
}
if ($mailagent) {
	print "ASSIGN cost $cost; ";
	if ($cost < $threshold) {
		print "NOP -f\n";
	} else {
		print "NOP -t\n";
	}
} else {
	print $cost, "\n";
}

exit 0;

###############################################################################

sub update
{
    local($line, $delta) = @_;

    $cost += $delta;
    if ($debug) {
		my ($before, $match, $after) = ($`, $&, $');
		($before, $match, $after) = ($`, $&, $')
			if $line ne "$before$match$after" && $line =~ /\Q$match\E/;
		if ($to_tty) {
			$match = "$tty_underline$match$tty_normal";
		} else {
			$match =~ s/(.)/_\b$1/g;
		}
		$line = "$before$match$after";
		printf STDERR "%10s: %s", $delta, $line;
    }
}

sub collect_line {
	return 1 unless $collect_line;
	my ($lref) = @_;

	# Identify the part of the line that goes up to the end of a sentence,
	# which is a punctuation sign, followed by a space. When a full sentence
	# has been recognized, it is put back into $lref, and the residual part
	# is left in $collecting_line.

	my $line = $$lref;
	$line =~ tr/\n/ /;
	$collecting_line .= $line;

	my $sentence;
	$sentence = $1 if $collecting_line =~ s/^((?:.*?)[.?!:]+)\s//;

	return 0 if $sentence eq '';	# No full sentence yet

	$$lref = $sentence . "\n";
	return 1;						# Full sentence recognized, held in $$lref
}

###############################################################################

sub read_config
{
    local($config_file) = @_;
    local(*IN, $phase, $name, $cost);

    if (open(IN, "<$config_file")) {
	$phase = -1;
	while (<IN>) {
	    chop;
	    next if (/^[ \t]*(\#.*)?$/);
	    if (/^\s*\[\s*([a-z0-9 \t]+)\]/) {
		$name = $1;
		$name =~ s/\s+$//;
		($name =~ /^name$/i) && do {
		    $phase = 0;
		    next;
		};
		($name =~ /^domain$/i) && do {
		    $phase = 1;
		    next;
		};
		($name =~ /^good\s+sites$/i) && do {
		    $phase = 10;
		    next;
		};
		($name =~ /^sender\s+scores$/i) && do {
		    $phase = 11;
		    next;
		};
		($name =~ /^subject\s+scores$/i) && do {
		    $phase = 12;
		    next;
		};
		($name =~ /^relay\s+scores$/i) && do {
		    $phase = 13;
		    next;
		};
		($name =~ /^body\s+text$/i) && do {
		    $phase = 100;
		    next;
		};
		print STDERR "Warning: Unknown config block name: \"$name\"\n";
		next;
	    }
	    ($phase == 0) && do {
		# my name
		s/^\s+//;
		s/\s+$//;
		$my_name = $_;				## NO LONGER USED
		next;
	    };
	    ($phase == 1) && do {
		# my domain
		s/^\s+//;
		s/\s+$//;
		$my_domain = $_;			## NO LONGER USED
		next;
	    };
	    ($phase == 10) && do {
		# good sites
		if (!/^\s*(.+)$/) {
		    print STDERR
			"Warning: unknown good site line: \"$_\"\n";
		    next;
		}
		$name = $1;
		$name =~ s/\s+$//;
		push(@good_sites, $name);
		next;
	    };
	    ($phase == 100) && do {
		s/^\s+//;
		s/\s+$//;
		$line = $_;
		push(@body_scores, $line);
		next;
	    };
	    if (!/^\s*(.+)\s+([-+0-9]+)\s*$/) {
		print STDERR
		    "Warning: unknown score line: \"$_\"\n";
		next;
	    }
	    $name = $1;
	    $cost = $2;
	    $name =~ s/\s+$//;
	    ($phase == 11) && do {
		# sender scores
		push(@senders, $name);
		$sender_scores{$name} = $cost;
		next;
	    };
	    ($phase == 12) && do {
		# subject scores
		push(@subjects, $name);
		$subject_scores{$name} = $cost;
		next;
	    };
	    ($phase == 13) && do {
		# relay scores
		push(@relays, $name);
		$relay_scores{$name} = $cost;
		next;
	    };
	    print STDERR "Unknown phase for line: \"$_\"\n";
	}
	close(IN);
	if (@body_scores) {
	    $body_scanner = &build_scanner("scanner", @body_scores);
	    $procedure = <<EOF;
sub check_body_line
{
    local(\$line) = \@_;
    local(\$result);

	return unless collect_line(\\\$line);
	study \$line;
    $body_scanner
    return (\$result);
}
EOF
	    print STDERR "Body scanner:\n$procedure\n" if ($dump_scanners);
	    eval $procedure;
	    die "Problem in body scanner: $@" if ($@);
	}
    } else {
    }
}

sub build_scanner
{
    local($switch_name, @scores) = @_;
    local($scanner);

    $scanner = <<EOF_SCANNER;
$switch_name: {
    # \$result is (largely unimplemented):
    #        -1 for no match
    #        1 for match
    #        2 for exit header/body loop
    #        3 to stop scanning completely
    \$result = -1;
	my \$copy;
	my \$stripped;
EOF_SCANNER
    foreach $line (@scores) {
	if ($line !~ /^\s*(.+)\s+([-+0-9]+)\s*$/) {
	    print STDERR
		"Warning: unknown score line: \"$line\"\n";
	    next;
	}
	$pattern = $1;
	my $cost = $2;
	$pattern =~ s/\s+$//;
	if ($cost == 0) {
		$newcode = <<EOF
    (\$line =~ $pattern) && do {
	&update(\$line, $cost);
	\$result = 3;
	last $switch_name;
    };
EOF
	} else {
	    if ($pattern eq "free") {
		$newcode = <<EOF
    (\$line =~ /\\bfree\\b/i &&
     \$line !~ /free[ \\t]*\\(/ &&
     \$line !~ /free\\'d/ &&
	\$line !~ /\\bto\\s+free\\s+\\w+/i &&	# to free smth -> verb
     \$line !~ /free\\s+software/) && do {
	&update(\$line, $cost);
	\$result = 1;
	last $switch_name unless \$apply_all;
    };
EOF
	    } else {
			my $attempt_match_all = 0;
			if ($pattern !~ m!^(?:/|m\W)\^!) {	# Not an anchored match at start
				$attempt_match_all = 1 if $match_all;
			}
			if ($attempt_match_all) {
				# Match pattern as many times as possible, removing
				# the leading matched part and trying again, and again...
				$newcode = <<EOF;
	\$copy = \$line;
	\$stripped = 0;
	while (\$copy =~ $pattern) {
		my (\$before, \$match, \$after) = (\$`, \$&, \$');
		&update(
			\$stripped ? "...\$before\$match\$after" : \$line,
			$cost);
		\$copy = substr(\$copy, length(\$before) + length(\$match));
		\$stripped++;
	}
	\$result = \$stripped ? 1 : \$result;
	last $switch_name if \$result == 1 && !\$apply_all;
EOF
			} else {
				# Match pattern once
				$newcode = <<EOF;
	(\$line =~ $pattern) && do {
		&update(\$line, $cost);
		\$result = 1;
		last $switch_name unless \$apply_all;
	};
EOF
			}
		}
	}
	$scanner .= $newcode;
    }
    $scanner .= "}\n";
    return ($scanner);
}

sub check_body_line
{
}
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)