#!/usr/bin/perl
my $rcs='@(#)$Header: /home/jamesb/CVS/junkfilter/src/junkFilter,v 1.1.1.1 2004/07/11 16:53:07 jamesb Exp $';
#
# Module : junkFilter
# Purpose: Junk mail filter.
# Author : B.James
# Date : $Date: 2004/07/11 16:53:07 $
# Version: $Revision: 1.1.1.1 $
#
# 11/07/2004: B.James: Updated to use SDBM_File instead of DB_File
#

=pod

=head1 NAME

junkFilter - Junk mail filter.

=head1 SYNOPSIS

junkFilter [ -s | -h | -p file | -j file | -g file ]

=head2 Options:-

=over

=item -s

Setup the data directory

=item -j

Processes the specified file as a junk file. Builds the junk tables.

=item -g

Processes the specified file as a good word file.

=item -p

Processes the specified file and says if it thinks it's junk or not.

=item -h

Help.

=back

With no arguments, processes stdin against the database, and tries to
determine if it's junk. Exit code ( $? ) is set as:-

    0   Junk email
    1   Good email. (or undecided about it).

=head1 DESCRIPTION

=head2 Overview

Designed to be used in conjunction with procmail. Based on a Bayesian
Heuristic, it builds a database of good and junk words, with associated
probabilities and processes messages against the database to determine
if they are junk mail or not.

=head1 PREREQUISITES

=over

=item C<strict>

=item C<SDBM_File>

=item C<FileHandle>

=item C<Getopt::Std>

=back

=head1 INSTALLATION

=head2 Setting up

Copy the program to a location on your path, or in your home directory.
The program will automatically create the data directory using the -s
switch. The data files will be placed in the .junkFilter directory in
the user's home directory.

=head2 Configuration

To be effective, it will need to be taught good or junk emails.
Basically it just takes a text file as an argument to the -j or -g
options and builds the database of words from that.

This program is designed to be used under procmail, which allows much
flexibility in its usage.
Here are the procmail rules I use:-

=over

    # Run the mail through the junkFilter
    :0 Wibc: junk.lock
    | $HOME/bin/junkFilter

    # If it returns 0 then it's junked, otherwise drop through and send the mail to the inbox
    :0 a:
    $HOME/mail/junk

=back

This configuration is one I feel safest with, as it won't actually
delete the emails it finds as junk.

=head1 OSNAMES

Unix or Unix-likes.

=head1 SCRIPT CATEGORIES

Mail

=head1 README

Junk mail filter.

Designed to be used in conjunction with procmail, but not necessarily
restricted to it. Based on a Bayesian Heuristic (yawn), it builds a
database of good and junk words, with associated probabilities and
processes messages against the database to determine if they are junk
mail or not.

=head1 LICENCE

Copyright (c) 2004, Bruce James

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl.

=head1 DISCLAIMER

Use at your own risk. This program is supplied as is, and it is up to
you whether you choose to use it or not.

As the program is based on a heuristic, it cannot guarantee to give
accurate results. I cannot be held responsible for any data or email
loss that may occur during the use or misconfiguration of this program.

=head1 AUTHOR

Bruce James (custard@cpan.org)

=cut

package junkFilter;

use strict;
use warnings;

#use DB_File;
use Exporter;
use Fcntl qw( O_RDWR O_CREAT );    # open flags for tie(); previously only
                                   # available as a side effect of FileHandle
use SDBM_File;
use FileHandle;
use Getopt::Std;

our @ISA     = qw( Exporter );
our @EXPORT  = qw();
our $VERSION = '0.2';              # $Revision: 1.1.1.1 $ - 1

# new()
# Class constructor. Blesses an empty hash and hands it to constructor(),
# which attaches the databases. Returns the initialised object.
sub new {
    my $class = shift;
    my $this  = bless {}, $class;
    return $this->constructor(@_);
}

# constructor()
# Makes sure the data directory exists (see checkDirectory) and ties the
# three SDBM files kept there: junk word counts, good word counts and the
# running totals. References to the tied hashes are stored in the object
# under the keys junkDB, goodDB and statusDB.
# Dies if the directory is unusable or any database cannot be opened.
sub constructor {
    my $this = shift;

    my $jfdir = $this->checkDirectory()
        or die("Failed checking for the junkFilter data directory.\n");

    my %file_for = (
        junkDB   => 'junkWords.db',
        goodDB   => 'goodWords.db',
        statusDB => 'junkStatus.db',
    );

    for my $slot ( sort keys %file_for ) {
        my %db;    # a fresh hash per pass; the stored reference keeps it alive
        my $path = $jfdir . "/" . $file_for{$slot};

        # 0600: these are private data files, there is no reason for them
        # to be executable or readable by anyone else.
        tie( %db, 'SDBM_File', $path, O_RDWR | O_CREAT, 0600 )
            or die("Can't open database $path: $!\n");
        $this->{$slot} = \%db;
    }

    return $this;
}

# emailGood & emailJunk getters & setters
# I know it's a bit long-winded, but it leaves
# the opportunity to swap databases in the future.

# getJunkCount( $word )
# Returns how often $word has been seen in junk mail (0 if never).
# Dies when no word is given.
sub getJunkCount {
    my $this = shift;
    my $word = shift || die("getJunkCount: no word!\n");
    return $this->{junkDB}{$word} || 0;
}

# setJunkCount( $word, $count )
# Stores $count (default 0) as the junk count for $word.
# Dies when no word is given.
sub setJunkCount {
    my $this  = shift;
    my $word  = shift || die("setJunkCount: no word!\n");
    my $count = shift || 0;
    $this->{junkDB}{$word} = $count;
    return;
}

# getGoodCount( $word )
# Returns how often $word has been seen in good mail (0 if never).
# Dies when no word is given.
sub getGoodCount {
    my $this = shift;
    my $word = shift || die("getGoodCount: no word!\n");
    return $this->{goodDB}{$word} || 0;
}

# setGoodCount( $word, $count )
# Stores $count (default 0) as the good count for $word.
# Dies when no word is given.
sub setGoodCount {
    my $this  = shift;
    my $word  = shift || die("setGoodCount: no word!\n");
    my $count = shift || 0;
    $this->{goodDB}{$word} = $count;
    return;
}

# emailStatus getters & setters

# getGoodTotalCount()
# Returns the running total of words learnt from good mail (0 if none).
sub getGoodTotalCount {
    my $this = shift;
    return $this->{statusDB}{'goodCount'} || 0;
}

# getJunkTotalCount()
# Returns the running total of words learnt from junk mail (0 if none).
sub getJunkTotalCount {
    my $this = shift;
    return $this->{statusDB}{'junkCount'} || 0;
}

# setGoodTotalCount( $count )
# Stores $count (default 0) as the running total of good words.
sub setGoodTotalCount {
    my $this  = shift;
    my $count = shift || 0;
    $this->{statusDB}{'goodCount'} = $count;
    return;
}

# setJunkTotalCount( $count )
# Stores $count (default 0) as the running total of junk words.
sub setJunkTotalCount {
    my $this  = shift;
    my $count = shift || 0;
    $this->{statusDB}{'junkCount'} = $count;
    return;
}

# checkDirectory()
# Ensures $HOME/.junkFilter exists, creating it (mode 0700) if necessary,
# and reports progress on STDOUT.
# Returns the directory path when it exists and is writable, otherwise
# nothing (undef in scalar context). Dies if $HOME is not set.
sub checkDirectory {
    my $this  = shift;
    my $home  = $ENV{HOME} || die("Can't locate user home dir.");
    my $jfdir = $home . "/.junkFilter";

    print("Checking $jfdir exists...");
    if ( -d $jfdir ) {
        print(" it does...");
    }
    else {
        print(" it doesn't. Creating it...");
        mkdir( $jfdir, 0700 )    # rwx------
            or warn("mkdir $jfdir: $!\n");
    }

    # Make sure the directory exists and is writable.
    if ( -d $jfdir && -w $jfdir ) {
        print(" Ok.\n");
        return $jfdir;
    }

    print(" Failed.\n");
    return;
}

# processFile( [ $source ] )
# Tokenises a message into lower-cased words.
#   $source - a file name, an already open filehandle (reference or glob),
#             or nothing, in which case STDIN is read.
# Returns a two element list: a reference to a hash of word => occurrences
# and the total number of words counted.
# Dies if a named file cannot be opened.
sub processFile {
    my $this   = shift;
    my $source = shift;

    my $fh;
    my $opened = 0;
    if ( ref($source) || ref( \$source ) eq 'GLOB' ) {

        # Caller supplied a handle: read from it rather than from STDIN.
        $fh = $source;
    }
    elsif ( defined $source && length $source ) {

        # Three-argument open, so characters such as '>' or '|' in the
        # file name are never interpreted as an open mode.
        open( $fh, '<', $source )
            or die("Can't open $source: $!\n");
        $opened = 1;
    }
    else {
        $fh = \*STDIN;
    }

    my %words;
    my $wordCount = 0;
    my $ignore    = 0;

    while ( my $line = <$fh> ) {

        # Skip base64 encoded parts: from the transfer-encoding header up
        # to the next MIME boundary line.
        $ignore = 1 if $line =~ /: base64/;
        $ignore = 0 if $line =~ /^--/;
        next if $ignore;

        # Collapse each individual tag (not everything between the first
        # '<' and the last '>') and each run of digits into marker words.
        $line =~ s/<[^>]+>/!htmltags!/g;
        $line =~ s/\d+/!consecutivenumbers!/g;

        for my $token ( split( /\W+/, lc $line ) ) {

            # Only real words are counted, so that the total always equals
            # the sum of the per-word counts.
            next if $token eq '';
            $words{$token}++;
            $wordCount++;
        }
    }

    close($fh) if $opened;    # never close a handle we were given

    return ( \%words, $wordCount );
}

# _updateData( $file, $kind )
# Private helper shared by updateJunkData and updateGoodData.
# $kind is 'Junk' or 'Good' and selects the accessor methods to use.
# Adds the words found in $file to the per-word counts and to the running
# total of the selected database, reporting progress on STDOUT.
sub _updateData {
    my ( $this, $file, $kind ) = @_;

    my $label    = lc $kind;
    my $getTotal = "get${kind}TotalCount";
    my $setTotal = "set${kind}TotalCount";
    my $getCount = "get${kind}Count";
    my $setCount = "set${kind}Count";

    print("Compiling $label words from $file...\n");
    my ( $words, $wordCount ) = $this->processFile($file);

    # Add this total wordcount to the running total
    $this->$setTotal( $this->$getTotal() + $wordCount );

    print("Storing in database...\n");
    for my $word ( keys %{$words} ) {
        $this->$setCount( $word, $this->$getCount($word) + $words->{$word} );
    }

    print("Done ($wordCount words processed).\n");
    return;
}

# updateJunkData( $file )
# Learns the words in $file as junk. Does nothing when no file is given.
sub updateJunkData {
    my $this = shift;
    my $file = shift || return;
    return $this->_updateData( $file, 'Junk' );
}

# updateGoodData( $file )
# Learns the words in $file as good. Does nothing when no file is given.
sub updateGoodData {
    my $this = shift;
    my $file = shift || return;
    return $this->_updateData( $file, 'Good' );
}

# processMail( [ $file ] )
# Classifies a message read from $file, or from STDIN when no file is
# given, and prints the verdict on STDOUT.
# Returns 0 when the message is judged to be junk and 1 when it is good
# or no decision can be made; run() passes this on as the exit code.
sub processMail {
    my $this = shift;
    my $file = shift;

    my ($words) = $this->processFile($file);

    my $totalJunkWords = $this->getJunkTotalCount || 1;    # Foil the divide by zero trap
    my $totalGoodWords = $this->getGoodTotalCount || 1;

    # Find ratio of good words to junk words
    my $junkRatio = $totalJunkWords / $totalGoodWords;

    my $junkProb = 0;
    my $goodProb = 0;
    for my $word ( keys %{$words} ) {

        # Correct only good count to even the ratio
        my $junkCount = $this->getJunkCount($word);
        my $goodCount = $this->getGoodCount($word) * $junkRatio;

        # Words never seen before carry no evidence either way.
        my $seen = $junkCount + $goodCount;
        next unless $seen;

        # Add up probabilities
        $junkProb += $junkCount / $seen;
        $goodProb += $goodCount / $seen;
    }

    my $total = $junkProb + $goodProb;
    if ( !$total ) {
        print("Undecided...\n");
        return 1;
    }

    if ( $junkProb / $total > 0.50 ) {
        print("It's junk...\n");
        return 0;
    }

    print("It's good...\n");
    return 1;
}

# run( @arguments )
# Command line entry point. Parses the given arguments (or the global
# @ARGV when called without any) and performs the selected action.
# Returns the value to use as the process exit code: the verdict of
# processMail for -p and for the default (STDIN) mode, 1 for -s, -j, -g
# and -h.
sub run {
    my $this = shift;
    my %args;

    # getopts() only ever looks at @ARGV, so point it at our arguments.
    local @ARGV = @_ if @_;

    # 'i' is accepted for compatibility with existing command lines but
    # has no effect.
    getopts( 'hsij:g:p:', \%args );

    if ( $args{s} ) {

        # Checks the data directory & creates if necessary.
        $this->checkDirectory();
        return 1;
    }

    if ( $args{j} ) {

        # Read a junk mail file & update word database
        $this->updateJunkData( $args{j} );
        return 1;
    }

    if ( $args{g} ) {

        # Read a good mail file & update word database
        $this->updateGoodData( $args{g} );
        return 1;
    }

    if ( $args{p} ) {

        # Process file and filter according to probability of being junk
        return $this->processMail( $args{p} );
    }

    if ( $args{h} ) {
        print( "junkFilter version $VERSION\n" );
        print( "junkFilter -s Create database dir if necessary.\n");
        print( " -p file Process file against database\n");
        print( " -j file | -g file Process junk & good files respectively\n");
        print( " -h Help\n");
        return 1;
    }

    return $this->processMail();
}

# Run as a program when executed directly; stay quiet when the file is
# loaded with require/do so the package can be used from other code.
exit( junkFilter->new()->run(@ARGV) ) unless caller;

1;