#!/bin/sh

# parmtest.sh - a parameter testing script.
#
# This script can be used to test a bogofilter option
# to see whether the option makes bogofilter more
# effective or less effective.
#
# It is presently set to test 'tag_header_lines', but
# by creating different config files can be used to test
# other config file options.
#
# For each test named in variable TESTS, a corresponding
# config file, e.g. $test.cf, is needed.  Bogofilter is
# trained using $test.cf to create wordlists for the test.
# The newly created wordlists are saved in directory $test.d.
# This allows a test to be run using the same parameters
# used to build the wordlists.
#
# For each mbox file used in testing, an output file is
# created that gives the result of testing each message
# in the mbox.  The result lines include:
#
#	S/H/U - the spam/ham/unsure classification.
#	spamicity - 0.0 to 1.0
#	message number - 0000, 0001, ...
#	subject line - "Subject: This is a test"
#
# A summary line is generated for each file giving
# test name, filename, S count, H count, and U count,
# for example:
#
#	default.spam.mbx.out 1603    5  136
#	default.good.mbx.out    3 4934  108
#
# Usage:
#	parmtest.sh		run every test in TESTS (driver mode)
#	parmtest.sh CFG OUT	score one message read from stdin with
#				config CFG and append the result to OUT;
#				this form is what the driver hands to
#				'formail -s' for each message of an mbox.

BIN=~/bin
BOGOUTIL="$BIN/bogoutil"
BOGOFILTER="$BIN/bogofilter"

# scratch directory holding the wordlists bogofilter reads and writes
TEST_DIR="./test.bogofilter"
BOGOFILTER_DIR="$TEST_DIR"
export BOGOFILTER_DIR

#specify files for creating the test wordlists
SPAM_TRAIN=$(ls spam_train*mbx)
GOOD_TRAIN=$(ls good_train*mbx)

#specify files for testing
SPAM_TEST=$(ls spam_test*mbx)
GOOD_TEST=$(ls good_test*mbx)

#specify names of tests to run
TESTS="default tag_header_lines"

# Create base.cf, copy it to $test.cf for every test in TESTS, then
# append the per-test option that distinguishes each config file.
write_configs() {
    # create standard (base) config file
    cat <<'EOF' > base.cf
algorithm = fisher
ham_cutoff = 0.10
spam_cutoff = 0.95
terse_format = %1.1c %f
header_format = %1.1c
spamicity_tags = S, H, U
spamicity_formats = %0.6f %0.6f %0.6f
EOF

    # copy base config file for each test
    for test in $TESTS ; do
	cat base.cf > "$test.cf"
    done

    # customize config files
    cat <<'EOF' >> default.cf
tag_header_lines=no
EOF
    cat <<'EOF' >> tag_header_lines.cf
tag_header_lines=yes
EOF
}

# Build the wordlists for test $1 from the training mboxes, using
# $1.cf, and save them in directory $1.d.
train_wordlists() {
    name=$1

    mkdir "$name.d"
    rm -f "$TEST_DIR"/*.db

    # NOTE(review): both -f checks below look in the current directory,
    # not in $TEST_DIR -- confirm that this is the intended location.
    if [ ! -f spamlist.db ] ; then
	# SPAM_TRAIN is a whitespace-separated list: split on purpose
	for m in $SPAM_TRAIN ; do
	    printf '%s\n' "$m"
	    "$BOGOFILTER" -v -c "$name.cf" -s < "$m"
	done
	cp -p "$TEST_DIR/spamlist.db" "$name.d"
    fi

    if [ ! -f goodlist.db ] ; then
	for m in $GOOD_TRAIN ; do
	    printf '%s\n' "$m"
	    # same config file as the spam pass, so that both wordlists
	    # are built with identical parameters
	    "$BOGOFILTER" -v -c "$name.cf" -n < "$m"
	done
	cp -p "$TEST_DIR/goodlist.db" "$name.d"
    fi

    # query the .MSG_COUNT entry of the freshly saved wordlists
    "$BOGOUTIL" -w "$name.d" .MSG_COUNT
}

# Score every test mbox with the config of test $1.  Writes one
# $1.<mbox>.out file per mbox and prints its S/H/U summary line.
score_mboxes() {
    name=$1
    cfg="$name.cf"

    # the test lists are whitespace-separated: split on purpose
    for mbox in $SPAM_TEST $GOOD_TEST ; do
	b=$(basename "$mbox")
	f="$name.$b.out"
	: > "$f"

	# formail re-runs this script (worker mode) once per message;
	# FILENO is preset in formail's environment so the worker sees
	# message numbers of the form 0000, 0001, ...
	FILENO=0000 formail -s "$0" "$cfg" "$f" < "$mbox"

	s=$(grep -c "^S" "$f")
	h=$(grep -c "^H" "$f")
	u=$(grep -c "^U" "$f")
	printf "%-20s %4d %4d %4d\n" "$(basename "$f")" "$s" "$h" "$u"
    done
}

# Worker mode: classify the single message on stdin with config $1 and
# append "<result> <message number> <subject line>" to file $2.
score_message() {
    cfg=$1
    out=$2
    tmp="$$.tmp"

    # the message is read twice (bogofilter, then grep), so spool it
    cat > "$tmp"
    result=$("$BOGOFILTER" -t -v -c "$cfg" < "$tmp" | head -n 1)
    subject=$(grep '^Subject:' "$tmp" | head -n 1)

    # quoted printf: a Subject containing '*' or '?' must not be
    # expanded against the files in the current directory
    printf '%s %s %s\n' "$result" "$FILENO" "$subject" >> "$out"
    rm -f "$tmp"
}

if [ -z "$1" ] ; then
    # driver mode
    write_configs

    [ -d "$TEST_DIR" ] || mkdir "$TEST_DIR"

    date

    for test in $TESTS ; do
	printf '%s\n' "$test"

	# train (build wordlists for testing) unless already saved
	[ -d "$test.d" ] || train_wordlists "$test"

	# copy wordlists for use
	cp -pf "$test.d"/*.db "$TEST_DIR"

	# test (score accuracy)
	score_mboxes "$test"

	echo ""
    done
else
    score_message "$1" "$2"
fi