#!/bin/bash
# 
# Copyright (C) 2002 Laird Breyer
#  
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
# 
# Author:   Laird Breyer <laird@lbreyer.com>
#
# PROGNAME is the path this script was started with (used to re-invoke
# itself); PROGNAME2 is the bare file name shown in the usage text.
PROGNAME="$0"
PROGNAME2=$(basename -- "$0")
# The "\n" below is stored literally; code printing VERSION must expand it.
VERSION="$PROGNAME version 1.14\nCross validation simulator"
# The whole working tree lives in mailcross.d below the current directory.
MXDIR="$PWD/mailcross.d"
# activity log: every command appends a "=== ..." line here
ALOG="$MXDIR/log/activity.log"
# results written by the "run" command, read by "summarize" and "review"
CLOG="$MXDIR/log/crossval.log"
# scores written by the "biplot" command
BLOG="$MXDIR/log/biplot.log"
# accumulated reports written by "testsuite run"
SLOG="$MXDIR/log/summary.log"
TEMPDIR="$MXDIR/tmp"
# directory holding the selectable classification wrappers
BOOTSTRAP="/opt/local/share/dbacl/testsuite"
# directory holding the wrappers that were selected for the testsuite
FILTERS="$MXDIR/filters"

# use this for debugging
# set -x

# Print the command synopsis on stdout and terminate the script with
# exit status 1.  Reads PROGNAME2 for the displayed program name.
usage() {
    cat <<EOF

Usage: 

    $PROGNAME2 prepare size
    $PROGNAME2 add category [MBOX]...
    $PROGNAME2 learn
    $PROGNAME2 run
    $PROGNAME2 summarize [LEVEL]
    $PROGNAME2 review TRUECAT PREDCAT
    $PROGNAME2 biplot CAT1 CAT2
    $PROGNAME2 clean
    $PROGNAME2 killall

    $PROGNAME2 testsuite select [FILTER]...
    $PROGNAME2 testsuite deselect [FILTER]...
    $PROGNAME2 testsuite list
    $PROGNAME2 testsuite status
    $PROGNAME2 testsuite run
    $PROGNAME2 testsuite summarize

EOF
    exit 1
}




# Abort the script unless an external command is available.
#   $1 - name of the command to look up
#   $2 - name of the package to suggest in the error message
# "type -p" only reports files found on disk, so shell builtins and
# functions of the same name do not count.  On failure a message is
# written to stderr and the script exits with status 1.
prerequisite_command() {
    if [ -z "$(type -p "$1")" ]; then
        echo "Error: $1 not found. Please install $2 to proceed." >&2
        exit 1
    fi
}

# Remove the whole working tree rooted at MXDIR.
# Prints "Nothing to clean" when the tree does not exist.  An empty MXDIR
# is rejected (status 1) so that rm is never run without a real target.
clean_working_tree() {
    if [ -z "$MXDIR" ]; then
        echo "Error: MXDIR is not set; refusing to clean." >&2
        return 1
    fi

    if [ -e "$MXDIR" ]; then
        # quoted: an unquoted path would be word-split and rm would be
        # handed the fragments instead of the tree
        rm -rf -- "$MXDIR"
    else
        echo "Nothing to clean"
    fi
}

# Write a one-message placeholder mailbox to $MXDIR/mbox/dummy.mailbox.
# The here-document delimiter is quoted so the text is copied verbatim;
# with an unquoted delimiter a backslash before the apostrophe would end
# up in the Subject line.
make_dummy_mbox() {
    cat > "$MXDIR/mbox/dummy.mailbox" <<'EOF'
From MAILER-DAEMON Thu Dec  4 13:50:55 2003
Date: 04 Dec 2003 13:50:55 +1000
From: Mail System Internal Data <MAILER-DAEMON@scooby>
Subject: DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA
Message-ID: <1070509855@scooby>
X-IMAP: 1023007291 0000010227
Status: RO

This text is part of the internal format of your mail folder, and is not
a real message.  It is created automatically by the mail system software.
If deleted, important folder data will be lost, and it will be re-created
with the data reset to initial values.

EOF
}

# Create the working tree below MXDIR.
#   $1 - number of cross-validation subsets; must be a positive integer
# Creates the tmp, log, plots, mbox, review and filters directories plus
# one directory per subset named 0 .. size-1, then records the call in
# the activity log.  Any invalid request prints an error and ends in
# usage.  Returns 1 if a directory cannot be created.
prepare_working_tree() {
    local count sub i

    if [ -d "$MXDIR" ]; then
        echo "Error: Directory $MXDIR already exists. Remove it or use it."
        usage
    elif [ -z "$1" ]; then
        echo "Error: Please specify a number."
        usage
    else
        # anything that is not a plain run of digits counts as "not > 0"
        case $1 in
            *[!0-9]*) count=0 ;;
            *) count=$((10#$1)) ;;  # 10# keeps "08" from being read as octal
        esac

        if [ "$count" -gt 0 ]; then
            mkdir "$MXDIR" || return 1
            for sub in tmp log plots mbox review filters; do
                mkdir "$MXDIR/$sub" || return 1
            done
            for ((i = 0; i < count; i++)); do
                mkdir "$MXDIR/$i" || return 1
            done
            echo "=== prepare_working_tree $*" >> "$ALOG"
        else
            echo "Error: Please specify a number greater than zero."
            usage
        fi
    fi
}

# Collect the category mailboxes present anywhere below MXDIR.
# Sets the global CATS to the sorted, de-duplicated file names (each
# still carrying its .mbox suffix), one per line.  Prints
# "No categories found" when there are none.
get_categories() {
    CATS=$(find "$MXDIR" -type f -name '*.mbox' -exec basename {} \; | sort -u)
    if [ -z "$CATS" ]; then
        echo "No categories found"
    fi
}

# Collect the executable wrapper scripts stored in FILTERS.
# Sets the global FILTS to the sorted, de-duplicated file names, one per
# line.  Prints "No filters found" when there are none.
get_filters() {
    # "any execute bit set" is spelled as three -perm -mode tests because
    # the older "-perm +0111" form is rejected by current GNU find
    FILTS=$(find "$FILTERS" -type f \
        \( -perm -0100 -o -perm -0010 -o -perm -0001 \) \
        -exec basename {} \; | sort -u)
    if [ -z "$FILTS" ]; then
        echo "No filters found"
    fi
}

# Count the subset directories of the working tree.
# Sets the global NUM to the number of entries directly below MXDIR whose
# name starts with a digit.  If there are none, prints an error and ends
# in usage.
get_number_of_subsets() {
    local entry

    NUM=0
    # a glob instead of parsing ls: safe for paths containing whitespace
    for entry in "$MXDIR"/[0-9]*; do
        [ -e "$entry" ] || continue   # pattern stays literal when unmatched
        NUM=$((NUM + 1))
    done

    if [ "$NUM" -le 0 ]; then
        echo "error: you need to prepare first."
        usage
    fi
}

# Copy the messages of one confusion-matrix cell into $MXDIR/review.
#   $1 - path where the temporary per-message helper script is written
#   $2 - true category
#   $3 - predicted category
# For every matching line of the crossval log the corresponding mailbox is
# split with formail and each message is handed to the helper script.  The
# helper keeps a message when its first line matches the sender and date
# recorded in the log.  Files left by an earlier review are removed first.
# Requires a prepared tree (ends in usage otherwise).
review_misclassified() {
    local helper=$1
    local line box
    shift

    if [ ! -d "$MXDIR/review" ]; then
        echo "Error: You need to prepare first."
        usage
    fi

    # the glob has to sit outside the quotes, otherwise rm looks for a
    # file literally named "*" and nothing is ever removed
    rm -f -- "$MXDIR/review"/*

    # Helper run by formail for each message.  POSITION, EMAIL and DATE
    # are exported below; FILENO is provided by formail -s.
    cat > "$helper" <<EOF
#!/bin/sh
cat > msg.tmp.\$POSITION.\$FILENO
M=\`head -1 msg.tmp.\$POSITION.\$FILENO | grep -e "\$EMAIL.*\$DATE"\`
if [ -n "\$M" ]; then
    mv msg.tmp.\$POSITION.\$FILENO "$MXDIR/review/\$1.\$2.\$POSITION.\$FILENO"
else
    rm msg.tmp.\$POSITION.\$FILENO
fi
EOF

    if [ -s "$CLOG" ]; then
        grep " $1 $2 " "$CLOG" | \
            while read -r line; do
                # log line: location, true, predicted, sender, date...
                # DATE keeps its inner spacing so that it can still match
                # the From line it was copied from
                read -r POSITION box _ EMAIL DATE <<< "$line"
                export POSITION EMAIL DATE
                echo "$line"
                # fall back to the mbox directory when the subset has no
                # mailbox for this category
                [ -e "$MXDIR/$POSITION/$box.mbox" ] || export POSITION=mbox
                formail -s /bin/sh "$helper" "$1" "$2" \
                    < "$MXDIR/$POSITION/$box.mbox"
            done
    else
        echo "Error: There are no logs - run the classifier(s) first"
    fi

    rm -f -- "$helper"
}

# List the classification wrappers found in BOOTSTRAP.
# Prints a heading followed by one "name - description" line per wrapper;
# the description is whatever the wrapper prints for its "describe"
# command.  Only executable regular files are considered, so
# sub-directories are not run by mistake.
testsuite_list_wrappers() {
    local wrapper

    if [ ! -d "$BOOTSTRAP" ]; then
        echo "Bootstrap directory $BOOTSTRAP does not exist."
        return
    fi

    printf 'The following classification wrappers are selectable:\n\n'
    for wrapper in "$BOOTSTRAP"/*; do
        if [ -f "$wrapper" ] && [ -x "$wrapper" ]; then
            echo "${wrapper##*/} - $("$wrapper" describe)"
        fi
    done
}

# Remove previously selected wrappers from FILTERS.
#   $1     - the sub-command word ("deselect"); discarded
#   $2...  - names of the filters to remove
# Ends in usage when no filter name is given.  Unknown names are reported
# and skipped.
testsuite_deselect_wrappers() {
    local f

    shift
    if [ -z "$*" ]; then
        usage
    else
        for f in "$@"; do
            if [ -e "$FILTERS/$f" ]; then
                echo "deselecting $f"
                rm -f -- "$FILTERS/$f"
            else
                echo "$f: no such filter."
            fi
        done
    fi
}

# Select wrappers from BOOTSTRAP for the testsuite.
#   $1     - the sub-command word ("select"); discarded
#   $2...  - names of the wrappers to select
# Each wrapper that is an executable regular file is run as
# "<wrapper> bootstrap <FILTERS>"; anything else is reported and skipped.
# Ends in usage when no wrapper name is given.
testsuite_select_wrappers() {
    local f

    shift
    if [ -z "$*" ]; then
        usage
    else
        for f in "$@"; do
            if [ -f "$BOOTSTRAP/$f" ] && [ -x "$BOOTSTRAP/$f" ]; then
                "$BOOTSTRAP/$f" bootstrap "$FILTERS"
            else
                echo "The wrapper $f cannot be selected, skipping."
            fi
        done
    fi
}

# Turn a crossval log read from stdin into two confusion tables and an
# overall score, written to stdout.
# Reads the global CATS (category file names ending in .mbox).  Log lines
# starting with "#" and empty lines are ignored; field 2 of every other
# line is taken as the true category and field 3 as the predicted one.
# Categories are shown in the order they appear in CATS.  A category or a
# log without any rows is reported as 0.00% instead of aborting awk with a
# division by zero.
summarize_log() {
    local catnames

    catnames=${CATS//.mbox/}
    # hand awk a single-line list: not every awk accepts a raw newline
    # inside a -v assignment
    catnames=${catnames//$'\n'/ }

    awk -v "cats=$catnames" '
# share of part in whole as a percentage; 0 when whole is empty
function pct(part, whole) {
    return (whole > 0) ? (100 * part) / whole : 0
}
BEGIN {
    ncats = split(cats, names)
}
/^[^#]/ {
    f[$2,$3]++     # messages of true category $2 predicted as $3
    fn[$3]++       # messages predicted as $3
    fp[$2]++       # messages whose true category is $2
}
END {
    printf("Where do misclassifications go?\n(numbers on diagonal represent \"recall\")\n\n")

    printf("  true     | but predicted as...\n")
    printf("    *      | ")
    for (i = 1; i <= ncats; i++) printf("%10s", names[i])
    printf("\n")

    for (i = 1; i <= ncats; i++) {
        printf("%-10s | ", names[i])
        for (j = 1; j <= ncats; j++) {
            printf("%9.2f%%", pct(f[names[i],names[j]], fp[names[i]]))
        }
        printf("\n")
    }

    printf("\n")

    printf("What is really in each category after prediction?\n(numbers on diagonal represent \"precision\")\n\n")

    printf("category   | contains mixture of...\n")
    printf("    *      | ")
    for (i = 1; i <= ncats; i++) printf("%10s", names[i])
    printf("\n")

    for (i = 1; i <= ncats; i++) {
        printf("%-10s | ", names[i])
        for (j = 1; j <= ncats; j++) {
            printf("%9.2f%%", pct(f[names[j],names[i]], fn[names[i]]))
        }
        printf("\n")
    }

    printf("\n")

    correct = total = 0
    for (i = 1; i <= ncats; i++) {
        correct += f[names[i],names[i]]
        for (j = 1; j <= ncats; j++) {
            total += f[names[i],names[j]]
        }
    }

    printf("Total correct classifications: %9.2f%%\n\n", pct(correct, total))
}
'
}

# Plot the running error count of a classification log with gnuplot.
#   $1    - log file; lines starting with "#" are skipped and fields 1-3
#           are compared as (group, true category, predicted category)
#   $2    - plot title
#   $3... - options: "ps" writes a PostScript file instead of opening an
#           x11 window, "logscale" switches both axes to log scale
# The data and gnuplot command files are written to $MXDIR/plots, named
# after the log file without its .log suffix.
plot_errors() {
    local logfile=$1
    local title=$2
    local outfile cmdfile datafile opt
    local terminal="set terminal x11"
    local scale="unset logscale"
    local pause="pause -1 \"Press any key...\""
    local outprint=""

    outfile="$MXDIR/plots/$(basename "$logfile" .log)"
    cmdfile="$outfile.cmd"
    datafile="$outfile.plotdata"

    # per group: message index and number of errors so far; a blank line
    # is emitted in front of the first record of every group
    grep -v '^#' "$logfile" \
        | cut -f1,2,3 -d ' ' \
        | awk '{count[$1]++; ecount[$1] += ($2 != $3); print (count[$1] == 1) ? "\n" : "", count[$1], ecount[$1]}' \
        > "$datafile"

    shift 2
    for opt in "$@"; do
        if [ "$opt" = "ps" ]; then
            terminal="set terminal postscript"
            outprint="set output \"$outfile.ps\""
            pause=""   # nothing to wait for when writing to a file
            echo "writing $outfile.ps"
        elif [ "$opt" = "logscale" ]; then
            scale="set logscale"
        fi
    done

    cat > "$cmdfile" <<EOF
$terminal
$outprint
$scale
unset key
set title "$title"
plot "$datafile" with points
$pause
EOF
    gnuplot "$cmdfile"
}

# Draw a two-category score plot with gnuplot.
#   $1    - base name of the output files, also used as plot title
#   $2    - score log; lines starting with "#" are skipped, fields 5 and
#           14 are plotted against each other
#   $3... - options: "ps" writes a PostScript file instead of opening an
#           x11 window, "logscale" switches both axes to log scale
# All points go to <name>.plotdata.  Points for which the comparison of
# fields 2 and 3 disagrees with the ordering of fields 5 and 14 are also
# written to <name>.ploterrors and drawn with a second point type.
plot_biplot() {
    local title=$1
    local logfile=$2
    local outfile="$MXDIR/plots/$1"
    local cmdfile="$outfile.cmd"
    local datafile="$outfile.plotdata"
    local errorfile="$outfile.ploterrors"
    local opt
    local terminal="set terminal x11"
    local scale="unset logscale"
    local pause="pause -1 \"Press any key...\""
    local outprint=""

    awk '!/^#/ {print $5, $14}' "$logfile" > "$datafile"
    awk '!/^#/ && ((($2 == $3) && ($5 > $14)) || (($2 != $3) && ($5 < $14))) {print $5, $14}' \
        "$logfile" > "$errorfile"

    shift 2
    for opt in "$@"; do
        if [ "$opt" = "ps" ]; then
            terminal="set terminal postscript"
            outprint="set output \"$outfile.ps\""
            pause=""   # nothing to wait for when writing to a file
            echo "writing $outfile.ps"
        elif [ "$opt" = "logscale" ]; then
            scale="set logscale"
        fi
    done

    cat > "$cmdfile" <<EOF
$terminal
$outprint
$scale
unset key
set title "$title"
plot "$datafile" using 1:2 pt 1, "$errorfile" using 1:2 pt 5, x

$pause
EOF
    gnuplot "$cmdfile"
}


# check this for environment variable overrides
if [ -e "$HOME/.mailcrossrc" ]; then
    . "$HOME/.mailcrossrc"
fi

# Each MAILCROSS_* command below is only given a default when it is unset
# or empty, so values from the environment or the rc file win.

# this is the default filter
: "${MAILCROSS_FILTER:=dbacl -v -m -T email}"

# this is the default scorer
: "${MAILCROSS_SCORER:=dbacl -T email -m -v -X}"

# this is the default learner
if [ -z "$MAILCROSS_LEARNER" ]; then
    MAILCROSS_LEARNER="dbacl -H 19 -T email -l"
    MAILCROSS_CPREFIX="-c" # needed by dbacl
fi

export TEMPDIR
# main switch statement - this processes commands
#
# Notes that apply to several branches:
# - MAILCROSS_FILTER, MAILCROSS_SCORER and MAILCROSS_LEARNER hold a command
#   plus its options and are therefore expanded unquoted on purpose.
# - formail -s runs the given command once per message of the mailbox it
#   reads on stdin.
# - Paths that end up inside a "bash -c" string carry their own embedded
#   quotes so that a working directory containing spaces survives.
case $1 in

    '-V')
        # VERSION carries a literal "\n"; %b turns it into a line break
        printf '%b\n' "$VERSION"
        ;;

    clean) # delete working tree
        clean_working_tree
        ;;

    killall)
        prerequisite_command "killall" "killall"
        killall -9 -g mailcross
        ;;

    prepare) # create directory tree
        shift
        prepare_working_tree "$@"
        ;;

    add) # distribute messages of one category over the subsets
        shift
        CATNAME="$1"
        if [ -z "$CATNAME" ]; then
            echo "error: missing category name."
            usage
        fi
        # the name becomes a file name and is later iterated as a word
        case $CATNAME in
            */* | *[[:space:]]*)
                echo "error: category name must not contain slashes or whitespace."
                usage
                ;;
        esac

        prerequisite_command "formail" "mailutils"
        get_number_of_subsets

        echo "=== $PROGNAME $*" >> "$ALOG"

        shift
        # run per message: append it to the category mailbox of a randomly
        # chosen subset (\$RANDOM is evaluated anew for every message)
        SPLITTER="cat >> \"$MXDIR/\$((\$RANDOM % $NUM))/$CATNAME.mbox\""
        # either add files on command line or from stdin
        if [ -n "$*" ]; then
            cat "$@" | formail -s /bin/bash -c "$SPLITTER"
        else
            formail -s /bin/bash -c "$SPLITTER"
        fi
        ;;

    learn) # for every subset, learn each category from all other subsets
        shift
        get_number_of_subsets
        NUM=$((NUM - 1)) # we count from zero to NUM-1
        get_categories

        prerequisite_command "seq" "shellutils"

        echo "=== $PROGNAME learn $*" >> "$ALOG"

        for n in $CATS; do

            CAT=$(basename "$n" .mbox)
            echo "Learning $CAT" >> "$ALOG"

            for i in $(seq 0 "$NUM"); do

                echo "| $MAILCROSS_LEARNER $MXDIR/$i/$CAT $*" >> "$ALOG"
                for j in $(seq 0 "$NUM"); do
                    # a subset may have received no message of this category
                    if [ "$i" != "$j" ] && [ -f "$MXDIR/$j/$n" ]; then
                        echo "    cat $MXDIR/$j/$n |" >> "$ALOG"
                        cat "$MXDIR/$j/$n"
                    fi
                done | $MAILCROSS_LEARNER "$MXDIR/$i/$CAT" "$@"

            done

        done
        ;;

    run) # classify every message with the categories learned for its subset
        shift
        get_number_of_subsets
        NUM=$((NUM - 1)) # we count from zero to NUM-1
        get_categories
        STUFF="$MXDIR/log/run.stuff"

        prerequisite_command "formail" "mailutils"
        prerequisite_command "sed" "sed"
        prerequisite_command "seq" "shellutils"

        echo "=== $PROGNAME run $*" >> "$ALOG"

        echo "# location | true | predicted | from" > "$CLOG"

        for i in $(seq 0 "$NUM"); do

            ARGS=""
            for n in $CATS; do
                ARGS="$ARGS$MAILCROSS_CPREFIX \"$MXDIR/$i/$(basename "$n" .mbox)\" "
            done
            COMMAND="$MAILCROSS_FILTER $ARGS"
            echo "| $COMMAND" >> "$ALOG"

            for m in $CATS; do
                # a subset may have received no message of this category
                [ -f "$MXDIR/$i/$m" ] || continue
                CAT=$(basename "$m" .mbox)
                # per message: sed saves the From line (minus "From") to
                # STUFF while passing the message through unchanged; the
                # log line is "subset true-category <filter output>"
                # followed by the saved sender and date
                formail -s /bin/bash -c \
                    "sed -e 'h' -e '/^From /s/^From//w $STUFF' -e 'x' | $COMMAND | xargs echo -ne '$i $CAT' && cat \"$STUFF\"" \
                    < "$MXDIR/$i/$m" >> "$CLOG"
                echo "    cat $MXDIR/$i/$m |" >> "$ALOG"
            done

        done
        ;;

    summarize)
        shift
        get_number_of_subsets # includes check that directory tree is present
        get_categories

        prerequisite_command "awk" "awk or equivalent"

        echo "=== $PROGNAME summarize $*" >> "$ALOG"

        if [ -s "$CLOG" ]; then
            summarize_log < "$CLOG"
        else
            echo "Error: No results found. You must run the cross validation first."
            usage
        fi
        ;;

    review)
        shift
        prerequisite_command "formail" "mailutils"
        prerequisite_command "grep" "grep"

        if [ -z "$1" ] || [ -z "$2" ]; then
            echo "Error: Missing category, e.g. $PROGNAME review notspam spam"
        else
            review_misclassified "$MXDIR/tmp/save_msg.sh" "$1" "$2"
        fi
        ;;

    biplot) # score every message against two categories and plot the result
        shift
        get_number_of_subsets
        NUM=$((NUM - 1)) # we count from zero to NUM-1
        if [ -z "$1" ] || [ -z "$2" ]; then
            echo "Error: Needs two categories, e.g. $PROGNAME biplot spam notspam"
            usage
        else
            CAT1=$1
            CAT2=$2
            CATS="$1 $2"

            prerequisite_command "formail" "mailutils"
            prerequisite_command "sed" "sed"
            prerequisite_command "seq" "shellutils"

            echo "=== $PROGNAME biplot $*" >> "$ALOG"
            echo "# location | true | cat1 ... | cat2 ..." > "$BLOG"

            for i in $(seq 0 "$NUM"); do

                ARGS=""
                for n in $CATS; do
                    ARGS="$ARGS$MAILCROSS_CPREFIX \"$MXDIR/$i/$n\" "
                done
                COMMAND="$MAILCROSS_SCORER $ARGS"
                echo "| $COMMAND" >> "$ALOG"

                for m in $CATS; do
                    # a subset may have received no message of this category
                    [ -f "$MXDIR/$i/$m.mbox" ] || continue
                    formail -s /bin/bash -c \
                        "$COMMAND | xargs echo -ne '$i $m' && echo" \
                        < "$MXDIR/$i/$m.mbox" >> "$BLOG"
                    echo "    cat $MXDIR/$i/$m |" >> "$ALOG"
                done

                echo >> "$BLOG"

            done

            plot_biplot "$CAT1-$CAT2" "$BLOG"

        fi
        ;;

    testsuite)
        shift
        case $1 in
            list)
                testsuite_list_wrappers
                ;;

            deselect)
                testsuite_deselect_wrappers "$@"
                ;;

            select)
                testsuite_select_wrappers "$@"
                ;;

            status)
                echo -e "The following categories are ready to be cross validated:\n"
                get_categories
                for c in $CATS; do
                    echo -n "$c - counting... "
                    # one From line per message, summed over all directories
                    COUNT=$(cat "$MXDIR"/*/"$c" 2>/dev/null | grep -c '^From ')
                    echo "$COUNT messages"
                done

                echo -e "\nThe following classifiers are ready to be cross validated:\n"
                get_filters
                for f in $FILTS; do
                    echo "$f - $("$FILTERS/$f" describe)"
                done
                ;;

            run) # learn, run and summarize once per selected wrapper
                get_filters
                for f in $FILTS; do

                    echo -ne "Now testing: "
                    "$FILTERS/$f" describe

                    echo "Cleanup."
                    "$FILTERS/$f" clean "$MXDIR"

                    # picked up by the child invocations of this script
                    export MAILCROSS_FILTER="$FILTERS/$f filter"
                    export MAILCROSS_LEARNER="$FILTERS/$f learn"

                    echo "Learning."
                    time "$PROGNAME" learn

                    echo "Running."
                    time "$PROGNAME" run

                    echo "Writing results."
                    echo -e "\n---------------" >> "$SLOG"
                    "$FILTERS/$f" describe >> "$SLOG"
                    # the timestamp belongs to the report header; writing
                    # it to the crossval log would add a bogus result line
                    date >> "$SLOG"
                    echo "---------------" >> "$SLOG"
                    "$PROGNAME" summarize >> "$SLOG"

                done
                ;;

            summarize)
                if [ -s "$SLOG" ]; then
                    cat "$SLOG"
                else
                    echo "Error: No results found. You must run the testsuite first."
                fi
                ;;

            *)
                usage
                ;;
        esac
        ;;

    *)
        usage
        ;;
esac

exit 0