mox/junk.go

package main

/*
Note: these testdata paths are not in the repo; you should gather some of your
own ham/spam emails.

./mox junk train testdata/train/ham testdata/train/spam
./mox junk train -sent-dir testdata/sent testdata/train/ham testdata/train/spam
./mox junk check 'testdata/check/ham/mail1'
./mox junk test testdata/check/ham testdata/check/spam
./mox junk analyze testdata/train/ham testdata/train/spam
./mox junk analyze -top-words 10 -train-ratio 0.5 -spam-threshold 0.85 -max-power 0.01 -sent-dir testdata/sent testdata/train/ham testdata/train/spam
./mox junk play -top-words 10 -train-ratio 0.5 -spam-threshold 0.85 -max-power 0.01 -sent-dir testdata/sent testdata/train/ham testdata/train/spam
*/

import (
	"flag"
	"fmt"
	"log"
	mathrand "math/rand"
	"os"
	"runtime"
	"runtime/pprof"
	"sort"
	"time"

	"github.com/mjl-/mox/junk"
	"github.com/mjl-/mox/message"
	"github.com/mjl-/mox/mlog"
	"github.com/mjl-/mox/mox-"
)

// junkArgs holds the settings shared by the junk subcommands. The fields are
// filled from command-line flags registered by junkFlags.
type junkArgs struct {
	// Parameters passed to the junk filter when creating or opening it.
	params junk.Params

	// File to store a CPU profile in; empty disables CPU profiling.
	cpuprofile string

	// File to store a memory profile in; empty disables memory profiling.
	memprofile string

	// Probability where a message is seen as spam.
	spamThreshold float64

	// Part of the data to use for training versus analyzing.
	trainRatio float64

	// Whether to seed the PRNG before analysis.
	seed bool

	// Directory with sent mails, for training; empty for none.
	sentDir string

	// Database file for ham/spam words.
	databasePath string

	// Bloom filter file for ignoring unique strings.
	bloomfilterPath string

	// Whether to print debug logging when calculating spam probability.
	debug bool
}

// Memprofile writes a heap profile to the file named by a.memprofile. It does
// nothing when no memory profile file was configured. Errors creating or
// writing the file are passed to xcheckf.
func (a junkArgs) Memprofile() {
	path := a.memprofile
	if path == "" {
		return
	}

	out, err := os.Create(path)
	xcheckf(err, "creating memory profile")
	defer out.Close()

	// Run a garbage collection first so the profile has up-to-date statistics.
	runtime.GC()
	xcheckf(pprof.WriteHeapProfile(out), "writing memory profile")
}

// Profile starts CPU profiling if a CPU profile file was configured, and
// returns a function to call when the work is done. The returned function
// stops the CPU profile and closes its file (when one was started), and then
// writes the memory profile through Memprofile.
func (a junkArgs) Profile() func() {
	if a.cpuprofile != "" {
		out, err := os.Create(a.cpuprofile)
		xcheckf(err, "creating CPU profile")
		xcheckf(pprof.StartCPUProfile(out), "start CPU profile")
		return func() {
			pprof.StopCPUProfile()
			out.Close()
			a.Memprofile()
		}
	}

	// No CPU profile requested, only the memory profile remains to be written.
	return a.Memprofile
}

// SetLogLevel sets the default log level (the "" entry in mox.Conf.Log) to
// debug when the debug flag is set and to info otherwise, and then applies the
// resulting log configuration with mlog.SetConfig.
func (a junkArgs) SetLogLevel() {
	if a.debug {
		mox.Conf.Log[""] = mlog.LevelDebug
	} else {
		mox.Conf.Log[""] = mlog.LevelInfo
	}
	mlog.SetConfig(mox.Conf.Log)
}

// junkFlags registers the flags shared by the junk subcommands on fs and
// returns the junkArgs the flag values are stored in.
//
// The flags are bound to the fields of the returned value, and callers parse
// the flag set only after this function has returned. A pointer is therefore
// returned: returning the struct by value would hand the caller a copy that
// only ever holds the defaults, because parsed values would be written to the
// variable the flags were bound to and not to that copy.
func junkFlags(fs *flag.FlagSet) *junkArgs {
	a := &junkArgs{}

	// Parameters for the junk filter itself.
	fs.BoolVar(&a.params.Onegrams, "one-grams", false, "use 1-grams, i.e. single words, for scoring")
	fs.BoolVar(&a.params.Twograms, "two-grams", true, "use 2-grams, i.e. word pairs, for scoring")
	fs.BoolVar(&a.params.Threegrams, "three-grams", false, "use 3-grams, i.e. word triplets, for scoring")
	fs.Float64Var(&a.params.MaxPower, "max-power", 0.05, "maximum word power, e.g. min 0.05/max 0.95")
	fs.Float64Var(&a.params.IgnoreWords, "ignore-words", 0.1, "ignore words with ham/spaminess within this distance from 0.5")
	fs.IntVar(&a.params.TopWords, "top-words", 10, "number of top spam and number of top ham words from email to use")
	fs.IntVar(&a.params.RareWords, "rare-words", 1, "words are rare if encountered this number during training, and skipped for scoring")
	fs.BoolVar(&a.debug, "debug", false, "print debug logging when calculating spam probability")

	// Settings for the commands: thresholds, input selection and storage paths.
	fs.Float64Var(&a.spamThreshold, "spam-threshold", 0.95, "probability where message is seen as spam")
	fs.Float64Var(&a.trainRatio, "train-ratio", 0.5, "part of data to use for training versus analyzing (for analyze only)")
	fs.StringVar(&a.sentDir, "sent-dir", "", "directory with sent mails, for training")
	fs.BoolVar(&a.seed, "seed", false, "seed prng before analysis")
	fs.StringVar(&a.databasePath, "dbpath", "filter.db", "database file for ham/spam words")
	fs.StringVar(&a.bloomfilterPath, "bloompath", "filter.bloom", "bloom filter for ignoring unique strings")

	// Profiling output.
	fs.StringVar(&a.cpuprofile, "cpuprof", "", "store cpu profile to file")
	fs.StringVar(&a.memprofile, "memprof", "", "store mem profile to file")
	return a
}

// listDir returns the names (without directory prefix) of the entries in dir,
// in the order os.ReadDir returns them. A failure to read the directory is
// passed to xcheckf.
func listDir(dir string) (l []string) {
	entries, err := os.ReadDir(dir)
	xcheckf(err, "listing directory %q", dir)
	for i := range entries {
		l = append(l, entries[i].Name())
	}
	return
}

// must passes err to xcheckf with the context "filter" and returns f. It lets
// the two results of a filter constructor be used in a single expression.
// NOTE(review): xcheckf is defined elsewhere; presumably it aborts the command
// when err is non-nil, so f is only returned for a usable filter — confirm.
func must(f *junk.Filter, err error) *junk.Filter {
	xcheckf(err, "filter")
	return f
}

// cmdJunkTrain implements the "junk train" command: it creates a new junk
// filter and trains it with all messages in hamdir and spamdir, and with the
// messages from the -sent-dir directory if one was given.
func cmdJunkTrain(c *cmd) {
	c.unlisted = true
	c.params = "hamdir spamdir"
	c.help = "Train a junk filter with messages from hamdir and spamdir."
	a := junkFlags(c.flag)
	args := c.Parse()
	if len(args) != 2 {
		c.Usage()
	}
	defer a.Profile()()
	a.SetLogLevel()

	filter, err := junk.NewFilter(mlog.New("junktrain"), a.params, a.databasePath, a.bloomfilterPath)
	xcheckf(err, "filter")
	defer filter.Close()

	hamDir, spamDir := args[0], args[1]
	hamFiles := listDir(hamDir)
	spamFiles := listDir(spamDir)

	// Sent messages are optional, the list stays empty without -sent-dir.
	var sentFiles []string
	if a.sentDir != "" {
		sentFiles = listDir(a.sentDir)
	}

	err = filter.TrainDirs(hamDir, a.sentDir, spamDir, hamFiles, sentFiles, spamFiles)
	xcheckf(err, "train")
}

// cmdJunkCheck implements the "junk check" command: it opens an existing junk
// filter, classifies the single message file given on the command line and
// prints the resulting spam probability with six decimals.
func cmdJunkCheck(c *cmd) {
	c.unlisted = true
	c.params = "mailfile"
	c.help = "Check an email message against a junk filter, printing the probability of spam on a scale from 0 to 1."
	a := junkFlags(c.flag)
	args := c.Parse()
	if len(args) != 1 {
		c.Usage()
	}
	defer a.Profile()()
	a.SetLogLevel()

	filter, err := junk.OpenFilter(mlog.New("junkcheck"), a.params, a.databasePath, a.bloomfilterPath, false)
	xcheckf(err, "filter")
	defer filter.Close()

	mailPath := args[0]
	prob, _, _, _, err := filter.ClassifyMessagePath(mailPath)
	xcheckf(err, "testing mail")

	fmt.Printf("%.6f\n", prob)
}

// cmdJunkTest implements the "junk test" command: it opens an existing junk
// filter, classifies every message in hamdir and spamdir, prints the
// misclassified messages and reports totals, specificity, sensitivity and
// accuracy.
func cmdJunkTest(c *cmd) {
	c.unlisted = true
	c.params = "hamdir spamdir"
	c.help = "Check a directory with hams and one with spams against the junk filter, and report the success ratio."
	a := junkFlags(c.flag)
	args := c.Parse()
	if len(args) != 2 {
		c.Usage()
	}
	defer a.Profile()()
	a.SetLogLevel()

	f := must(junk.OpenFilter(mlog.New("junktest"), a.params, a.databasePath, a.bloomfilterPath, false))
	defer f.Close()

	// testDir classifies all messages in dir, which are expected to be ham if
	// ham is set and spam otherwise. It returns the number of correctly and
	// incorrectly classified messages. Messages that cannot be classified are
	// logged and not counted. A probability exactly at the threshold counts as
	// incorrect for both ham and spam, but is not printed.
	testDir := func(dir string, ham bool) (ok, bad int) {
		entries, err := os.ReadDir(dir)
		xcheckf(err, "readdir %q", dir)
		for _, entry := range entries {
			path := dir + "/" + entry.Name()
			prob, _, _, _, err := f.ClassifyMessagePath(path)
			if err != nil {
				log.Printf("classify message %q: %s", path, err)
				continue
			}

			switch {
			case ham:
				if prob < a.spamThreshold {
					ok++
				} else {
					bad++
				}
				if prob > a.spamThreshold {
					fmt.Printf("ham %q: %.4f\n", path, prob)
				}
			default:
				if prob > a.spamThreshold {
					ok++
				} else {
					bad++
				}
				if prob < a.spamThreshold {
					fmt.Printf("spam %q: %.4f\n", path, prob)
				}
			}
		}
		return ok, bad
	}

	nhamok, nhambad := testDir(args[0], true)
	nspamok, nspambad := testDir(args[1], false)

	hamTotal := nhamok + nhambad
	spamTotal := nspamok + nspambad
	fmt.Printf("total ham, ok %d, bad %d\n", nhamok, nhambad)
	fmt.Printf("total spam, ok %d, bad %d\n", nspamok, nspambad)
	fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/float64(hamTotal))
	fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/float64(spamTotal))
	fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(hamTotal+spamTotal))
}

// cmdJunkAnalyze implements the "junk analyze" command: it shuffles the
// messages in hamdir and spamdir, trains a new junk filter with the first
// -train-ratio part of each (plus all messages from -sent-dir, if given),
// classifies the remaining messages and reports totals, specificity,
// sensitivity and accuracy.
//
// The shuffle uses a fixed seed of 0 unless -seed is given, in which case the
// current time is used as seed.
func cmdJunkAnalyze(c *cmd) {
	c.unlisted = true
	c.params = "hamdir spamdir"
	c.help = `Analyze a directory with ham messages and one with spam messages.

A part of the messages is used for training, and remaining for testing. The
messages are shuffled, with optional random seed.`
	a := junkFlags(c.flag)
	args := c.Parse()
	if len(args) != 2 {
		c.Usage()
	}
	// The ratio is used to compute slice bounds below, so a value outside [0, 1]
	// would cause an out-of-range panic. The condition is written in this form so
	// that NaN is rejected as well.
	if !(a.trainRatio >= 0 && a.trainRatio <= 1) {
		log.Fatalf("train-ratio must be between 0 and 1, got %v", a.trainRatio)
	}
	defer a.Profile()()
	a.SetLogLevel()

	f := must(junk.NewFilter(mlog.New("junkanalyze"), a.params, a.databasePath, a.bloomfilterPath))
	defer f.Close()

	hamDir := args[0]
	spamDir := args[1]
	hamFiles := listDir(hamDir)
	spamFiles := listDir(spamDir)

	var rng *mathrand.Rand
	if a.seed {
		rng = mathrand.New(mathrand.NewSource(time.Now().UnixMilli()))
	} else {
		rng = mathrand.New(mathrand.NewSource(0))
	}

	// Shuffle with the standard library's Shuffle. Swapping every element with
	// an element at a random index over the whole slice does not produce all
	// permutations with equal probability.
	shuffle := func(l []string) {
		rng.Shuffle(len(l), func(i, j int) {
			l[i], l[j] = l[j], l[i]
		})
	}

	shuffle(hamFiles)
	shuffle(spamFiles)

	// With the ratio checked to be in [0, 1], these counts are valid slice bounds.
	ntrainham := int(a.trainRatio * float64(len(hamFiles)))
	ntrainspam := int(a.trainRatio * float64(len(spamFiles)))

	trainHam := hamFiles[:ntrainham]
	trainSpam := spamFiles[:ntrainspam]
	testHam := hamFiles[ntrainham:]
	testSpam := spamFiles[ntrainspam:]

	var trainSent []string
	if a.sentDir != "" {
		trainSent = listDir(a.sentDir)
	}

	err := f.TrainDirs(hamDir, a.sentDir, spamDir, trainHam, trainSent, trainSpam)
	xcheckf(err, "train")

	// testDir classifies the named files in dir, which are expected to be ham if
	// ham is set and spam otherwise. It returns the number of correctly and
	// incorrectly classified messages, and the number of messages that could not
	// be classified. A probability exactly at the threshold counts as incorrect
	// for both ham and spam, but is not printed.
	testDir := func(dir string, files []string, ham bool) (ok, bad, malformed int) {
		for _, name := range files {
			path := dir + "/" + name
			prob, _, _, _, err := f.ClassifyMessagePath(path)
			if err != nil {
				// log.Infof("%s: %s", path, err)
				malformed++
				continue
			}
			if ham && prob < a.spamThreshold || !ham && prob > a.spamThreshold {
				ok++
			} else {
				bad++
			}
			if ham && prob > a.spamThreshold {
				fmt.Printf("ham %q: %.4f\n", path, prob)
			}
			if !ham && prob < a.spamThreshold {
				fmt.Printf("spam %q: %.4f\n", path, prob)
			}
		}
		return ok, bad, malformed
	}

	nhamok, nhambad, nmalformedham := testDir(hamDir, testHam, true)
	nspamok, nspambad, nmalformedspam := testDir(spamDir, testSpam, false)
	fmt.Printf("training done, nham %d, nsent %d, nspam %d\n", ntrainham, len(trainSent), ntrainspam)
	fmt.Printf("total ham, ok %d, bad %d, malformed %d\n", nhamok, nhambad, nmalformedham)
	fmt.Printf("total spam, ok %d, bad %d, malformed %d\n", nspamok, nspambad, nmalformedspam)
	fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/(float64(nhamok+nhambad)))
	fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/(float64(nspamok+nspambad)))
	fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(nhamok+nhambad+nspamok+nspambad))
}

// cmdJunkPlay implements the "junk play" command: it collects the messages
// from hamdir, spamdir and the optional -sent-dir directory, orders them by
// the date in their envelope, and then processes them one by one as if they
// were arriving. Ham and spam messages are first classified, with the result
// counted as right or wrong, and then used for training. Sent messages are
// only used for training, as ham. Finally the filter is saved and the totals,
// specificity, sensitivity and accuracy are printed.
func cmdJunkPlay(c *cmd) {
	c.unlisted = true
	c.params = "hamdir spamdir"
	c.help = "Play messages from ham and spam directory according to their time of arrival and report on junk filter performance."
	a := junkFlags(c.flag)
	args := c.Parse()
	if len(args) != 2 {
		c.Usage()
	}
	defer a.Profile()()
	a.SetLogLevel()

	f := must(junk.NewFilter(mlog.New("junkplay"), a.params, a.databasePath, a.bloomfilterPath))
	defer f.Close()

	// All messages are read once up front to learn their dates.
	type msg struct {
		dir, filename string
		ham, sent     bool
		t             time.Time
	}
	var msgs []msg

	var nbad, nnodate, nham, nspam, nsent int

	// scanDir adds the messages in dir to msgs. Messages that cannot be parsed
	// or that have a zero envelope date are counted and skipped.
	scanDir := func(dir string, ham, sent bool) {
		for _, name := range listDir(dir) {
			path := dir + "/" + name
			mf, err := os.Open(path)
			xcheckf(err, "open %q", path)
			fi, err := mf.Stat()
			xcheckf(err, "stat %q", path)
			p, err := message.EnsurePart(mf, fi.Size())
			// Below, only fields of the parsed part are read, so the file can be
			// closed right away on all paths.
			mf.Close()
			switch {
			case err != nil:
				nbad++
				continue
			case p.Envelope.Date.IsZero():
				nnodate++
				continue
			}

			msgs = append(msgs, msg{dir: dir, filename: name, ham: ham, sent: sent, t: p.Envelope.Date})
			switch {
			case sent:
				nsent++
			case ham:
				nham++
			default:
				nspam++
			}
		}
	}

	hamDir, spamDir := args[0], args[1]
	scanDir(hamDir, true, false)
	scanDir(spamDir, false, false)
	if a.sentDir != "" {
		scanDir(a.sentDir, true, true)
	}

	// Earliest message first.
	sort.Slice(msgs, func(i, j int) bool {
		return msgs[i].t.Before(msgs[j].t)
	})

	// Each message is handled as if it just came in: its spaminess is predicted
	// and compared with the known answer, and then the filter is trained with it.
	var nhamok, nhambad, nspamok, nspambad int

	play := func(m msg) {
		path := m.dir + "/" + m.filename
		var words map[string]struct{}

		if m.sent {
			// Sent messages are not classified, only parsed into words for training.
			mf, err := os.Open(path)
			xcheckf(err, "open %q", path)
			defer mf.Close()
			fi, err := mf.Stat()
			xcheckf(err, "stat %q", path)
			p, err := message.EnsurePart(mf, fi.Size())
			if err != nil {
				log.Printf("bad sent message %q: %s", path, err)
				return
			}

			if words, err = f.ParseMessage(p); err != nil {
				log.Printf("bad sent message %q: %s", path, err)
				return
			}
		} else {
			prob, w, _, _, err := f.ClassifyMessagePath(path)
			if err != nil {
				nbad++
				return
			}
			words = w

			// A probability exactly at the threshold counts as wrong for both ham
			// and spam.
			switch {
			case m.ham && prob < a.spamThreshold:
				nhamok++
			case m.ham:
				nhambad++
			case prob > a.spamThreshold:
				nspamok++
			default:
				nspambad++
			}
		}

		if err := f.Train(m.ham, words); err != nil {
			log.Printf("train: %s", err)
		}
	}

	for i := range msgs {
		play(msgs[i])
	}

	err := f.Save()
	xcheckf(err, "saving filter")

	hamTotal := nhamok + nhambad
	spamTotal := nspamok + nspambad
	fmt.Printf("completed, nham %d, nsent %d, nspam %d, nbad %d, nwithoutdate %d\n", nham, nsent, nspam, nbad, nnodate)
	fmt.Printf("total ham, ok %d, bad %d\n", nhamok, nhambad)
	fmt.Printf("total spam, ok %d, bad %d\n", nspamok, nspambad)
	fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/float64(hamTotal))
	fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/float64(spamTotal))
	fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(hamTotal+spamTotal))
}