mox/junk.go

package main

/*
note: these testdata paths are not in the repo, you should gather some of your
own ham/spam emails.

./mox junk train testdata/train/ham testdata/train/spam
./mox junk train -sent-dir testdata/sent testdata/train/ham testdata/train/spam
./mox junk check 'testdata/check/ham/mail1'
./mox junk test testdata/check/ham testdata/check/spam
./mox junk analyze testdata/train/ham testdata/train/spam
./mox junk analyze -top-words 10 -train-ratio 0.5 -spam-threshold 0.85 -max-power 0.01 -sent-dir testdata/sent testdata/train/ham testdata/train/spam
./mox junk play -top-words 10 -train-ratio 0.5 -spam-threshold 0.85 -max-power 0.01 -sent-dir testdata/sent testdata/train/ham testdata/train/spam
*/

import (
	"flag"
	"fmt"
	"log"
	mathrand "math/rand"
	"os"
	"runtime"
	"runtime/pprof"
	"sort"
	"time"

	"github.com/mjl-/mox/junk"
	"github.com/mjl-/mox/message"
	"github.com/mjl-/mox/mlog"
	"github.com/mjl-/mox/mox-"
)

// junkArgs holds the values of the command-line flags shared by the junk
// subcommands, as registered by junkFlags.
type junkArgs struct {
	// Parameters passed to the junk filter when creating or opening it.
	params junk.Params

	// Files to write a CPU and a heap profile to. Empty means no profile.
	cpuprofile string
	memprofile string

	// Probability at which a message is seen as spam.
	spamThreshold float64

	// Part of the messages used for training, the remainder is used for
	// testing. Only used by the analyze command.
	trainRatio float64

	// Whether to seed the PRNG before analysis.
	seed bool

	// Directory with sent messages to train on. Empty means none.
	sentDir string

	// Paths of the word database and the bloom filter.
	databasePath    string
	bloomfilterPath string

	// Whether to log at debug level.
	debug bool
}

// Memprofile writes a heap profile to the file named by the memprofile flag.
// It does nothing when no file was requested.
func (a junkArgs) Memprofile() {
	if path := a.memprofile; path != "" {
		out, err := os.Create(path)
		xcheckf(err, "creating memory profile")
		defer out.Close()

		// Collect garbage first, so the profile reflects up-to-date statistics.
		runtime.GC()
		xcheckf(pprof.WriteHeapProfile(out), "writing memory profile")
	}
}

// Profile starts CPU profiling when the cpuprofile flag names a file, and
// returns a function that finishes profiling: it stops the CPU profile and
// closes its file (if one was started) and then writes the memory profile
// through Memprofile. Intended use is "defer a.Profile()()".
func (a junkArgs) Profile() func() {
	var cpu *os.File
	if a.cpuprofile != "" {
		var err error
		cpu, err = os.Create(a.cpuprofile)
		xcheckf(err, "creating CPU profile")
		xcheckf(pprof.StartCPUProfile(cpu), "start CPU profile")
	}

	return func() {
		if cpu != nil {
			pprof.StopCPUProfile()
			cpu.Close()
		}
		a.Memprofile()
	}
}

// SetLogLevel sets the default log level (the entry for the empty key in
// mox.Conf.Log) to debug when the debug flag is set and to info otherwise, and
// passes the resulting configuration to mlog.SetConfig.
func (a junkArgs) SetLogLevel() {
	if a.debug {
		mox.Conf.Log[""] = mlog.LevelDebug
	} else {
		mox.Conf.Log[""] = mlog.LevelInfo
	}
	mlog.SetConfig(mox.Conf.Log)
}

// junkFlags registers the flags shared by the junk subcommands on fs and
// returns the struct that receives the parsed values.
//
// A pointer is returned on purpose. The flag package stores parsed values
// through the addresses registered here, and parsing happens after this
// function has returned. Returning the struct by value would hand the caller a
// copy made before parsing, holding only the defaults, so flags given on the
// command line would silently have no effect. Fields and the value-receiver
// methods of junkArgs are accessible through the pointer without changes at
// the call sites.
func junkFlags(fs *flag.FlagSet) *junkArgs {
	a := &junkArgs{}

	// Parameters for the junk filter.
	fs.BoolVar(&a.params.Onegrams, "one-grams", false, "use 1-grams, i.e. single words, for scoring")
	fs.BoolVar(&a.params.Twograms, "two-grams", true, "use 2-grams, i.e. word pairs, for scoring")
	fs.BoolVar(&a.params.Threegrams, "three-grams", false, "use 3-grams, i.e. word triplets, for scoring")
	fs.Float64Var(&a.params.MaxPower, "max-power", 0.05, "maximum word power, e.g. min 0.05/max 0.95")
	fs.Float64Var(&a.params.IgnoreWords, "ignore-words", 0.1, "ignore words with ham/spaminess within this distance from 0.5")
	fs.IntVar(&a.params.TopWords, "top-words", 10, "number of top spam and number of top ham words from email to use")
	fs.IntVar(&a.params.RareWords, "rare-words", 1, "words are rare if encountered this number during training, and skipped for scoring")
	fs.BoolVar(&a.debug, "debug", false, "print debug logging when calculating spam probability")

	// Settings for the commands themselves.
	fs.Float64Var(&a.spamThreshold, "spam-threshold", 0.95, "probability where message is seen as spam")
	fs.Float64Var(&a.trainRatio, "train-ratio", 0.5, "part of data to use for training versus analyzing (for analyze only)")
	fs.StringVar(&a.sentDir, "sent-dir", "", "directory with sent mails, for training")
	fs.BoolVar(&a.seed, "seed", false, "seed prng before analysis")
	fs.StringVar(&a.databasePath, "dbpath", "filter.db", "database file for ham/spam words")
	fs.StringVar(&a.bloomfilterPath, "bloompath", "filter.bloom", "bloom filter for ignoring unique strings")

	// Profiling.
	fs.StringVar(&a.cpuprofile, "cpuprof", "", "store cpu profile to file")
	fs.StringVar(&a.memprofile, "memprof", "", "store mem profile to file")
	return a
}

// listDir returns the names (without directory) of the entries in dir, in the
// order os.ReadDir returns them. An error reading the directory is handed to
// xcheckf.
func listDir(dir string) (l []string) {
	entries, err := os.ReadDir(dir)
	xcheckf(err, "listing directory %q", dir)
	for i := range entries {
		l = append(l, entries[i].Name())
	}
	return
}

// must passes err to xcheckf (with the message "filter") and then returns f.
// It lets the two results of a filter constructor be used in a single
// expression, e.g. must(junk.NewFilter(...)).
func must(f *junk.Filter, err error) *junk.Filter {
	xcheckf(err, "filter")
	return f
}

// cmdJunkTrain creates a new junk filter and trains it with all messages from
// the ham and spam directories given as arguments, and with the messages from
// the optional sent directory.
func cmdJunkTrain(c *cmd) {
	c.unlisted = true
	c.params = "hamdir spamdir"
	c.help = "Train a junk filter with messages from hamdir and spamdir."
	opts := junkFlags(c.flag)
	args := c.Parse()
	if len(args) != 2 {
		c.Usage()
	}
	defer opts.Profile()()
	opts.SetLogLevel()

	filter := must(junk.NewFilter(mlog.New("junktrain"), opts.params, opts.databasePath, opts.bloomfilterPath))
	defer filter.Close()

	hamDir, spamDir := args[0], args[1]
	hamFiles := listDir(hamDir)
	spamFiles := listDir(spamDir)

	// Sent messages are optional, the list stays nil without -sent-dir.
	var sentFiles []string
	if opts.sentDir != "" {
		sentFiles = listDir(opts.sentDir)
	}

	xcheckf(filter.TrainDirs(hamDir, opts.sentDir, spamDir, hamFiles, sentFiles, spamFiles), "train")
}

// cmdJunkCheck opens an existing junk filter, classifies the single message
// file given as argument and prints its spam probability with six decimals.
func cmdJunkCheck(c *cmd) {
	c.unlisted = true
	c.params = "mailfile"
	c.help = "Check an email message against a junk filter, printing the probability of spam on a scale from 0 to 1."
	opts := junkFlags(c.flag)
	args := c.Parse()
	if len(args) != 1 {
		c.Usage()
	}
	defer opts.Profile()()
	opts.SetLogLevel()

	filter := must(junk.OpenFilter(mlog.New("junkcheck"), opts.params, opts.databasePath, opts.bloomfilterPath, false))
	defer filter.Close()

	mailPath := args[0]
	// Only the probability is of interest, the other results are discarded.
	spamProb, _, _, _, err := filter.ClassifyMessagePath(mailPath)
	xcheckf(err, "testing mail")

	fmt.Printf("%.6f\n", spamProb)
}

// cmdJunkTest classifies all messages in a ham and a spam directory with an
// existing junk filter and reports how many were classified correctly.
//
// A ham message counts as ok when its spam probability is below the threshold,
// a spam message when its probability is above the threshold. All other
// successfully classified messages count as bad, and each of those is printed
// with its probability, so the printed list always matches the "bad" totals
// (including messages scoring exactly at the threshold). Messages that cannot
// be classified are logged and left out of the totals.
//
// Note that the ratios are printed as NaN when a directory yields no
// classified messages.
func cmdJunkTest(c *cmd) {
	c.unlisted = true
	c.params = "hamdir spamdir"
	c.help = "Check a directory with hams and one with spams against the junk filter, and report the success ratio."
	a := junkFlags(c.flag)
	args := c.Parse()
	if len(args) != 2 {
		c.Usage()
	}
	defer a.Profile()()
	a.SetLogLevel()

	f := must(junk.OpenFilter(mlog.New("junktest"), a.params, a.databasePath, a.bloomfilterPath, false))
	defer f.Close()

	// testDir classifies all files in dir, which are expected to be ham or spam
	// as indicated by ham, and returns the number of correct and incorrect
	// classifications.
	testDir := func(dir string, ham bool) (ok, bad int) {
		kind := "spam"
		if ham {
			kind = "ham"
		}

		files, err := os.ReadDir(dir)
		xcheckf(err, "readdir %q", dir)
		for _, fi := range files {
			path := dir + "/" + fi.Name()
			prob, _, _, _, err := f.ClassifyMessagePath(path)
			if err != nil {
				log.Printf("classify message %q: %s", path, err)
				continue
			}
			if ham && prob < a.spamThreshold || !ham && prob > a.spamThreshold {
				ok++
				continue
			}
			// Print every misclassified message, so the list is complete.
			bad++
			fmt.Printf("%s %q: %.4f\n", kind, path, prob)
		}
		return ok, bad
	}

	nhamok, nhambad := testDir(args[0], true)
	nspamok, nspambad := testDir(args[1], false)
	fmt.Printf("total ham, ok %d, bad %d\n", nhamok, nhambad)
	fmt.Printf("total spam, ok %d, bad %d\n", nspamok, nspambad)
	fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/(float64(nhamok+nhambad)))
	fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/(float64(nspamok+nspambad)))
	fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(nhamok+nhambad+nspamok+nspambad))
}

// cmdJunkAnalyze trains a new junk filter on a part of the messages in the ham
// and spam directories and tests it on the remaining messages.
//
// The file lists are shuffled before they are split according to -train-ratio.
// The PRNG is seeded with 0 by default, making runs repeatable, or with the
// current time when -seed is given. Messages from -sent-dir, if set, are all
// used for training.
//
// During testing, a ham message counts as ok when its spam probability is below
// the threshold, a spam message when it is above. All other classified messages
// count as bad and are printed. Messages that cannot be classified are counted
// as malformed.
func cmdJunkAnalyze(c *cmd) {
	c.unlisted = true
	c.params = "hamdir spamdir"
	c.help = `Analyze a directory with ham messages and one with spam messages.

A part of the messages is used for training, and remaining for testing. The
messages are shuffled, with optional random seed.`
	a := junkFlags(c.flag)
	args := c.Parse()
	if len(args) != 2 {
		c.Usage()
	}
	// A ratio outside [0, 1] would slice the file lists out of bounds below. The
	// negated form also rejects NaN. Checked before any deferred cleanup is
	// registered, because log.Fatalf does not run deferred calls.
	if !(a.trainRatio >= 0 && a.trainRatio <= 1) {
		log.Fatalf("train-ratio must be between 0 and 1, got %v", a.trainRatio)
	}
	defer a.Profile()()
	a.SetLogLevel()

	f := must(junk.NewFilter(mlog.New("junkanalyze"), a.params, a.databasePath, a.bloomfilterPath))
	defer f.Close()

	hamDir := args[0]
	spamDir := args[1]
	hamFiles := listDir(hamDir)
	spamFiles := listDir(spamDir)

	var seed int64
	if a.seed {
		seed = time.Now().UnixMilli()
	}
	rng := mathrand.New(mathrand.NewSource(seed))

	// Use the library shuffle: swapping each element with an index drawn from
	// the whole slice does not produce all permutations with equal probability.
	shuffle := func(l []string) {
		rng.Shuffle(len(l), func(i, j int) {
			l[i], l[j] = l[j], l[i]
		})
	}

	shuffle(hamFiles)
	shuffle(spamFiles)

	ntrainham := int(a.trainRatio * float64(len(hamFiles)))
	ntrainspam := int(a.trainRatio * float64(len(spamFiles)))

	trainHam := hamFiles[:ntrainham]
	trainSpam := spamFiles[:ntrainspam]
	testHam := hamFiles[ntrainham:]
	testSpam := spamFiles[ntrainspam:]

	var trainSent []string
	if a.sentDir != "" {
		trainSent = listDir(a.sentDir)
	}

	err := f.TrainDirs(hamDir, a.sentDir, spamDir, trainHam, trainSent, trainSpam)
	xcheckf(err, "train")

	// testDir classifies the named files in dir, which are expected to be ham or
	// spam as indicated by ham, and returns the number of correct and incorrect
	// classifications and the number of messages that could not be classified.
	testDir := func(dir string, files []string, ham bool) (ok, bad, malformed int) {
		kind := "spam"
		if ham {
			kind = "ham"
		}

		for _, name := range files {
			path := dir + "/" + name
			prob, _, _, _, err := f.ClassifyMessagePath(path)
			if err != nil {
				// log.Infof("%s: %s", path, err)
				malformed++
				continue
			}
			if ham && prob < a.spamThreshold || !ham && prob > a.spamThreshold {
				ok++
				continue
			}
			// Print every misclassified message, so the list is complete.
			bad++
			fmt.Printf("%s %q: %.4f\n", kind, path, prob)
		}
		return ok, bad, malformed
	}

	nhamok, nhambad, nmalformedham := testDir(hamDir, testHam, true)
	nspamok, nspambad, nmalformedspam := testDir(spamDir, testSpam, false)
	fmt.Printf("training done, nham %d, nsent %d, nspam %d\n", ntrainham, len(trainSent), ntrainspam)
	fmt.Printf("total ham, ok %d, bad %d, malformed %d\n", nhamok, nhambad, nmalformedham)
	fmt.Printf("total spam, ok %d, bad %d, malformed %d\n", nspamok, nspambad, nmalformedspam)
	fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/(float64(nhamok+nhambad)))
	fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/(float64(nspamok+nspambad)))
	fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(nhamok+nhambad+nspamok+nspambad))
}

// cmdJunkPlay replays the messages from the ham, spam and optional sent
// directories in order of their message date against a new junk filter.
//
// Each ham and spam message is first classified, the outcome is compared with
// what the message is known to be, and the filter is then trained with the
// message. Sent messages are only used for training, as ham. Finally the
// filter is saved and the totals are printed.
func cmdJunkPlay(c *cmd) {
	c.unlisted = true
	c.params = "hamdir spamdir"
	c.help = "Play messages from ham and spam directory according to their time of arrival and report on junk filter performance."
	opts := junkFlags(c.flag)
	args := c.Parse()
	if len(args) != 2 {
		c.Usage()
	}
	defer opts.Profile()()
	opts.SetLogLevel()

	filter := must(junk.NewFilter(mlog.New("junkplay"), opts.params, opts.databasePath, opts.bloomfilterPath))
	defer filter.Close()

	// All messages are read once up front, to learn their dates.
	type msg struct {
		dir, filename string
		ham, sent     bool
		t             time.Time
	}
	var (
		msgs                              []msg
		nbad, nnodate, nham, nspam, nsent int
	)

	// scanDir adds the messages in dir that can be parsed and have a date to
	// msgs, and updates the counters.
	scanDir := func(dir string, ham, sent bool) {
		for _, name := range listDir(dir) {
			path := dir + "/" + name
			mf, err := os.Open(path)
			xcheckf(err, "open %q", path)
			fi, err := mf.Stat()
			xcheckf(err, "stat %q", path)
			p, err := message.EnsurePart(mf, fi.Size())
			// Only the already parsed date is used below, the file is no longer needed.
			mf.Close()

			switch {
			case err != nil:
				nbad++
			case p.Envelope.Date.IsZero():
				nnodate++
			default:
				msgs = append(msgs, msg{dir, name, ham, sent, p.Envelope.Date})
				switch {
				case sent:
					nsent++
				case ham:
					nham++
				default:
					nspam++
				}
			}
		}
	}

	hamDir, spamDir := args[0], args[1]
	scanDir(hamDir, true, false)
	scanDir(spamDir, false, false)
	if opts.sentDir != "" {
		scanDir(opts.sentDir, true, true)
	}

	// Earliest message first.
	sort.Slice(msgs, func(i, j int) bool {
		return msgs[i].t.Before(msgs[j].t)
	})

	// Handle the messages as if they are coming in one by one: predict the
	// spaminess, check whether the prediction is right, then train with the
	// message.
	var nhamok, nhambad, nspamok, nspambad int

	play := func(m msg) {
		path := m.dir + "/" + m.filename

		var words map[string]struct{}
		if m.sent {
			// Sent messages are not classified, only parsed for training.
			mf, err := os.Open(path)
			xcheckf(err, "open %q", path)
			defer mf.Close()
			fi, err := mf.Stat()
			xcheckf(err, "stat %q", path)
			p, err := message.EnsurePart(mf, fi.Size())
			if err != nil {
				log.Printf("bad sent message %q: %s", path, err)
				return
			}

			words, err = filter.ParseMessage(p)
			if err != nil {
				log.Printf("bad sent message %q: %s", path, err)
				return
			}
		} else {
			prob, w, _, _, err := filter.ClassifyMessagePath(path)
			if err != nil {
				nbad++
				return
			}
			words = w

			switch {
			case m.ham && prob < opts.spamThreshold:
				nhamok++
			case m.ham:
				nhambad++
			case prob > opts.spamThreshold:
				nspamok++
			default:
				nspambad++
			}
		}

		if err := filter.Train(m.ham, words); err != nil {
			log.Printf("train: %s", err)
		}
	}

	for i := range msgs {
		play(msgs[i])
	}

	xcheckf(filter.Save(), "saving filter")

	fmt.Printf("completed, nham %d, nsent %d, nspam %d, nbad %d, nwithoutdate %d\n", nham, nsent, nspam, nbad, nnodate)
	fmt.Printf("total ham, ok %d, bad %d\n", nhamok, nhambad)
	fmt.Printf("total spam, ok %d, bad %d\n", nspamok, nspambad)
	fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/(float64(nhamok+nhambad)))
	fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/(float64(nspamok+nspambad)))
	fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(nhamok+nhambad+nspamok+nspambad))
}
mox! 2023-01-30 16:27:06 +03:00			`package main`

			`/*`
			`note: these testdata paths are not in the repo, you should gather some of your`
			`own ham/spam emails.`

			`./mox junk train testdata/train/ham testdata/train/spam`
			`./mox junk train -sent-dir testdata/sent testdata/train/ham testdata/train/spam`
			`./mox junk check 'testdata/check/ham/mail1'`
			`./mox junk test testdata/check/ham testdata/check/spam`
			`./mox junk analyze testdata/train/ham testdata/train/spam`
			`./mox junk analyze -top-words 10 -train-ratio 0.5 -spam-threshold 0.85 -max-power 0.01 -sent-dir testdata/sent testdata/train/ham testdata/train/spam`
			`./mox junk play -top-words 10 -train-ratio 0.5 -spam-threshold 0.85 -max-power 0.01 -sent-dir testdata/sent testdata/train/ham testdata/train/spam`
			`*/`

			`import (`
			`"flag"`
			`"fmt"`
			`"log"`
			`mathrand "math/rand"`
			`"os"`
			`"runtime"`
			`"runtime/pprof"`
			`"sort"`
			`"time"`

			`"github.com/mjl-/mox/junk"`
			`"github.com/mjl-/mox/message"`
			`"github.com/mjl-/mox/mlog"`
			`"github.com/mjl-/mox/mox-"`
			`)`

			`type junkArgs struct {`
			`params junk.Params`
			`cpuprofile, memprofile string`
			`spamThreshold float64`
			`trainRatio float64`
			`seed bool`
			`sentDir string`
			`databasePath, bloomfilterPath string`
			`debug bool`
			`}`

			`func (a junkArgs) Memprofile() {`
			`if a.memprofile == "" {`
			`return`
			`}`

			`f, err := os.Create(a.memprofile)`
			`xcheckf(err, "creating memory profile")`
			`defer f.Close()`
			`runtime.GC() // get up-to-date statistics`
			`err = pprof.WriteHeapProfile(f)`
			`xcheckf(err, "writing memory profile")`
			`}`

			`func (a junkArgs) Profile() func() {`
			`if a.cpuprofile == "" {`
			`return func() {`
			`a.Memprofile()`
			`}`
			`}`

			`f, err := os.Create(a.cpuprofile)`
			`xcheckf(err, "creating CPU profile")`
			`err = pprof.StartCPUProfile(f)`
			`xcheckf(err, "start CPU profile")`
			`return func() {`
			`pprof.StopCPUProfile()`
			`f.Close()`
			`a.Memprofile()`
			`}`
			`}`

			`func (a junkArgs) SetLogLevel() {`
			`mox.Conf.Log[""] = mlog.LevelInfo`
			`if a.debug {`
			`mox.Conf.Log[""] = mlog.LevelDebug`
			`}`
			`mlog.SetConfig(mox.Conf.Log)`
			`}`

			`func junkFlags(fs *flag.FlagSet) (a junkArgs) {`
			`fs.BoolVar(&a.params.Onegrams, "one-grams", false, "use 1-grams, i.e. single words, for scoring")`
			`fs.BoolVar(&a.params.Twograms, "two-grams", true, "use 2-grams, i.e. word pairs, for scoring")`
			`fs.BoolVar(&a.params.Threegrams, "three-grams", false, "use 3-grams, i.e. word triplets, for scoring")`
			`fs.Float64Var(&a.params.MaxPower, "max-power", 0.05, "maximum word power, e.g. min 0.05/max 0.95")`
			`fs.Float64Var(&a.params.IgnoreWords, "ignore-words", 0.1, "ignore words with ham/spaminess within this distance from 0.5")`
			`fs.IntVar(&a.params.TopWords, "top-words", 10, "number of top spam and number of top ham words from email to use")`
			`fs.IntVar(&a.params.RareWords, "rare-words", 1, "words are rare if encountered this number during training, and skipped for scoring")`
			`fs.BoolVar(&a.debug, "debug", false, "print debug logging when calculating spam probability")`

			`fs.Float64Var(&a.spamThreshold, "spam-threshold", 0.95, "probability where message is seen as spam")`
			`fs.Float64Var(&a.trainRatio, "train-ratio", 0.5, "part of data to use for training versus analyzing (for analyze only)")`
			`fs.StringVar(&a.sentDir, "sent-dir", "", "directory with sent mails, for training")`
			`fs.BoolVar(&a.seed, "seed", false, "seed prng before analysis")`
			`fs.StringVar(&a.databasePath, "dbpath", "filter.db", "database file for ham/spam words")`
			`fs.StringVar(&a.bloomfilterPath, "bloompath", "filter.bloom", "bloom filter for ignoring unique strings")`

			`fs.StringVar(&a.cpuprofile, "cpuprof", "", "store cpu profile to file")`
			`fs.StringVar(&a.memprofile, "memprof", "", "store mem profile to file")`
			`return`
			`}`

			`func listDir(dir string) (l []string) {`
			`files, err := os.ReadDir(dir)`
			`xcheckf(err, "listing directory %q", dir)`
			`for _, f := range files {`
			`l = append(l, f.Name())`
			`}`
			`return l`
			`}`

			`func must(f junk.Filter, err error) junk.Filter {`
			`xcheckf(err, "filter")`
			`return f`
			`}`

			`func cmdJunkTrain(c *cmd) {`
			`c.unlisted = true`
			`c.params = "hamdir spamdir"`
			`c.help = "Train a junk filter with messages from hamdir and spamdir."`
			`a := junkFlags(c.flag)`
			`args := c.Parse()`
			`if len(args) != 2 {`
			`c.Usage()`
			`}`
			`defer a.Profile()()`
			`a.SetLogLevel()`

			`f := must(junk.NewFilter(mlog.New("junktrain"), a.params, a.databasePath, a.bloomfilterPath))`
			`defer f.Close()`

			`hamFiles := listDir(args[0])`
			`spamFiles := listDir(args[1])`
			`var sentFiles []string`
			`if a.sentDir != "" {`
			`sentFiles = listDir(a.sentDir)`
			`}`

			`err := f.TrainDirs(args[0], a.sentDir, args[1], hamFiles, sentFiles, spamFiles)`
			`xcheckf(err, "train")`
			`}`

			`func cmdJunkCheck(c *cmd) {`
			`c.unlisted = true`
			`c.params = "mailfile"`
			`c.help = "Check an email message against a junk filter, printing the probability of spam on a scale from 0 to 1."`
			`a := junkFlags(c.flag)`
			`args := c.Parse()`
			`if len(args) != 1 {`
			`c.Usage()`
			`}`
			`defer a.Profile()()`
			`a.SetLogLevel()`

			`f := must(junk.OpenFilter(mlog.New("junkcheck"), a.params, a.databasePath, a.bloomfilterPath, false))`
			`defer f.Close()`

			`prob, _, _, _, err := f.ClassifyMessagePath(args[0])`
			`xcheckf(err, "testing mail")`

			`fmt.Printf("%.6f\n", prob)`
			`}`

			`func cmdJunkTest(c *cmd) {`
			`c.unlisted = true`
			`c.params = "hamdir spamdir"`
			`c.help = "Check a directory with hams and one with spams against the junk filter, and report the success ratio."`
			`a := junkFlags(c.flag)`
			`args := c.Parse()`
			`if len(args) != 2 {`
			`c.Usage()`
			`}`
			`defer a.Profile()()`
			`a.SetLogLevel()`

			`f := must(junk.OpenFilter(mlog.New("junktest"), a.params, a.databasePath, a.bloomfilterPath, false))`
			`defer f.Close()`

			`testDir := func(dir string, ham bool) (int, int) {`
			`ok, bad := 0, 0`
			`files, err := os.ReadDir(dir)`
			`xcheckf(err, "readdir %q", dir)`
			`for _, fi := range files {`
			`path := dir + "/" + fi.Name()`
			`prob, _, _, _, err := f.ClassifyMessagePath(path)`
			`if err != nil {`
			`log.Printf("classify message %q: %s", path, err)`
			`continue`
			`}`
			`if ham && prob < a.spamThreshold \|\| !ham && prob > a.spamThreshold {`
			`ok++`
			`} else {`
			`bad++`
			`}`
			`if ham && prob > a.spamThreshold {`
			`fmt.Printf("ham %q: %.4f\n", path, prob)`
			`}`
			`if !ham && prob < a.spamThreshold {`
			`fmt.Printf("spam %q: %.4f\n", path, prob)`
			`}`
			`}`
			`return ok, bad`
			`}`

			`nhamok, nhambad := testDir(args[0], true)`
			`nspamok, nspambad := testDir(args[1], false)`
			`fmt.Printf("total ham, ok %d, bad %d\n", nhamok, nhambad)`
			`fmt.Printf("total spam, ok %d, bad %d\n", nspamok, nspambad)`
			`fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/(float64(nhamok+nhambad)))`
			`fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/(float64(nspamok+nspambad)))`
			`fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(nhamok+nhambad+nspamok+nspambad))`
			`}`

			`func cmdJunkAnalyze(c *cmd) {`
			`c.unlisted = true`
			`c.params = "hamdir spamdir"`
			c.help = `Analyze a directory with ham messages and one with spam messages.

			`A part of the messages is used for training, and remaining for testing. The`
			messages are shuffled, with optional random seed.`
			`a := junkFlags(c.flag)`
			`args := c.Parse()`
			`if len(args) != 2 {`
			`c.Usage()`
			`}`
			`defer a.Profile()()`
			`a.SetLogLevel()`

			`f := must(junk.NewFilter(mlog.New("junkanalyze"), a.params, a.databasePath, a.bloomfilterPath))`
			`defer f.Close()`

			`hamDir := args[0]`
			`spamDir := args[1]`
			`hamFiles := listDir(hamDir)`
			`spamFiles := listDir(spamDir)`

			`var rand *mathrand.Rand`
			`if a.seed {`
			`rand = mathrand.New(mathrand.NewSource(time.Now().UnixMilli()))`
			`} else {`
			`rand = mathrand.New(mathrand.NewSource(0))`
			`}`

			`shuffle := func(l []string) {`
			`count := len(l)`
			`for i := range l {`
			`n := rand.Intn(count)`
			`l[i], l[n] = l[n], l[i]`
			`}`
			`}`

			`shuffle(hamFiles)`
			`shuffle(spamFiles)`

			`ntrainham := int(a.trainRatio * float64(len(hamFiles)))`
			`ntrainspam := int(a.trainRatio * float64(len(spamFiles)))`

			`trainHam := hamFiles[:ntrainham]`
			`trainSpam := spamFiles[:ntrainspam]`
			`testHam := hamFiles[ntrainham:]`
			`testSpam := spamFiles[ntrainspam:]`

			`var trainSent []string`
			`if a.sentDir != "" {`
			`trainSent = listDir(a.sentDir)`
			`}`

			`err := f.TrainDirs(hamDir, a.sentDir, spamDir, trainHam, trainSent, trainSpam)`
			`xcheckf(err, "train")`

			`testDir := func(dir string, files []string, ham bool) (ok, bad, malformed int) {`
			`for _, name := range files {`
			`path := dir + "/" + name`
			`prob, _, _, _, err := f.ClassifyMessagePath(path)`
			`if err != nil {`
			`// log.Infof("%s: %s", path, err)`
			`malformed++`
			`continue`
			`}`
			`if ham && prob < a.spamThreshold \|\| !ham && prob > a.spamThreshold {`
			`ok++`
			`} else {`
			`bad++`
			`}`
			`if ham && prob > a.spamThreshold {`
			`fmt.Printf("ham %q: %.4f\n", path, prob)`
			`}`
			`if !ham && prob < a.spamThreshold {`
			`fmt.Printf("spam %q: %.4f\n", path, prob)`
			`}`
			`}`
			`return`
			`}`

			`nhamok, nhambad, nmalformedham := testDir(args[0], testHam, true)`
			`nspamok, nspambad, nmalformedspam := testDir(args[1], testSpam, false)`
			`fmt.Printf("training done, nham %d, nsent %d, nspam %d\n", ntrainham, len(trainSent), ntrainspam)`
			`fmt.Printf("total ham, ok %d, bad %d, malformed %d\n", nhamok, nhambad, nmalformedham)`
			`fmt.Printf("total spam, ok %d, bad %d, malformed %d\n", nspamok, nspambad, nmalformedspam)`
			`fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/(float64(nhamok+nhambad)))`
			`fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/(float64(nspamok+nspambad)))`
			`fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(nhamok+nhambad+nspamok+nspambad))`
			`}`

			`func cmdJunkPlay(c *cmd) {`
			`c.unlisted = true`
			`c.params = "hamdir spamdir"`
			`c.help = "Play messages from ham and spam directory according to their time of arrival and report on junk filter performance."`
			`a := junkFlags(c.flag)`
			`args := c.Parse()`
			`if len(args) != 2 {`
			`c.Usage()`
			`}`
			`defer a.Profile()()`
			`a.SetLogLevel()`

			`f := must(junk.NewFilter(mlog.New("junkplay"), a.params, a.databasePath, a.bloomfilterPath))`
			`defer f.Close()`

			`// We'll go through all emails to find their dates.`
			`type msg struct {`
			`dir, filename string`
			`ham, sent bool`
			`t time.Time`
			`}`
			`var msgs []msg`

			`var nbad, nnodate, nham, nspam, nsent int`

			`scanDir := func(dir string, ham, sent bool) {`
			`for _, name := range listDir(dir) {`
			`path := dir + "/" + name`
			`mf, err := os.Open(path)`
			`xcheckf(err, "open %q", path)`
			`fi, err := mf.Stat()`
			`xcheckf(err, "stat %q", path)`
			`p, err := message.EnsurePart(mf, fi.Size())`
			`if err != nil {`
			`nbad++`
			`mf.Close()`
			`continue`
			`}`
			`if p.Envelope.Date.IsZero() {`
			`nnodate++`
			`mf.Close()`
			`continue`
			`}`
			`mf.Close()`
			`msgs = append(msgs, msg{dir, name, ham, sent, p.Envelope.Date})`
			`if sent {`
			`nsent++`
			`} else if ham {`
			`nham++`
			`} else {`
			`nspam++`
			`}`
			`}`
			`}`

			`hamDir := args[0]`
			`spamDir := args[1]`
			`scanDir(hamDir, true, false)`
			`scanDir(spamDir, false, false)`
			`if a.sentDir != "" {`
			`scanDir(a.sentDir, true, true)`
			`}`

			`// Sort the messages, earliest first.`
			`sort.Slice(msgs, func(i, j int) bool {`
			`return msgs[i].t.Before(msgs[j].t)`
			`})`

			`// Play all messages as if they are coming in. We predict their spaminess, check if`
			`// we are right. And we train the system with the result.`
			`var nhamok, nhambad, nspamok, nspambad int`

			`play := func(msg msg) {`
			`var words map[string]struct{}`
			`path := msg.dir + "/" + msg.filename`
			`if !msg.sent {`
			`var prob float64`
			`var err error`
			`prob, words, _, _, err = f.ClassifyMessagePath(path)`
			`if err != nil {`
			`nbad++`
			`return`
			`}`
			`if msg.ham {`
			`if prob < a.spamThreshold {`
			`nhamok++`
			`} else {`
			`nhambad++`
			`}`
			`} else {`
			`if prob > a.spamThreshold {`
			`nspamok++`
			`} else {`
			`nspambad++`
			`}`
			`}`
			`} else {`
			`mf, err := os.Open(path)`
			`xcheckf(err, "open %q", path)`
			`defer mf.Close()`
			`fi, err := mf.Stat()`
			`xcheckf(err, "stat %q", path)`
			`p, err := message.EnsurePart(mf, fi.Size())`
			`if err != nil {`
			`log.Printf("bad sent message %q: %s", path, err)`
			`return`
			`}`

			`words, err = f.ParseMessage(p)`
			`if err != nil {`
			`log.Printf("bad sent message %q: %s", path, err)`
			`return`
			`}`
			`}`

			`if err := f.Train(msg.ham, words); err != nil {`
			`log.Printf("train: %s", err)`
			`}`
			`}`

			`for _, m := range msgs {`
			`play(m)`
			`}`

			`err := f.Save()`
			`xcheckf(err, "saving filter")`

			`fmt.Printf("completed, nham %d, nsent %d, nspam %d, nbad %d, nwithoutdate %d\n", nham, nsent, nspam, nbad, nnodate)`
			`fmt.Printf("total ham, ok %d, bad %d\n", nhamok, nhambad)`
			`fmt.Printf("total spam, ok %d, bad %d\n", nspamok, nspambad)`
			`fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/(float64(nhamok+nhambad)))`
			`fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/(float64(nspamok+nspambad)))`
			`fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(nhamok+nhambad+nspamok+nspambad))`
			`}`