mox/junk/filter_test.go
Mechiel Lukkien 6aa2139a54
Do not use results from the junk filter if we have fewer than 50 positive classifications to base the decision on.
Useful for new accounts. We don't want to start rejecting incoming messages for
having a score near 0.5 because of too little training material. We err on the
side of allowing messages in. The user will mark them as junk, training the
filter. Once enough non-junk has come in, we'll start the actual filtering.

for issue #64 by x8x, and i've also seen this concern on matrix
2025-01-23 22:55:50 +01:00

204 lines
5.7 KiB
Go

package junk
import (
"context"
"fmt"
"math"
"os"
"path/filepath"
"testing"
"github.com/mjl-/mox/mlog"
)
// ctxbg is the background context passed to the filter calls in the tests below.
var ctxbg = context.Background()
// tcheck aborts the running test with msg followed by the error text when err
// is non-nil; a nil err is a no-op. It is marked as a test helper so the
// failure is attributed to the caller's line rather than to this function.
func tcheck(t *testing.T, err error, msg string) {
	t.Helper()
	if err == nil {
		return
	}
	t.Fatalf("%s: %s", msg, err)
}
// tlistdir returns the names of all entries in directory name, in the order
// os.ReadDir reports them. A failure to read the directory fails the test.
func tlistdir(t *testing.T, name string) []string {
	t.Helper()
	entries, err := os.ReadDir(name)
	tcheck(t, err, "readdir")
	names := make([]string, 0, len(entries))
	for _, entry := range entries {
		names = append(names, entry.Name())
	}
	return names
}
// TestFilter exercises the junk filter in two phases.
//
// Phase one creates a fresh filter, closes and reopens it, trains it on up to
// 100 ham and 100 spam messages from testdata/train, and checks that the first
// ham and spam message are classified with probability <= 0.1 and >= 0.9
// respectively. If either training directory is empty, the test stops early
// after printing a hint; developers must supply their own example messages.
//
// Phase two starts over with an empty filter, trains a single ham and a single
// spam message twice each, verifies the modified flags before and after Save,
// checks classification, then untrains the messages and checks that both are
// classified near 0.5 again.
func TestFilter(t *testing.T) {
	log := mlog.New("junk", nil)
	params := Params{
		Onegrams:    true,
		Twograms:    true,
		Threegrams:  false,
		MaxPower:    0.1,
		TopWords:    10,
		IgnoreWords: 0.1,
		RareWords:   1,
	}
	dbPath := filepath.FromSlash("../testdata/junk/filter.db")
	bloomPath := filepath.FromSlash("../testdata/junk/filter.bloom")
	// Start from a clean slate. The files may not exist yet, so removal errors
	// are deliberately ignored.
	os.Remove(dbPath)
	os.Remove(bloomPath)
	f, err := NewFilter(ctxbg, log, params, dbPath, bloomPath)
	tcheck(t, err, "new filter")
	err = f.Close()
	tcheck(t, err, "close filter")
	f, err = OpenFilter(ctxbg, log, params, dbPath, bloomPath, true)
	tcheck(t, err, "open filter")

	// Ensure these dirs exist. Developers should bring their own ham/spam example
	// emails.
	hamdir := filepath.FromSlash("../testdata/train/ham")
	spamdir := filepath.FromSlash("../testdata/train/spam")
	err = os.MkdirAll(hamdir, 0770)
	tcheck(t, err, "mkdir ham dir")
	err = os.MkdirAll(spamdir, 0770)
	tcheck(t, err, "mkdir spam dir")

	// Use at most 100 messages of each kind.
	hamfiles := tlistdir(t, hamdir)
	if len(hamfiles) > 100 {
		hamfiles = hamfiles[:100]
	}
	spamfiles := tlistdir(t, spamdir)
	if len(spamfiles) > 100 {
		spamfiles = spamfiles[:100]
	}
	err = f.TrainDirs(hamdir, "", spamdir, hamfiles, nil, spamfiles)
	tcheck(t, err, "train dirs")
	if len(hamfiles) == 0 || len(spamfiles) == 0 {
		fmt.Println("not training, no ham and/or spam messages, add them to testdata/train/ham and testdata/train/spam")
		// Close the filter before bailing out, so the filter opened above is not
		// leaked on this early-return path.
		err = f.Close()
		tcheck(t, err, "close filter")
		return
	}

	// The first message of each kind was part of the training set, so it must be
	// classified clearly.
	result, err := f.ClassifyMessagePath(ctxbg, filepath.Join(hamdir, hamfiles[0]))
	tcheck(t, err, "classify ham message")
	if result.Probability > 0.1 {
		t.Fatalf("trained ham file has prob %v, expected <= 0.1", result.Probability)
	}
	result, err = f.ClassifyMessagePath(ctxbg, filepath.Join(spamdir, spamfiles[0]))
	tcheck(t, err, "classify spam message")
	if result.Probability < 0.9 {
		t.Fatalf("trained spam file has prob %v, expected >= 0.9", result.Probability)
	}
	err = f.Close()
	tcheck(t, err, "close filter")

	// Start again with empty filter. We'll train a few messages and check they are
	// classified as ham/spam. Then we untrain to see they are no longer classified.
	os.Remove(dbPath)
	os.Remove(bloomPath)
	f, err = NewFilter(ctxbg, log, params, dbPath, bloomPath)
	tcheck(t, err, "new filter")

	hamf, err := os.Open(filepath.Join(hamdir, hamfiles[0]))
	tcheck(t, err, "open hamfile")
	defer hamf.Close()
	hamstat, err := hamf.Stat()
	tcheck(t, err, "stat hamfile")
	hamsize := hamstat.Size()

	spamf, err := os.Open(filepath.Join(spamdir, spamfiles[0]))
	tcheck(t, err, "open spamfile")
	defer spamf.Close()
	spamstat, err := spamf.Stat()
	tcheck(t, err, "stat spamfile")
	spamsize := spamstat.Size()

	// Train each message twice, to prevent single occurrences from being ignored.
	// The files are rewound between uses because each call reads from the current
	// offset.
	err = f.TrainMessage(ctxbg, hamf, hamsize, true)
	tcheck(t, err, "train ham message")
	_, err = hamf.Seek(0, 0)
	tcheck(t, err, "seek ham message")
	err = f.TrainMessage(ctxbg, hamf, hamsize, true)
	tcheck(t, err, "train ham message")

	err = f.TrainMessage(ctxbg, spamf, spamsize, false)
	tcheck(t, err, "train spam message")
	_, err = spamf.Seek(0, 0)
	tcheck(t, err, "seek spam message")
	err = f.TrainMessage(ctxbg, spamf, spamsize, false)
	tcheck(t, err, "train spam message")

	// Training must mark both the filter and its bloom filter as modified, and
	// saving must clear both flags.
	if !f.modified {
		t.Fatalf("filter not modified after training")
	}
	if !f.bloom.Modified() {
		t.Fatalf("bloom filter not modified after training")
	}
	err = f.Save()
	tcheck(t, err, "save filter")
	if f.modified || f.bloom.Modified() {
		t.Fatalf("filter or bloom filter still modified after save")
	}

	// Classify and verify.
	_, err = hamf.Seek(0, 0)
	tcheck(t, err, "seek ham message")
	result, err = f.ClassifyMessageReader(ctxbg, hamf, hamsize)
	tcheck(t, err, "classify ham")
	if result.Probability > 0.1 {
		t.Fatalf("got prob %v, expected <= 0.1", result.Probability)
	}

	_, err = spamf.Seek(0, 0)
	tcheck(t, err, "seek spam message")
	result, err = f.ClassifyMessageReader(ctxbg, spamf, spamsize)
	tcheck(t, err, "classify spam")
	if result.Probability < 0.9 {
		t.Fatalf("got prob %v, expected >= 0.9", result.Probability)
	}

	// Untrain ham & spam. Each message was trained twice, so it is untrained
	// twice as well.
	_, err = hamf.Seek(0, 0)
	tcheck(t, err, "seek ham message")
	err = f.UntrainMessage(ctxbg, hamf, hamsize, true)
	tcheck(t, err, "untrain ham message")
	_, err = hamf.Seek(0, 0)
	tcheck(t, err, "seek ham message")
	err = f.UntrainMessage(ctxbg, hamf, hamsize, true)
	tcheck(t, err, "untrain ham message")

	_, err = spamf.Seek(0, 0)
	tcheck(t, err, "seek spam message")
	err = f.UntrainMessage(ctxbg, spamf, spamsize, false)
	tcheck(t, err, "untrain spam message")
	_, err = spamf.Seek(0, 0)
	tcheck(t, err, "seek spam message")
	err = f.UntrainMessage(ctxbg, spamf, spamsize, false)
	tcheck(t, err, "untrain spam message")

	if !f.modified {
		t.Fatalf("filter not modified after untraining")
	}

	// Classify again, should be unknown.
	_, err = hamf.Seek(0, 0)
	tcheck(t, err, "seek ham message")
	result, err = f.ClassifyMessageReader(ctxbg, hamf, hamsize)
	tcheck(t, err, "classify ham")
	if math.Abs(result.Probability-0.5) > 0.1 {
		t.Fatalf("got prob %v, expected 0.5 +-0.1", result.Probability)
	}

	_, err = spamf.Seek(0, 0)
	tcheck(t, err, "seek spam message")
	result, err = f.ClassifyMessageReader(ctxbg, spamf, spamsize)
	tcheck(t, err, "classify spam")
	if math.Abs(result.Probability-0.5) > 0.1 {
		t.Fatalf("got prob %v, expected 0.5 +-0.1", result.Probability)
	}

	err = f.Close()
	tcheck(t, err, "close filter")
}