2023-01-30 16:27:06 +03:00
|
|
|
package junk
|
|
|
|
|
|
|
|
import (
|
2023-05-22 15:40:36 +03:00
|
|
|
"context"
|
2023-01-30 16:27:06 +03:00
|
|
|
"fmt"
|
|
|
|
"math"
|
|
|
|
"os"
|
|
|
|
"path/filepath"
|
|
|
|
"testing"
|
|
|
|
|
|
|
|
"github.com/mjl-/mox/mlog"
|
|
|
|
)
|
|
|
|
|
2023-05-22 15:40:36 +03:00
|
|
|
// ctxbg is a background context, passed to the filter calls in the tests below.
var ctxbg = context.Background()
|
|
|
|
|
2023-01-30 16:27:06 +03:00
|
|
|
// tcheck stops the test immediately when err is non-nil, reporting msg
// followed by the error text. It is marked as a test helper so the failure is
// attributed to the calling line.
func tcheck(t *testing.T, err error, msg string) {
	t.Helper()
	if err == nil {
		return
	}
	t.Fatalf("%s: %s", msg, err)
}
|
|
|
|
|
|
|
|
func tlistdir(t *testing.T, name string) []string {
|
|
|
|
t.Helper()
|
|
|
|
l, err := os.ReadDir(name)
|
|
|
|
tcheck(t, err, "readdir")
|
|
|
|
names := make([]string, len(l))
|
|
|
|
for i, e := range l {
|
|
|
|
names[i] = e.Name()
|
|
|
|
}
|
|
|
|
return names
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestFilter(t *testing.T) {
|
2023-12-05 15:35:58 +03:00
|
|
|
log := mlog.New("junk", nil)
|
2023-01-30 16:27:06 +03:00
|
|
|
params := Params{
|
|
|
|
Onegrams: true,
|
|
|
|
Twograms: true,
|
|
|
|
Threegrams: false,
|
|
|
|
MaxPower: 0.1,
|
|
|
|
TopWords: 10,
|
|
|
|
IgnoreWords: 0.1,
|
|
|
|
RareWords: 1,
|
|
|
|
}
|
make mox compile on windows, without "mox serve" but with working "mox localserve"
getting mox to compile required changing code in only a few places where
package "syscall" was used: for accessing file access times and for umask
handling. an open problem is how to start a process as an unprivileged user on
windows. that's why "mox serve" isn't implemented yet. and just finding a way
to implement it now may not be good enough in the near future: we may want to
starting using a more complete privilege separation approach, with a process
handling sensitive tasks (handling private keys, authentication), where we may
want to pass file descriptors between processes. how would that work on
windows?
anyway, getting mox to compile for windows doesn't mean it works properly on
windows. the largest issue: mox would normally open a file, rename or remove
it, and finally close it. this happens during message delivery. that doesn't
work on windows, the rename/remove would fail because the file is still open.
so this commit swaps many "remove" and "close" calls. renames are a longer
story: message delivery had two ways to deliver: with "consuming" the
(temporary) message file (which would rename it to its final destination), and
without consuming (by hardlinking the file, falling back to copying). the last
delivery to a recipient of a message (and the only one in the common case of a
single recipient) would consume the message, and the earlier recipients would
not. during delivery, the already open message file was used, to parse the
message. we still want to use that open message file, and the caller now stays
responsible for closing it, but we no longer try to rename (consume) the file.
we always hardlink (or copy) during delivery (this works on windows), and the
caller is responsible for closing and removing (in that order) the original
temporary file. this does cost one syscall more. but it makes the delivery code
(responsibilities) a bit simpler.
there is one more obvious issue: the file system path separator. mox already
used the "filepath" package to join paths in many places, but not everywhere.
and it still used strings with slashes for local file access. with this commit,
the code now uses filepath.FromSlash for path strings with slashes, uses
"filepath" in a few more places where it previously didn't. also switches from
"filepath" to regular "path" package when handling mailbox names in a few
places, because those always use forward slashes, regardless of local file
system conventions. windows can handle forward slashes when opening files, so
test code that passes path strings with forward slashes straight to go stdlib
file i/o functions are left unchanged to reduce code churn. the regular
non-test code, or test code that uses path strings in places other than
standard i/o functions, does have the paths converted for consistent paths
(otherwise we would end up with paths with mixed forward/backward slashes in
log messages).
windows cannot dup a listening socket. for "mox localserve", it isn't
important, and we can work around the issue. the current approach for "mox
serve" (forking a process and passing file descriptors of listening sockets on
"privileged" ports) won't work on windows. perhaps it isn't needed on windows,
and any user can listen on "privileged" ports? that would be welcome.
on windows, os.Open cannot open a directory, so we cannot call Sync on it after
message delivery. a cursory internet search indicates that directories cannot
be synced on windows. the story is probably much more nuanced than that, with
long deep technical details/discussions/disagreement/confusion, like on unix.
for "mox localserve" we can get away with making syncdir a no-op.
2023-10-14 11:54:07 +03:00
|
|
|
dbPath := filepath.FromSlash("../testdata/junk/filter.db")
|
|
|
|
bloomPath := filepath.FromSlash("../testdata/junk/filter.bloom")
|
2023-01-30 16:27:06 +03:00
|
|
|
os.Remove(dbPath)
|
|
|
|
os.Remove(bloomPath)
|
2023-05-22 15:40:36 +03:00
|
|
|
f, err := NewFilter(ctxbg, log, params, dbPath, bloomPath)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "new filter")
|
|
|
|
err = f.Close()
|
|
|
|
tcheck(t, err, "close filter")
|
|
|
|
|
2023-05-22 15:40:36 +03:00
|
|
|
f, err = OpenFilter(ctxbg, log, params, dbPath, bloomPath, true)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "open filter")
|
|
|
|
|
|
|
|
// Ensure these dirs exist. Developers should bring their own ham/spam example
|
|
|
|
// emails.
|
|
|
|
os.MkdirAll("../testdata/train/ham", 0770)
|
|
|
|
os.MkdirAll("../testdata/train/spam", 0770)
|
|
|
|
|
make mox compile on windows, without "mox serve" but with working "mox localserve"
getting mox to compile required changing code in only a few places where
package "syscall" was used: for accessing file access times and for umask
handling. an open problem is how to start a process as an unprivileged user on
windows. that's why "mox serve" isn't implemented yet. and just finding a way
to implement it now may not be good enough in the near future: we may want to
starting using a more complete privilege separation approach, with a process
handling sensitive tasks (handling private keys, authentication), where we may
want to pass file descriptors between processes. how would that work on
windows?
anyway, getting mox to compile for windows doesn't mean it works properly on
windows. the largest issue: mox would normally open a file, rename or remove
it, and finally close it. this happens during message delivery. that doesn't
work on windows, the rename/remove would fail because the file is still open.
so this commit swaps many "remove" and "close" calls. renames are a longer
story: message delivery had two ways to deliver: with "consuming" the
(temporary) message file (which would rename it to its final destination), and
without consuming (by hardlinking the file, falling back to copying). the last
delivery to a recipient of a message (and the only one in the common case of a
single recipient) would consume the message, and the earlier recipients would
not. during delivery, the already open message file was used, to parse the
message. we still want to use that open message file, and the caller now stays
responsible for closing it, but we no longer try to rename (consume) the file.
we always hardlink (or copy) during delivery (this works on windows), and the
caller is responsible for closing and removing (in that order) the original
temporary file. this does cost one syscall more. but it makes the delivery code
(responsibilities) a bit simpler.
there is one more obvious issue: the file system path separator. mox already
used the "filepath" package to join paths in many places, but not everywhere.
and it still used strings with slashes for local file access. with this commit,
the code now uses filepath.FromSlash for path strings with slashes, uses
"filepath" in a few more places where it previously didn't. also switches from
"filepath" to regular "path" package when handling mailbox names in a few
places, because those always use forward slashes, regardless of local file
system conventions. windows can handle forward slashes when opening files, so
test code that passes path strings with forward slashes straight to go stdlib
file i/o functions are left unchanged to reduce code churn. the regular
non-test code, or test code that uses path strings in places other than
standard i/o functions, does have the paths converted for consistent paths
(otherwise we would end up with paths with mixed forward/backward slashes in
log messages).
windows cannot dup a listening socket. for "mox localserve", it isn't
important, and we can work around the issue. the current approach for "mox
serve" (forking a process and passing file descriptors of listening sockets on
"privileged" ports) won't work on windows. perhaps it isn't needed on windows,
and any user can listen on "privileged" ports? that would be welcome.
on windows, os.Open cannot open a directory, so we cannot call Sync on it after
message delivery. a cursory internet search indicates that directories cannot
be synced on windows. the story is probably much more nuanced than that, with
long deep technical details/discussions/disagreement/confusion, like on unix.
for "mox localserve" we can get away with making syncdir a no-op.
2023-10-14 11:54:07 +03:00
|
|
|
hamdir := filepath.FromSlash("../testdata/train/ham")
|
|
|
|
spamdir := filepath.FromSlash("../testdata/train/spam")
|
2023-01-30 16:27:06 +03:00
|
|
|
hamfiles := tlistdir(t, hamdir)
|
|
|
|
if len(hamfiles) > 100 {
|
|
|
|
hamfiles = hamfiles[:100]
|
|
|
|
}
|
|
|
|
spamfiles := tlistdir(t, spamdir)
|
|
|
|
if len(spamfiles) > 100 {
|
|
|
|
spamfiles = spamfiles[:100]
|
|
|
|
}
|
|
|
|
|
|
|
|
err = f.TrainDirs(hamdir, "", spamdir, hamfiles, nil, spamfiles)
|
|
|
|
tcheck(t, err, "train dirs")
|
|
|
|
|
|
|
|
if len(hamfiles) == 0 || len(spamfiles) == 0 {
|
|
|
|
fmt.Println("not training, no ham and/or spam messages, add them to testdata/train/ham and testdata/train/spam")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2023-05-22 15:40:36 +03:00
|
|
|
prob, _, _, _, err := f.ClassifyMessagePath(ctxbg, filepath.Join(hamdir, hamfiles[0]))
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "classify ham message")
|
|
|
|
if prob > 0.1 {
|
|
|
|
t.Fatalf("trained ham file has prob %v, expected <= 0.1", prob)
|
|
|
|
}
|
|
|
|
|
2023-05-22 15:40:36 +03:00
|
|
|
prob, _, _, _, err = f.ClassifyMessagePath(ctxbg, filepath.Join(spamdir, spamfiles[0]))
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "classify spam message")
|
|
|
|
if prob < 0.9 {
|
|
|
|
t.Fatalf("trained spam file has prob %v, expected > 0.9", prob)
|
|
|
|
}
|
|
|
|
|
|
|
|
err = f.Close()
|
|
|
|
tcheck(t, err, "close filter")
|
|
|
|
|
|
|
|
// Start again with empty filter. We'll train a few messages and check they are
|
|
|
|
// classified as ham/spam. Then we untrain to see they are no longer classified.
|
|
|
|
os.Remove(dbPath)
|
|
|
|
os.Remove(bloomPath)
|
2023-05-22 15:40:36 +03:00
|
|
|
f, err = NewFilter(ctxbg, log, params, dbPath, bloomPath)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "open filter")
|
|
|
|
|
|
|
|
hamf, err := os.Open(filepath.Join(hamdir, hamfiles[0]))
|
|
|
|
tcheck(t, err, "open hamfile")
|
|
|
|
defer hamf.Close()
|
|
|
|
hamstat, err := hamf.Stat()
|
|
|
|
tcheck(t, err, "stat hamfile")
|
|
|
|
hamsize := hamstat.Size()
|
|
|
|
|
|
|
|
spamf, err := os.Open(filepath.Join(spamdir, spamfiles[0]))
|
|
|
|
tcheck(t, err, "open spamfile")
|
|
|
|
defer spamf.Close()
|
|
|
|
spamstat, err := spamf.Stat()
|
|
|
|
tcheck(t, err, "stat spamfile")
|
|
|
|
spamsize := spamstat.Size()
|
|
|
|
|
|
|
|
// Train each message twice, to prevent single occurrences from being ignored.
|
2023-05-22 15:40:36 +03:00
|
|
|
err = f.TrainMessage(ctxbg, hamf, hamsize, true)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "train ham message")
|
|
|
|
_, err = hamf.Seek(0, 0)
|
|
|
|
tcheck(t, err, "seek ham message")
|
2023-05-22 15:40:36 +03:00
|
|
|
err = f.TrainMessage(ctxbg, hamf, hamsize, true)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "train ham message")
|
|
|
|
|
2023-05-22 15:40:36 +03:00
|
|
|
err = f.TrainMessage(ctxbg, spamf, spamsize, false)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "train spam message")
|
|
|
|
_, err = spamf.Seek(0, 0)
|
|
|
|
tcheck(t, err, "seek spam message")
|
2023-05-22 15:40:36 +03:00
|
|
|
err = f.TrainMessage(ctxbg, spamf, spamsize, true)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "train spam message")
|
|
|
|
|
|
|
|
if !f.modified {
|
|
|
|
t.Fatalf("filter not modified after training")
|
|
|
|
}
|
|
|
|
if !f.bloom.Modified() {
|
|
|
|
t.Fatalf("bloom filter not modified after training")
|
|
|
|
}
|
|
|
|
|
|
|
|
err = f.Save()
|
|
|
|
tcheck(t, err, "save filter")
|
|
|
|
if f.modified || f.bloom.Modified() {
|
|
|
|
t.Fatalf("filter or bloom filter still modified after save")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Classify and verify.
|
|
|
|
_, err = hamf.Seek(0, 0)
|
|
|
|
tcheck(t, err, "seek ham message")
|
2023-05-22 15:40:36 +03:00
|
|
|
prob, _, _, _, err = f.ClassifyMessageReader(ctxbg, hamf, hamsize)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "classify ham")
|
|
|
|
if prob > 0.1 {
|
|
|
|
t.Fatalf("got prob %v, expected <= 0.1", prob)
|
|
|
|
}
|
|
|
|
|
|
|
|
_, err = spamf.Seek(0, 0)
|
|
|
|
tcheck(t, err, "seek spam message")
|
2023-05-22 15:40:36 +03:00
|
|
|
prob, _, _, _, err = f.ClassifyMessageReader(ctxbg, spamf, spamsize)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "classify spam")
|
|
|
|
if prob < 0.9 {
|
|
|
|
t.Fatalf("got prob %v, expected >= 0.9", prob)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Untrain ham & spam.
|
|
|
|
_, err = hamf.Seek(0, 0)
|
|
|
|
tcheck(t, err, "seek ham message")
|
2023-05-22 15:40:36 +03:00
|
|
|
err = f.UntrainMessage(ctxbg, hamf, hamsize, true)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "untrain ham message")
|
|
|
|
_, err = hamf.Seek(0, 0)
|
|
|
|
tcheck(t, err, "seek ham message")
|
2023-05-22 15:40:36 +03:00
|
|
|
err = f.UntrainMessage(ctxbg, hamf, spamsize, true)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "untrain ham message")
|
|
|
|
|
|
|
|
_, err = spamf.Seek(0, 0)
|
|
|
|
tcheck(t, err, "seek spam message")
|
2023-05-22 15:40:36 +03:00
|
|
|
err = f.UntrainMessage(ctxbg, spamf, spamsize, true)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "untrain spam message")
|
|
|
|
_, err = spamf.Seek(0, 0)
|
|
|
|
tcheck(t, err, "seek spam message")
|
2023-05-22 15:40:36 +03:00
|
|
|
err = f.UntrainMessage(ctxbg, spamf, spamsize, true)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "untrain spam message")
|
|
|
|
|
|
|
|
if !f.modified {
|
|
|
|
t.Fatalf("filter not modified after untraining")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Classify again, should be unknown.
|
|
|
|
_, err = hamf.Seek(0, 0)
|
|
|
|
tcheck(t, err, "seek ham message")
|
2023-05-22 15:40:36 +03:00
|
|
|
prob, _, _, _, err = f.ClassifyMessageReader(ctxbg, hamf, hamsize)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "classify ham")
|
|
|
|
if math.Abs(prob-0.5) > 0.1 {
|
|
|
|
t.Fatalf("got prob %v, expected 0.5 +-0.1", prob)
|
|
|
|
}
|
|
|
|
|
|
|
|
_, err = spamf.Seek(0, 0)
|
|
|
|
tcheck(t, err, "seek spam message")
|
2023-05-22 15:40:36 +03:00
|
|
|
prob, _, _, _, err = f.ClassifyMessageReader(ctxbg, spamf, spamsize)
|
2023-01-30 16:27:06 +03:00
|
|
|
tcheck(t, err, "classify spam")
|
|
|
|
if math.Abs(prob-0.5) > 0.1 {
|
|
|
|
t.Fatalf("got prob %v, expected 0.5 +-0.1", prob)
|
|
|
|
}
|
|
|
|
|
|
|
|
err = f.Close()
|
|
|
|
tcheck(t, err, "close filter")
|
|
|
|
}
|