mox/vendor/github.com/mjl-/bstore/parse.go
Mechiel Lukkien bf04fb8a1a
improve training of junk filter
before, we used heuristics to decide when to train/untrain a message as junk or
nonjunk: the message had to be seen and be in certain mailboxes. if a message
was marked as junk, it was trained as junk, and otherwise as nonjunk. this
wasn't good enough: you may want to keep some messages around as neither junk
nor nonjunk, and that wasn't possible.

ideally, we would just look at the imap $Junk and $NotJunk flags. the problem
is that mail clients don't set these flags, or don't make it easy. thunderbird
can set the flags based on its own bayesian filter. it has a shortcut for
marking a message as Junk and moving it to the junk folder (good), but the
notjunk counterpart only marks a message as notjunk, without showing that in
the UI. there is also no "move and mark as notjunk" mechanism; e.g. "archive"
does not mark a message as notjunk. ios mail and mutt don't appear to
have any way to see or change the $Junk and $NotJunk flags.

what email clients do have is the ability to move messages to other
mailboxes/folders. so mox now has a mechanism that allows you to configure
mailboxes that automatically set $Junk or $NotJunk (or clear both) when a
message is moved/copied/delivered to that folder. e.g. a mailbox called junk or
spam or rejects marks its messages as junk. inbox, postmaster, dmarc, tlsrpt,
neutral* mark their messages as neither junk nor notjunk. other folders, e.g.
list/* and archive, mark their messages as notjunk. this functionality is
optional, but enabled with the quickstart and for new accounts.
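
for illustration, the account config stanza might look something like this (a
sketch only; the exact field names are assumptions, not taken from this
commit):

	AutomaticJunkFlags:
		Enabled: true
		JunkMailboxRegexp: ^(junk|spam)
		NeutralMailboxRegexp: ^(inbox|postmaster|dmarc|tlsrpt|rejects)
		NotJunkMailboxRegexp: .*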

also, mox now keeps track of the previous training of a message and will only
untrain/train when needed. before, there were probably duplicate or missing
(un)trainings.

this also includes a new subcommand "retrain" to recreate the junkfilter for an
account. you should run it after updating to this version. and you should
probably also modify your account config to include the AutomaticJunkFlags.
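
for example (assuming the subcommand takes the account name as its only
argument):

	mox retrain youraccount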
2023-02-11 23:00:12 +01:00

package bstore

import (
	"encoding"
	"encoding/binary"
	"fmt"
	"math"
	"reflect"
	"time"
)

type parser struct {
	buf  []byte
	orig []byte
}

func (p *parser) Errorf(format string, args ...any) {
	panic(parseErr{fmt.Errorf(format, args...)})
}

func (p *parser) checkInt(un uint64) int {
	if un > math.MaxInt32 {
		p.Errorf("%w: uvarint %d does not fit in int32", ErrStore, un)
	}
	return int(un)
}

// Fieldmap starts a new fieldmap for n fields.
func (p *parser) Fieldmap(n int) *fieldmap {
	// log.Printf("parse fieldmap %d bits", n)
	nb := (n + 7) / 8
	buf := p.Take(nb)
	return &fieldmap{n, buf, 0, 0, p.Errorf}
}
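
// For example, a fieldmap covering 10 fields occupies (10+7)/8 = 2 bytes,
// with bit i indicating whether field i has a nonzero value encoded in the
// data that follows.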

// Take reads nb bytes.
func (p *parser) Take(nb int) []byte {
	// log.Printf("take %d", nb)
	if len(p.buf) < nb {
		p.Errorf("%w: not enough bytes", ErrStore)
	}
	buf := p.buf[:nb]
	p.buf = p.buf[nb:]
	return buf
}

// TakeBytes reads a uvarint representing the size of the bytes, followed by
// that number of bytes.
// dup is needed if you need to hold on to the bytes. Values from BoltDB are
// only valid in the transaction, not meant to be modified, and are
// memory-mapped read-only.
func (p *parser) TakeBytes(dup bool) []byte {
	un := p.Uvarint()
	n := p.checkInt(un)
	buf := p.Take(n)
	if dup {
		// todo: check for a max size, beyond which we refuse to allocate?
		nbuf := make([]byte, len(buf))
		copy(nbuf, buf)
		buf = nbuf
	}
	return buf
}
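
// For example, the wire format read by TakeBytes stores the string "hi" as
// three bytes: the uvarint length 0x02 followed by 'h' and 'i'.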

func (p *parser) Uvarint() uint64 {
	v, n := binary.Uvarint(p.buf)
	if n == 0 {
		p.Errorf("%w: uvarint: not enough bytes", ErrStore)
	}
	if n < 0 {
		p.Errorf("%w: uvarint overflow", ErrStore)
	}
	// log.Printf("take uvarint, %d bytes", n)
	p.buf = p.buf[n:]
	return v
}

func (p *parser) Varint() int64 {
	v, n := binary.Varint(p.buf)
	if n == 0 {
		p.Errorf("%w: varint: not enough bytes", ErrStore)
	}
	if n < 0 {
		p.Errorf("%w: varint overflow", ErrStore)
	}
	// log.Printf("take varint, %d bytes", n)
	p.buf = p.buf[n:]
	return v
}
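
// Note on the encoding (standard library behavior, for reference):
// binary.Varint uses zigzag encoding, so -1 is the single byte 0x01 and 1 is
// 0x02, while binary.Uvarint encodes 300 as 0xac 0x02.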

type parseErr struct {
	err error
}

// parse rv (reflect.Struct) from buf.
// does not parse the primary key field.
func (st storeType) parse(rv reflect.Value, buf []byte) (rerr error) {
	p := &parser{buf: buf, orig: buf}
	var version uint32
	defer func() {
		x := recover()
		if x == nil {
			return
		}
		perr, ok := x.(parseErr)
		if ok {
			rerr = fmt.Errorf("%w (version %d, buf %x, orig %x)", perr.err, version, p.buf, p.orig)
			return
		}
		panic(x)
	}()
	version = uint32(p.Uvarint())
	tv, ok := st.Versions[version]
	if !ok {
		return fmt.Errorf("%w: unknown type version %d", ErrStore, version)
	}
	tv.parse(p, rv)
	if len(p.buf) != 0 {
		return fmt.Errorf("%w: leftover data after parsing", ErrStore)
	}
	return nil
}
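
// Summary of the value layout parsed above: a uvarint type version, then a
// fieldmap covering all non-primary-key fields, then the encoded nonzero
// fields in order. The primary key itself is stored as the BoltDB key, not
// in this value.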

// parseNew parses bk and bv into a newly created value of type st.Type.
func (st storeType) parseNew(bk, bv []byte) (reflect.Value, error) {
	rv := reflect.New(st.Type).Elem()
	if err := st.parseFull(rv, bk, bv); err != nil {
		return reflect.Value{}, err
	}
	return rv, nil
}

// parseFull parses a full record from bk and bv into value rv, which must be
// of type st.Type.
func (st storeType) parseFull(rv reflect.Value, bk, bv []byte) error {
	if err := parsePK(rv.Field(0), bk); err != nil {
		return err
	}
	err := st.parse(rv, bv)
	if err != nil {
		return err
	}
	return nil
}
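
// Illustrative use of parseNew (a sketch, not from the original source):
// iterating over a hypothetical BoltDB records bucket and decoding each
// key/value pair.
//
//	err := recordsBucket.ForEach(func(bk, bv []byte) error {
//		rv, err := st.parseNew(bk, bv)
//		if err != nil {
//			return err
//		}
//		fmt.Println(rv.Interface()) // use the decoded record
//		return nil
//	})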

func (tv typeVersion) parse(p *parser, rv reflect.Value) {
	// First field is the primary key, stored as boltdb key only, not in
	// the value.
	fm := p.Fieldmap(len(tv.Fields) - 1)
	for i, f := range tv.Fields[1:] {
		if f.structField.Type == nil {
			// Do not parse this field in the current Go type, but
			// we must still skip over the bytes.
			if fm.Nonzero(i) {
				f.Type.skip(p)
			}
			continue
		}
		if fm.Nonzero(i) {
			f.Type.parse(p, rv.FieldByIndex(f.structField.Index))
		} else if f.Nonzero {
			// Consistency check. Should not happen, we enforce nonzeroness.
			p.Errorf("%w: unexpected nonzero value for %q", ErrStore, f.Name)
		} else {
			rv.FieldByIndex(f.structField.Index).Set(reflect.Zero(f.structField.Type))
		}
	}
}

// parse a nonzero fieldType.
func (ft fieldType) parse(p *parser, rv reflect.Value) {
	// Because we allow schema changes from ptr to nonptr, rv can be a
	// pointer or direct value regardless of ft.Ptr.
	if rv.Kind() == reflect.Ptr {
		nrv := reflect.New(rv.Type().Elem())
		rv.Set(nrv)
		rv = nrv.Elem()
	}
	switch ft.Kind {
	case kindBytes:
		rv.SetBytes(p.TakeBytes(true))
	case kindBinaryMarshal:
		buf := p.TakeBytes(false)
		t := rv.Type()
		if t.Kind() == reflect.Ptr {
			t = t.Elem()
		}
		v := reflect.New(t)
		err := v.Interface().(encoding.BinaryUnmarshaler).UnmarshalBinary(buf)
		if err != nil {
			panic(parseErr{err})
		}
		if rv.Type().Kind() == reflect.Ptr {
			rv.Set(v)
		} else {
			rv.Set(v.Elem())
		}
	case kindBool:
		if ft.Ptr {
			buf := p.Take(1)
			rv.SetBool(buf[0] != 0)
		} else {
			// Non-pointer bools take no space: presence in the
			// fieldmap already means the value is nonzero, i.e. true.
			rv.SetBool(true)
		}
	case kindInt:
		v := p.Varint()
		if v < math.MinInt32 || v > math.MaxInt32 {
			p.Errorf("%w: int %d does not fit in int32", ErrStore, v)
		}
		rv.SetInt(v)
	case kindInt8, kindInt16, kindInt32, kindInt64:
		rv.SetInt(p.Varint())
	case kindUint:
		v := p.Uvarint()
		if v > math.MaxUint32 {
			p.Errorf("%w: uint %d does not fit in uint32", ErrStore, v)
		}
		rv.SetUint(v)
	case kindUint8, kindUint16, kindUint32, kindUint64:
		rv.SetUint(p.Uvarint())
	case kindFloat32:
		rv.SetFloat(float64(math.Float32frombits(uint32(p.Uvarint()))))
	case kindFloat64:
		rv.SetFloat(math.Float64frombits(p.Uvarint()))
	case kindString:
		rv.SetString(string(p.TakeBytes(false)))
	case kindTime:
		err := rv.Addr().Interface().(*time.Time).UnmarshalBinary(p.TakeBytes(false))
		if err != nil {
			p.Errorf("%w: parsing time: %s", ErrStore, err)
		}
	case kindSlice:
		un := p.Uvarint()
		n := p.checkInt(un)
		fm := p.Fieldmap(n)
		slc := reflect.MakeSlice(rv.Type(), n, n)
		for i := 0; i < n; i++ {
			if fm.Nonzero(i) {
				ft.List.parse(p, slc.Index(i))
			}
		}
		rv.Set(slc)
	case kindMap:
		un := p.Uvarint()
		n := p.checkInt(un)
		fm := p.Fieldmap(n)
		mp := reflect.MakeMapWithSize(rv.Type(), n)
		for i := 0; i < n; i++ {
			mk := reflect.New(rv.Type().Key()).Elem()
			ft.MapKey.parse(p, mk)
			mv := reflect.New(rv.Type().Elem()).Elem()
			if fm.Nonzero(i) {
				ft.MapValue.parse(p, mv)
			}
			mp.SetMapIndex(mk, mv)
		}
		rv.Set(mp)
	case kindStruct:
		fm := p.Fieldmap(len(ft.Fields))
		strct := reflect.New(rv.Type()).Elem()
		for i, f := range ft.Fields {
			if f.structField.Type == nil {
				// Field no longer exists in the current Go type;
				// its bytes are only present if the fieldmap bit
				// is set, matching typeVersion.parse above.
				if fm.Nonzero(i) {
					f.Type.skip(p)
				}
				continue
			}
			if fm.Nonzero(i) {
				f.Type.parse(p, strct.FieldByIndex(f.structField.Index))
			} else if f.Nonzero {
				// Consistency check, we enforce that nonzero is not stored if not allowed.
				p.Errorf("%w: %q", ErrZero, f.Name)
			} else {
				strct.FieldByIndex(f.structField.Index).Set(reflect.Zero(f.structField.Type))
			}
		}
		rv.Set(strct)
	default:
		p.Errorf("internal error: unhandled field type") // should be prevented when registering type
	}
}

// skip over the bytes for this fieldType. Needed when an older typeVersion has
// a field that the current reflect.Type does not (cannot) have.
func (ft fieldType) skip(p *parser) {
	switch ft.Kind {
	case kindBytes, kindBinaryMarshal, kindString:
		p.TakeBytes(false)
	case kindBool:
		if ft.Ptr {
			p.Take(1)
		}
	case kindInt8, kindInt16, kindInt32, kindInt, kindInt64:
		p.Varint()
	case kindUint8, kindUint16, kindUint32, kindUint, kindUint64, kindFloat32, kindFloat64:
		p.Uvarint()
	case kindTime:
		p.TakeBytes(false)
	case kindSlice:
		un := p.Uvarint()
		n := p.checkInt(un)
		fm := p.Fieldmap(n)
		for i := 0; i < n; i++ {
			if fm.Nonzero(i) {
				ft.List.skip(p)
			}
		}
	case kindMap:
		un := p.Uvarint()
		n := p.checkInt(un)
		fm := p.Fieldmap(n)
		for i := 0; i < n; i++ {
			ft.MapKey.skip(p)
			if fm.Nonzero(i) {
				ft.MapValue.skip(p)
			}
		}
	case kindStruct:
		fm := p.Fieldmap(len(ft.Fields))
		for i, f := range ft.Fields {
			if fm.Nonzero(i) {
				f.Type.skip(p)
			}
		}
	default:
		p.Errorf("internal error: unhandled field type") // should be prevented when registering type
	}
}