package junk

// see https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering
// - todo: better html parsing?
// - todo: try reading text in pdf?
// - todo: try to detect language, have words per language? can be in the same dictionary. currently my dictionary is biased towards treating english as spam.

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"strings"
	"unicode"

	"golang.org/x/net/html"

	"github.com/mjl-/mox/message"
)
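
// tokenizeMail opens the message file at path and returns its tokenized words.
// The bool return is false if the file could not be opened or stat'ed.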
func (f *Filter) tokenizeMail(path string) (bool, map[string]struct{}, error) {
	mf, err := os.Open(path)
	if err != nil {
		return false, nil, err
	}
	defer mf.Close()
	fi, err := mf.Stat()
	if err != nil {
		return false, nil, err
	}
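	// A parse error is ignored; we tokenize whatever part of the message could be parsed.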
	p, _ := message.EnsurePart(mf, fi.Size())
	words, err := f.ParseMessage(p)
	return true, words, err
}

// ParseMessage reads a mail and returns a map with words.
func (f *Filter) ParseMessage(p message.Part) (map[string]struct{}, error) {
	metaWords := map[string]struct{}{}
	textWords := map[string]struct{}{}
	htmlWords := map[string]struct{}{}

	hdrs, err := p.Header()
	if err != nil {
		return nil, fmt.Errorf("parsing headers: %v", err)
	}

	// Add words from the header, annotated with <field>+":".
	// todo: add whether header is dkim-verified?
	for k, l := range hdrs {
		for _, h := range l {
			switch k {
			case "From", "To", "Cc", "Bcc", "Reply-To", "Subject", "Sender", "Return-Path":
			// case "Subject", "To":
			default:
				continue
			}
			words := map[string]struct{}{}
			f.tokenizeText(strings.NewReader(h), words)
			for w := range words {
				if len(w) <= 3 {
					continue
				}
				metaWords[k+":"+w] = struct{}{}
			}
		}
	}

	if err := f.mailParse(p, metaWords, textWords, htmlWords); err != nil {
		return nil, fmt.Errorf("parsing message: %w", err)
	}
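
	// Merge the header and html words into the text words, so a single set is returned.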
	for w := range metaWords {
		textWords[w] = struct{}{}
	}
	for w := range htmlWords {
		textWords[w] = struct{}{}
	}

	return textWords, nil
}

// mailParse looks through the mail for the first text and html parts, and tokenizes their words.
func (f *Filter) mailParse(p message.Part, metaWords, textWords, htmlWords map[string]struct{}) error {
	ct := p.MediaType + "/" + p.MediaSubType
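
	// The media type and subtype are stored in upper case, hence the "TEXT/HTML" comparisons below.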
	if ct == "TEXT/HTML" {
		err := f.tokenizeHTML(p.Reader(), metaWords, htmlWords)
		// log.Printf("html parsed, words %v", htmlWords)
		return err
	}
	if ct == "" || strings.HasPrefix(ct, "TEXT/") {
		err := f.tokenizeText(p.Reader(), textWords)
		// log.Printf("text parsed, words %v", textWords)
		return err
	}
	if p.Message != nil {
		// Nested message, happens for forwarding.
		if err := p.SetMessageReaderAt(); err != nil {
			return fmt.Errorf("setting reader on nested message: %w", err)
		}
		return f.mailParse(*p.Message, metaWords, textWords, htmlWords)
	}
	for _, sp := range p.Parts {
		if err := f.mailParse(sp, metaWords, textWords, htmlWords); err != nil {
			return err
		}
	}
	return nil
}
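
// looksRandom reports whether s looks like a randomly-generated string of ASCII
// letters and digits, based on long stretches of consonants.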
func looksRandom(s string) bool {
	// Random strings, eg 2fvu9stm9yxhnlu. ASCII only and many consonants in a stretch.
	stretch := 0
	const consonants = "bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ23456789" // 0 and 1 may be used as o and l/i
	stretches := 0
	for _, c := range s {
		if c >= 0x80 {
			return false
		}
		if strings.ContainsRune(consonants, c) {
			stretch++
			continue
		}
		if stretch >= 6 {
			stretches++
		}
		stretch = 0
	}
	if stretch >= 6 {
		stretches++
	}
	return stretches > 0
}
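
// looksNumeric reports whether s is mostly a number, e.g. a date, size or (hexadecimal) id.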
func looksNumeric(s string) bool {
	s = strings.TrimPrefix(s, "0x") // Hexadecimal.
	var digits, hex, other, digitstretch, maxdigitstretch int
	for _, c := range s {
		if c >= '0' && c <= '9' {
			digits++
			digitstretch++
			continue
		} else if c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F' {
			hex++
		} else {
			other++
		}
		if digitstretch > maxdigitstretch {
			maxdigitstretch = digitstretch
		}
		// Reset at the end of a stretch of digits.
		digitstretch = 0
	}
	if digitstretch > maxdigitstretch {
		maxdigitstretch = digitstretch
	}
	return maxdigitstretch >= 4 || other == 0 && maxdigitstretch >= 3
}
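
// tokenizeText splits the text from r into words, adding the enabled one-, two-
// and three-grams to words. E.g. with two-grams enabled, "free money now" adds
// "free money" and "money now".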
func (f *Filter) tokenizeText(r io.Reader, words map[string]struct{}) error {
	b := &strings.Builder{}
	var prev string
	var prev2 string
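
	// add adds the word collected in b, if it qualifies (not too short, not
	// random-looking, not numeric), along with the n-grams it forms with the
	// previous words.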
	add := func() {
		defer b.Reset()
		if b.Len() <= 2 {
			return
		}

		s := b.String()
		s = strings.Trim(s, "'")
		var nondigit bool
		for _, c := range s {
			if !unicode.IsDigit(c) {
				nondigit = true
				break
			}
		}

		if !(nondigit && len(s) > 2) {
			return
		}

		if looksRandom(s) {
			return
		}
		if looksNumeric(s) {
			return
		}

		// todo: do something for URLs, parse them? keep their domain only?

		if f.Threegrams && prev2 != "" && prev != "" {
			words[prev2+" "+prev+" "+s] = struct{}{}
		}
		if f.Twograms && prev != "" {
			words[prev+" "+s] = struct{}{}
		}
		if f.Onegrams {
			words[s] = struct{}{}
		}
		prev2 = prev
		prev = s
	}

	br := bufio.NewReader(r)
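
	// peekLetter reports whether the next rune is a letter, without consuming it.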
	peekLetter := func() bool {
		c, _, err := br.ReadRune()
		br.UnreadRune()
		return err == nil && unicode.IsLetter(c)
	}
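
	// Letters and digits make up words. Any other rune ends the current word,
	// except an apostrophe: inside a word (followed by a letter) it ends the
	// word, otherwise it is buffered and trimmed off again in add.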
	for {
		c, _, err := br.ReadRune()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		if !unicode.IsLetter(c) && !unicode.IsDigit(c) && (c != '\'' || b.Len() > 0 && peekLetter()) {
			add()
		} else {
			b.WriteRune(unicode.ToLower(c))
		}
	}
	add()
	return nil
}

// tokenizeHTML parses html, and tokenizes its text into words.
func (f *Filter) tokenizeHTML(r io.Reader, meta, words map[string]struct{}) error {
	htmlReader := &htmlTextReader{
		t:    html.NewTokenizer(r),
		meta: map[string]struct{}{},
	}
	return f.tokenizeText(htmlReader, words)
}
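
// htmlTextReader is an io.Reader that returns the text of an html document,
// skipping the contents of script, style and svg elements and including img
// alt attributes.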
type htmlTextReader struct {
	t        *html.Tokenizer
	meta     map[string]struct{}
	tagStack []string
	buf      []byte
	err      error
}

func (r *htmlTextReader) Read(buf []byte) (n int, err error) {
	// todo: deal with invalid html better. the tokenizer is just tokenizing, we need to fix up the nesting etc. eg, rules say some elements close certain open elements.
	// todo: deal with inline elements? they shouldn't cause a word break.
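
	// give copies as much of nbuf into buf as fits, saving the remainder in
	// r.buf for the next Read call.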
	give := func(nbuf []byte) (int, error) {
		n := len(buf)
		if n > len(nbuf) {
			n = len(nbuf)
		}
		copy(buf, nbuf[:n])
		nbuf = nbuf[n:]
		if len(nbuf) < cap(r.buf) {
			r.buf = r.buf[:len(nbuf)]
		} else {
			r.buf = make([]byte, len(nbuf), 3*len(nbuf)/2)
		}
		copy(r.buf, nbuf)
		return n, nil
	}
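
	// First return leftover text from a previous call, or a pending error.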
	if len(r.buf) > 0 {
		return give(r.buf)
	}
	if r.err != nil {
		return 0, r.err
	}

	for {
		switch r.t.Next() {
		case html.ErrorToken:
			r.err = r.t.Err()
			return 0, r.err
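		// Text is returned, unless we are directly inside a script, style or svg element.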
		case html.TextToken:
			if len(r.tagStack) > 0 {
				switch r.tagStack[len(r.tagStack)-1] {
				case "script", "style", "svg":
					continue
				}
			}
			buf := r.t.Text()
			if len(buf) > 0 {
				return give(buf)
			}
		case html.StartTagToken:
			tagBuf, moreAttr := r.t.TagName()
			tag := string(tagBuf)
			//log.Printf("tag %q %v", tag, r.tagStack)
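
			// The alt text of an image is text too.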
			if tag == "img" && moreAttr {
				var key, val []byte
				for moreAttr {
					key, val, moreAttr = r.t.TagAttr()
					if string(key) == "alt" && len(val) > 0 {
						return give(val)
					}
				}
			}

			// Empty elements, https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
			switch tag {
			case "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr":
				continue
			}
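
			// Track the open element, so text inside script/style/svg can be skipped above.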
			r.tagStack = append(r.tagStack, tag)
		case html.EndTagToken:
			// log.Printf("tag pop %v", r.tagStack)
			if len(r.tagStack) > 0 {
				r.tagStack = r.tagStack[:len(r.tagStack)-1]
			}
		case html.SelfClosingTagToken:
		case html.CommentToken:
		case html.DoctypeToken:
		}
	}
}