package junk
// see https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering
// - todo: better html parsing?
// - todo: try reading text in pdf?
// - todo: try to detect language, have words per language? can be in the same dictionary. currently my dictionary is biased towards treating english as spam.

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"strings"
	"unicode"

	"golang.org/x/net/html"

	"github.com/mjl-/mox/message"
)

// tokenizeMail opens the message file at path, parses it, and returns the
// words from its headers and text parts. The boolean return indicates whether
// the file could be opened and read.
func (f *Filter) tokenizeMail(path string) (bool, map[string]struct{}, error) {
	mf, err := os.Open(path)
	if err != nil {
		return false, nil, err
	}
	defer func() {
		err := mf.Close()
		f.log.Check(err, "closing message file")
	}()

	fi, err := mf.Stat()
	if err != nil {
		return false, nil, err
	}
	// The error from EnsurePart is deliberately ignored: the returned part can
	// still be tokenized as far as it could be parsed.
	p, _ := message.EnsurePart(mf, fi.Size())
	words, err := f.ParseMessage(p)
	return true, words, err
}

// ParseMessage reads a mail and returns a map with words.
func (f *Filter) ParseMessage(p message.Part) (map[string]struct{}, error) {
	metaWords := map[string]struct{}{}
	textWords := map[string]struct{}{}
	htmlWords := map[string]struct{}{}

	hdrs, err := p.Header()
	if err != nil {
		return nil, fmt.Errorf("parsing headers: %v", err)
	}

	// Add words from the header, annotated with <field>+":".
	// todo: add whether header is dkim-verified?
	for k, l := range hdrs {
		for _, h := range l {
			switch k {
			case "From", "To", "Cc", "Bcc", "Reply-To", "Subject", "Sender", "Return-Path":
			// case "Subject", "To":
			default:
				continue
			}
			words := map[string]struct{}{}
			f.tokenizeText(strings.NewReader(h), words)
			for w := range words {
				if len(w) <= 3 {
					continue
				}
				metaWords[k+":"+w] = struct{}{}
			}
		}
	}

	if err := f.mailParse(p, metaWords, textWords, htmlWords); err != nil {
		return nil, fmt.Errorf("parsing message: %w", err)
	}

	for w := range metaWords {
		textWords[w] = struct{}{}
	}
	for w := range htmlWords {
		textWords[w] = struct{}{}
	}

	return textWords, nil
}
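
// A sketch of the result, assuming a filter with only Onegrams enabled and a
// hypothetical message with header "Subject: Weekly status report" and
// plain-text body "please review"; ParseMessage then returns roughly:
//
//	Subject:weekly
//	Subject:status
//	Subject:report
//	please
//	review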

// mailParse looks through the mail for text and html parts, and tokenizes their words.
func (f *Filter) mailParse(p message.Part, metaWords, textWords, htmlWords map[string]struct{}) error {
	ct := p.MediaType + "/" + p.MediaSubType
	if ct == "TEXT/HTML" {
		err := f.tokenizeHTML(p.ReaderUTF8OrBinary(), metaWords, htmlWords)
// log.Printf("html parsed, words %v", htmlWords)
return err
}
if ct == "" || strings . HasPrefix ( ct , "TEXT/" ) {
		err := f.tokenizeText(p.ReaderUTF8OrBinary(), textWords)
// log.Printf("text parsed, words %v", textWords)
return err
}
if p . Message != nil {
// Nested message, happens for forwarding.
if err := p . SetMessageReaderAt ( ) ; err != nil {
return fmt . Errorf ( "setting reader on nested message: %w" , err )
}
return f . mailParse ( * p . Message , metaWords , textWords , htmlWords )
}
for _ , sp := range p . Parts {
if err := f . mailParse ( sp , metaWords , textWords , htmlWords ) ; err != nil {
return err
}
}
return nil
}
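
// An illustration of the recursion (hypothetical structure): for a forwarded
// message such as
//
//	multipart/mixed
//	├── text/plain              -> tokenized into textWords
//	└── message/rfc822 (nested message)
//	    └── text/html           -> tokenized into htmlWords
//
// mailParse descends into each subpart and into the nested message.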

// looksRandom reports whether s looks like a randomly generated string, e.g.
// 2fvu9stm9yxhnlu: ASCII only, with a long stretch of consonants and/or digits.
func looksRandom(s string) bool {
	stretch := 0
	const consonants = "bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ23456789" // 0 and 1 may be used as o and l/i
	stretches := 0
	for _, c := range s {
		if c >= 0x80 {
			return false
		}
		if strings.ContainsRune(consonants, c) {
			stretch++
			continue
		}
		if stretch >= 6 {
			stretches++
		}
		stretch = 0
	}
	if stretch >= 6 {
		stretches++
	}
	return stretches > 0
}
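
// For illustration (a sketch, not used by the filter): looksRandom("2fvu9stm9yxhnlu")
// is true because of the long consonant/digit stretch, while looksRandom("example")
// is false (no stretch of six or more) and looksRandom("café") is false (non-ASCII).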

// looksNumeric reports whether s looks like a number or identifier made up of
// digits and hexadecimal characters, e.g. a date, id or checksum.
func looksNumeric(s string) bool {
	s = strings.TrimPrefix(s, "0x") // Hexadecimal.
	var digits, hex, other, digitstretch, maxdigitstretch int
	for _, c := range s {
		if c >= '0' && c <= '9' {
			digits++
			digitstretch++
			continue
		} else if c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F' {
			hex++
		} else {
			other++
		}
		if digitstretch > maxdigitstretch {
			maxdigitstretch = digitstretch
		}
		// Start counting a new stretch after a non-digit.
		digitstretch = 0
	}
	if digitstretch > maxdigitstretch {
		maxdigitstretch = digitstretch
	}
	return maxdigitstretch >= 4 || other == 0 && maxdigitstretch >= 3
}
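
// For illustration (a sketch, not used by the filter): looksNumeric("20240115")
// and looksNumeric("abc1234def") are true (a stretch of four or more digits),
// looksNumeric("0x123abc") is true (digits/hex only with a stretch of three),
// and looksNumeric("v2") is false.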

// tokenizeText splits the text from r into lowercased words, adding them to
// words, together with two- and three-word combinations if the filter has
// Twograms/Threegrams enabled.
func (f *Filter) tokenizeText(r io.Reader, words map[string]struct{}) error {
	b := &strings.Builder{}
	var prev string
	var prev2 string

	// add registers the word collected in b, and the n-grams it forms with the
	// previous words, then resets the builder.
	add := func() {
		defer b.Reset()
		if b.Len() <= 2 {
			return
		}
		s := b.String()
		s = strings.Trim(s, "'")
		var nondigit bool
		for _, c := range s {
			if !unicode.IsDigit(c) {
				nondigit = true
				break
			}
		}
		if !(nondigit && len(s) > 2) {
			return
		}

		if looksRandom(s) {
			return
		}
		if looksNumeric(s) {
			return
		}

		// todo: do something for URLs, parse them? keep their domain only?

		if f.Threegrams && prev2 != "" && prev != "" {
			words[prev2+" "+prev+" "+s] = struct{}{}
		}
		if f.Twograms && prev != "" {
			words[prev+" "+s] = struct{}{}
		}
		if f.Onegrams {
			words[s] = struct{}{}
		}
		prev2 = prev
		prev = s
	}

	br := bufio.NewReader(r)
	peekLetter := func() bool {
		c, _, err := br.ReadRune()
		br.UnreadRune()
		return err == nil && unicode.IsLetter(c)
	}

	for {
		c, _, err := br.ReadRune()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		if !unicode.IsLetter(c) && !unicode.IsDigit(c) && (c != '\'' || b.Len() > 0 && peekLetter()) {
			add()
		} else {
			b.WriteRune(unicode.ToLower(c))
		}
	}
	add()
	return nil
}
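
// exampleTokenizeText is a minimal sketch, not used by the filter itself,
// showing how tokenizeText builds the word set; the function name and the
// example input are illustrative only.
func exampleTokenizeText(f *Filter) map[string]struct{} {
	words := map[string]struct{}{}
	if err := f.tokenizeText(strings.NewReader("Weekly status report 12345"), words); err != nil {
		return nil
	}
	// With only Onegrams enabled this yields "weekly", "status" and "report";
	// "12345" is dropped because it consists of digits only. With Twograms,
	// "weekly status" and "status report" are added as well.
	return words
}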

// tokenizeHTML parses html, and tokenizes its text into words.
func (f *Filter) tokenizeHTML(r io.Reader, meta, words map[string]struct{}) error {
	htmlReader := &htmlTextReader{
		t:    html.NewTokenizer(r),
		meta: map[string]struct{}{},
	}
	return f.tokenizeText(htmlReader, words)
}
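
// A sketch of the effect (hypothetical input): for the fragment
//
//	<p>Special <b>offer</b><script>var x = 1</script></p>
//
// only the visible text is tokenized, so with Onegrams enabled the words are
// "special" and "offer"; the script contents are skipped.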

// htmlTextReader is an io.Reader that yields the readable text of an HTML
// document: the contents of text nodes outside script, style and svg
// elements, and the alt text of img tags.
type htmlTextReader struct {
	t        *html.Tokenizer
	meta     map[string]struct{}
	tagStack []string
	buf      []byte
	err      error
}

func (r *htmlTextReader) Read(buf []byte) (n int, err error) {
	// todo: deal with invalid html better. the tokenizer is just tokenizing, we need to fix up the nesting etc. eg, rules say some elements close certain open elements.
	// todo: deal with inline elements? they shouldn't cause a word break.

	// give copies as much of nbuf as fits into the caller's buffer and stashes
	// the remainder in r.buf for the next Read.
	give := func(nbuf []byte) (int, error) {
		n := len(buf)
		if n > len(nbuf) {
			n = len(nbuf)
		}
		copy(buf, nbuf[:n])
		nbuf = nbuf[n:]
		if len(nbuf) < cap(r.buf) {
			r.buf = r.buf[:len(nbuf)]
		} else {
			r.buf = make([]byte, len(nbuf), 3*len(nbuf)/2)
		}
		copy(r.buf, nbuf)
		return n, nil
	}
if len ( r . buf ) > 0 {
return give ( r . buf )
}
if r . err != nil {
return 0 , r . err
}
for {
switch r . t . Next ( ) {
case html . ErrorToken :
r . err = r . t . Err ( )
return 0 , r . err
case html . TextToken :
if len ( r . tagStack ) > 0 {
switch r . tagStack [ len ( r . tagStack ) - 1 ] {
case "script" , "style" , "svg" :
continue
}
}
buf := r . t . Text ( )
if len ( buf ) > 0 {
return give ( buf )
}
case html . StartTagToken :
tagBuf , moreAttr := r . t . TagName ( )
tag := string ( tagBuf )
//log.Printf("tag %q %v", tag, r.tagStack)
if tag == "img" && moreAttr {
var key , val [ ] byte
for moreAttr {
key , val , moreAttr = r . t . TagAttr ( )
if string ( key ) == "alt" && len ( val ) > 0 {
return give ( val )
}
}
}
// Empty elements, https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
switch tag {
case "area" , "base" , "br" , "col" , "embed" , "hr" , "img" , "input" , "link" , "meta" , "param" , "source" , "track" , "wbr" :
continue
}
r . tagStack = append ( r . tagStack , tag )
case html . EndTagToken :
// log.Printf("tag pop %v", r.tagStack)
if len ( r . tagStack ) > 0 {
r . tagStack = r . tagStack [ : len ( r . tagStack ) - 1 ]
}
case html . SelfClosingTagToken :
case html . CommentToken :
case html . DoctypeToken :
}
}
}
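
// A sketch of the buffering behaviour (sizes are hypothetical): if a text node
// yields 100 bytes and the caller's buffer holds 64, Read returns the first 64
// bytes and stashes the remaining 36 in r.buf; the next call to Read drains
// r.buf before asking the tokenizer for more tokens.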