mox/store/search.go

196 lines
5.2 KiB
Go
Raw Permalink Normal View History

package store
import (
"bytes"
"io"
"strings"
"unicode"
"unicode/utf8"
"github.com/mjl-/mox/message"
"github.com/mjl-/mox/mlog"
)
// WordSearch is the reusable state for a word search: the prepared search
// terms plus scratch buffers, so that matching a message does not have to
// allocate buffers of its own.
type WordSearch struct {
	// Lower-cased terms that must all be found for a message to match.
	words [][]byte
	// Lower-cased terms of which a single occurrence makes the message a miss.
	notWords [][]byte
	// Buffer that content is read into, chunk by chunk, and searched.
	searchBuf []byte
	// Copy of the unmodified tail of the previous chunk, put in front of the next
	// chunk so that terms spanning two chunks are still found.
	keepBuf []byte
}
// PrepareWordSearch builds a search context for words and notWords. The terms
// are lower-cased once here, and the scratch buffers are sized for the longest
// term. The returned context can be used to match multiple messages, one after
// the other but not concurrently, since the buffers are shared between calls.
func PrepareWordSearch(words, notWords []string) WordSearch {
	// lowered converts search terms to lower-cased byte slices. An empty list of
	// terms results in a nil slice.
	lowered := func(terms []string) [][]byte {
		var l [][]byte
		for _, t := range terms {
			l = append(l, []byte(strings.ToLower(t)))
		}
		return l
	}

	// Find the longest term (as passed in) over both lists.
	longest := 0
	for _, terms := range [][]string{words, notWords} {
		for _, t := range terms {
			if n := len(t); n > longest {
				longest = n
			}
		}
	}

	// The kept tail must be able to hold the longest term, plus room for a utf-8
	// character that may be cut off at a chunk boundary.
	keep := longest + 6

	// Start with 8 KiB, and double until the search buffer holds at least 8 times
	// the kept tail.
	size := 8 * 1024
	for size/keep < 8 {
		size *= 2
	}

	return WordSearch{
		words:     lowered(words),
		notWords:  lowered(notWords),
		searchBuf: make([]byte, size),
		keepBuf:   make([]byte, keep),
	}
}
// MatchPart reports whether the part/mail message p matches the search: all
// words were found and none of the not-words were. Terms are matched against
// content-transfer-decoded and charset-decoded bodies, and against headers too
// if headerToo is set. HTML parts are currently searched as regular text,
// without parsing the HTML.
//
// If searching fails, the result is false along with the error.
func (ws WordSearch) MatchPart(log mlog.Log, p *message.Part, headerToo bool) (bool, error) {
	// Indices into ws.words of the words found so far.
	found := map[int]bool{}

	miss, err := ws.matchPart(log, p, headerToo, found)
	if err != nil || miss {
		return false, err
	}
	// Not a miss, but it is only a match if every word was found.
	return len(found) == len(ws.words), nil
}
// isQuickHit reports whether the outcome is already known to be a match: all
// words have been seen, and there are no not-words that would force us to keep
// searching until the end.
func (ws WordSearch) isQuickHit(seen map[int]bool) bool {
	if len(ws.notWords) > 0 {
		// A not-word could still turn up in content we have not searched yet.
		return false
	}
	return len(seen) == len(ws.words)
}
// matchPart searches part p and, recursively, its subparts. The header of each
// part is searched too if headerToo is set. Words that are found are recorded in
// seen, by their index in ws.words.
//
// Searching stops as soon as the outcome is known: on a miss (a not-word was
// found), on an error, or on a quick hit. If no miss is returned, the caller
// must still check whether all words were seen.
func (ws WordSearch) matchPart(log mlog.Log, p *message.Part, headerToo bool, seen map[int]bool) (miss bool, rerr error) {
	if headerToo {
		m, err := ws.searchReader(log, p.HeaderReader(), seen)
		if err != nil || m || ws.isQuickHit(seen) {
			return m, err
		}
	}

	// Leaf part: only text is searched.
	if len(p.Parts) == 0 {
		if p.MediaType != "TEXT" {
			// todo: for other types we could try to find a library for parsing and search in there too.
			return false, nil
		}
		// todo: for html and perhaps other types, we could try to parse as text and filter on the text.
		// There are no subparts to continue with, so the result of searching the body
		// is the result for this part.
		return ws.searchReader(log, p.ReaderUTF8OrBinary(), seen)
	}

	for _, sub := range p.Parts {
		// For a part holding an embedded message, search that message instead.
		if sub.Message != nil {
			if err := sub.SetMessageReaderAt(); err != nil {
				return false, err
			}
			sub = *sub.Message
		}
		m, err := ws.matchPart(log, &sub, headerToo, seen)
		if err != nil || m || ws.isQuickHit(seen) {
			return m, err
		}
	}
	return false, nil
}
// searchReader reads through r, looking for the words and not-words. Words that
// are found are recorded in seen, by their index in ws.words.
//
// It returns miss true when a not-word was found, or when reading failed (the
// error is returned as well). It returns miss false as soon as all words have
// been seen and there are no not-words, or otherwise when the end of r is
// reached. In that last case the caller decides, based on seen, whether there
// is a match.
func (ws WordSearch) searchReader(log mlog.Log, r io.Reader, seen map[int]bool) (miss bool, rerr error) {
	// Content is processed in largish chunks, searched with bytes.Contains. Before
	// reading the next chunk, the tail of the current one is moved to the front of
	// the buffer: the size of the largest word plus room for a utf-8 character, so
	// that words spanning two chunks are found too.
	filled := 0
	for {
		n, err := io.ReadFull(r, ws.searchBuf[filled:])
		if n > 0 {
			filled += n
		}
		atEnd := err == io.EOF || err == io.ErrUnexpectedEOF
		if err != nil && !atEnd {
			return true, err
		}
		if !atEnd {
			// The buffer is full and more may follow. Save the unmodified tail now,
			// because lower-casing below changes the buffer in place.
			copy(ws.keepBuf, ws.searchBuf[filled-len(ws.keepBuf):])
		}

		chunk := toLower(ws.searchBuf[:filled])

		for i, w := range ws.words {
			if seen[i] || !bytes.Contains(chunk, w) {
				continue
			}
			seen[i] = true
			// With all words seen and no not-words to rule out, the answer is known.
			if len(ws.notWords) == 0 && len(seen) == len(ws.words) {
				return false, nil
			}
		}
		for _, nw := range ws.notWords {
			if bytes.Contains(chunk, nw) {
				return true, nil
			}
		}

		if atEnd {
			return false, nil
		}

		// Start the next chunk with the saved tail of this one.
		copy(ws.searchBuf, ws.keepBuf)
		filled = len(ws.keepBuf)
	}
}
// toLower returns the lower-cased form of buf.
//
// Lower-casing is done in place, reusing the memory of buf for the result, for
// as long as the result does not grow beyond the input that has been consumed.
// Only when a lower-case character takes more bytes than its original is a new
// slice allocated for the result. Either way, the contents of buf must be
// considered modified afterwards, and callers must use the returned slice.
//
// Bytes that are not valid utf-8, and the replacement character RuneError
// (0xfffd) itself, are replaced by a single byte with value 0: writing 0xfffd
// would often increase the size, and we assume no one wants to match it.
func toLower(buf []byte) []byte {
	r := buf[:0]
	copied := false // Whether r has its own memory, no longer shared with buf.
	for i := 0; i < len(buf); {
		// Fast path for ascii.
		if buf[i] < utf8.RuneSelf {
			b := buf[i]
			if b >= 'A' && b <= 'Z' {
				b += 0x20
			}
			r = append(r, b)
			i++
			continue
		}

		c, size := utf8.DecodeRune(buf[i:])
		i += size
		nc := unicode.ToLower(c)
		if nc < 0 {
			continue
		}
		if c == utf8.RuneError {
			r = append(r, 0)
			continue
		}
		nsize := utf8.RuneLen(nc)
		// Take care not to overwrite the part of the buffer we still have to process.
		if !copied && len(r)+nsize > i {
			// eg Ⱥ 0x23a (2 bytes) to ⱥ 0x2c65 (3 bytes)
			// Continue in a new slice, with room for this character and the remaining
			// input. From here on, r no longer shares memory with buf.
			copied = true
			nr := make([]byte, len(r), len(r)+nsize+len(buf)-i)
			copy(nr, r)
			r = nr
		}
		r = utf8.AppendRune(r, nc)
	}
	return r
}