mox/store/search.go
Mechiel Lukkien 5b20cba50a
switch to slog.Logger for logging, for easier reuse of packages by external software
we don't want external software to include internal details like mlog.
slog.Logger is/will be the standard.

we still have mlog for its helper functions, and its handler that logs in
concise logfmt used by mox.

packages that are not meant for reuse still pass around mlog.Log for
convenience.

we use golang.org/x/exp/slog because we also support the previous Go toolchain
version. with the next Go release, we'll switch to the builtin slog.
2023-12-14 13:45:52 +01:00

195 lines
5.2 KiB
Go

package store
import (
"bytes"
"io"
"strings"
"unicode"
"unicode/utf8"
"github.com/mjl-/mox/message"
"github.com/mjl-/mox/mlog"
)
// WordSearch holds context for a search, with scratch buffers to prevent
// allocations for each message.
//
// Prepare with PrepareWordSearch, then call MatchPart for each message. The
// scratch buffers make a WordSearch unsafe for concurrent use.
type WordSearch struct {
	words, notWords [][]byte // Lower-cased search terms; all words must occur and no notWords may occur for a match.
	searchBuf, keepBuf []byte // searchBuf is the chunk read buffer; keepBuf holds the tail carried over between chunks so words spanning chunk boundaries are found.
}
// PrepareWordSearch returns a search context that can be used to match multiple
// messages (after each other, not concurrently).
func PrepareWordSearch(words, notWords []string) WordSearch {
	// Search terms are matched case-insensitively, against lower-cased content.
	lowered := func(strs []string) [][]byte {
		var l [][]byte
		for _, s := range strs {
			l = append(l, []byte(strings.ToLower(s)))
		}
		return l
	}

	// Determine the longest search term. The buffer tail we keep between read
	// chunks must be at least that long, so a term spanning two chunks is found.
	longest := 0
	for _, strs := range [][]string{words, notWords} {
		for _, s := range strs {
			if len(s) > longest {
				longest = len(s)
			}
		}
	}
	longest += 6 // Max utf-8 character size.

	// Choose a read buffer that is large relative to the kept tail, so most of
	// each read is fresh data.
	bufSize := 8 * 1024
	for bufSize/longest < 8 {
		bufSize *= 2
	}

	return WordSearch{lowered(words), lowered(notWords), make([]byte, bufSize), make([]byte, longest)}
}
// MatchPart returns whether the part/mail message p matches the search.
// The search terms are matched against content-transfer-decoded and
// charset-decoded bodies and optionally headers.
// HTML parts are currently treated as regular text, without parsing HTML.
func (ws WordSearch) MatchPart(log mlog.Log, p *message.Part, headerToo bool) (bool, error) {
	seen := map[int]bool{}
	miss, err := ws.matchPart(log, p, headerToo, seen)
	if err != nil || miss {
		return false, err
	}
	// Only a match if every word was seen somewhere in the message.
	return len(seen) == len(ws.words), nil
}
// isQuickHit reports whether the search can stop early: all words have been
// seen and there are no not-words that would force scanning to the end.
func (ws WordSearch) isQuickHit(seen map[int]bool) bool {
	if len(ws.notWords) > 0 {
		return false
	}
	return len(seen) == len(ws.words)
}
// search a part as text and/or its subparts, recursively. Once we know we have
// a miss, we stop (either due to not-word match or error). In case of
// non-miss, the caller checks if there was a hit.
//
// seen records the indexes (into ws.words) of words found so far, accumulated
// across headers and all (sub)parts.
func (ws WordSearch) matchPart(log mlog.Log, p *message.Part, headerToo bool, seen map[int]bool) (miss bool, rerr error) {
	if headerToo {
		// Search the raw headers first; a not-word hit or error ends the search.
		miss, err := ws.searchReader(log, p.HeaderReader(), seen)
		if miss || err != nil || ws.isQuickHit(seen) {
			return miss, err
		}
	}
	if len(p.Parts) == 0 {
		// Leaf part: only text bodies are searched.
		if p.MediaType != "TEXT" {
			// todo: for other types we could try to find a library for parsing and search in there too.
			return false, nil
		}
		tp := p.ReaderUTF8OrBinary()
		// todo: for html and perhaps other types, we could try to parse as text and filter on the text.
		miss, err := ws.searchReader(log, tp, seen)
		if miss || err != nil || ws.isQuickHit(seen) {
			return miss, err
		}
	}
	for _, pp := range p.Parts {
		if pp.Message != nil {
			// Embedded message (e.g. message/rfc822): parse it and descend into
			// the contained message instead of the wrapper part.
			if err := pp.SetMessageReaderAt(); err != nil {
				return false, err
			}
			pp = *pp.Message
		}
		miss, err := ws.matchPart(log, &pp, headerToo, seen)
		if miss || err != nil || ws.isQuickHit(seen) {
			return miss, err
		}
	}
	return false, nil
}
// searchReader scans r for the search words, marking found words in seen.
// It returns miss=true as soon as a not-word is found (or on read error).
// The log parameter is currently unused.
func (ws WordSearch) searchReader(log mlog.Log, r io.Reader, seen map[int]bool) (miss bool, rerr error) {
	// We will be reading through the content, stopping as soon as we know an answer:
	// when all words have been seen and there are no "not words" (true), or one "not
	// word" has been seen (false). We use bytes.Contains to look for the words. We
	// advance our buffer in largish chunks, keeping the end of the buffer the size of
	// the largest word plus the max of an utf-8 character to account for words
	// spanning chunks.
	have := 0
	for {
		// Fill the rest of the buffer after any bytes carried over from the previous chunk.
		n, err := io.ReadFull(r, ws.searchBuf[have:])
		if n > 0 {
			have += n
		}
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return true, err
		}
		if err == nil {
			// Buffer filled completely, so more data follows. Save the original-case
			// tail now, before toLower below mutates searchBuf in place.
			copy(ws.keepBuf, ws.searchBuf[have-len(ws.keepBuf):])
		}
		// toLower lower-cases in place (possibly returning a new slice if the
		// lower-case form grew); match against the lowered bytes.
		lower := toLower(ws.searchBuf[:have])
		for i, w := range ws.words {
			if !seen[i] && bytes.Contains(lower, w) {
				seen[i] = true
				// All words seen and no not-words to rule out: definite match.
				if len(seen) == len(ws.words) && len(ws.notWords) == 0 {
					return false, nil
				}
			}
		}
		for _, w := range ws.notWords {
			if bytes.Contains(lower, w) {
				return true, nil
			}
		}
		if err != nil {
			// Must be EOF or UnexpectedEOF now.
			break
		}
		// Start the next chunk with the saved tail so words spanning chunk
		// boundaries are still found.
		copy(ws.searchBuf, ws.keepBuf)
		have = len(ws.keepBuf)
	}
	return false, nil
}
// in-place lower-casing, only allocating a new slice when lower-case would become
// larger. we replace RuneError (0xfffd) by byte value 0, because it would often
// increase size, but we assume no one wants to match it.
// toLower lower-cases buf in place, only allocating a new slice when the
// lower-case form would become larger than the input processed so far. We
// replace RuneError (0xfffd) by byte value 0, because it would often increase
// size, but we assume no one wants to match it.
func toLower(buf []byte) []byte {
	r := buf[:0] // Result shares buf's storage until a copy is forced below.
	copied := false
	for i := 0; i < len(buf); {
		// Fast path for ASCII: lower-casing never changes the byte count.
		if buf[i] < 0x80 {
			b := buf[i]
			if b >= 'A' && b <= 'Z' {
				b += 0x20
			}
			r = append(r, b)
			i++
			continue
		}
		c, size := utf8.DecodeRune(buf[i:])
		i += size
		nc := unicode.ToLower(c)
		if nc < 0 {
			continue
		}
		if c == utf8.RuneError {
			r = append(r, 0)
			continue
		}
		nsize := utf8.RuneLen(nc)
		// Take care not to overwrite the part of the buffer we still have to process:
		// if appending nc would reach into buf[i:], switch to a fresh slice.
		if !copied && len(r)+nsize > i {
			// eg Ⱥ 0x23a (2 bytes) to ⱥ 0x2c65 (3 bytes)
			copied = true
			nr := make([]byte, len(r), len(r)+nsize+len(buf)-i)
			copy(nr, r)
			// Bug fix: keep appending to the copy. The original assigned "nr = r",
			// discarding the copy and clobbering unprocessed input bytes.
			r = nr
		}
		r = utf8.AppendRune(r, nc)
	}
	return r
}