package store

import (
	"bytes"
	"io"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/mjl-/mox/message"
	"github.com/mjl-/mox/mlog"
)

// WordSearch holds context for a search, with scratch buffers to prevent
// allocations for each message.
type WordSearch struct {
	// Lower-cased terms that must all be present (words) and terms of which none
	// may be present (notWords).
	words, notWords [][]byte
	// searchBuf is the buffer content is read into, chunk by chunk. keepBuf holds
	// the tail of the previous chunk so terms spanning two chunks are still found.
	searchBuf, keepBuf []byte
}

// PrepareWordSearch returns a search context that can be used to match multiple
// messages (after each other, not concurrently).
//
// Both words and notWords are lower-cased once here; message content is
// lower-cased while searching, making the match case-insensitive.
func PrepareWordSearch(words, notWords []string) WordSearch {
	// keep becomes the number of bytes of a chunk that is carried over to the next
	// chunk. It must cover the longest term we search for.
	keep := 0

	// lowerAll lower-cases each term and tracks the longest term seen in keep.
	lowerAll := func(l []string) [][]byte {
		var r [][]byte
		for _, w := range l {
			lw := strings.ToLower(w)
			// The lower-cased form is what we search for, and it can be longer than the
			// original (e.g. Ⱥ is 2 bytes, ⱥ is 3 bytes). Size on the larger of the two so
			// the overlap is never smaller than before.
			if len(w) > keep {
				keep = len(w)
			}
			if len(lw) > keep {
				keep = len(lw)
			}
			r = append(r, []byte(lw))
		}
		return r
	}
	wl := lowerAll(words)
	nwl := lowerAll(notWords)

	// Slack for a character cut in half at the chunk boundary. 6 is the historic
	// maximum length of a utf-8 sequence, Go's utf8.UTFMax is 4.
	keep += 6

	// Make the search buffer at least 8 times the size of the overlap, so the
	// bulk of each chunk is new data.
	bufSize := 8 * 1024
	for bufSize/keep < 8 {
		bufSize *= 2
	}

	keepBuf := make([]byte, keep)
	searchBuf := make([]byte, bufSize)

	return WordSearch{wl, nwl, searchBuf, keepBuf}
}

// MatchPart returns whether the part/mail message p matches the search.
// The search terms are matched against content-transfer-decoded and
// charset-decoded bodies and optionally headers.
// HTML parts are currently treated as regular text, without parsing HTML.
func (ws WordSearch) MatchPart(log *mlog.Log, p *message.Part, headerToo bool) (bool, error) {
	// seen records the indices into ws.words that have been found so far.
	seen := map[int]bool{}
	miss, err := ws.matchPart(log, p, headerToo, seen)
	// A match requires no error, no not-word hit, and every word seen.
	match := err == nil && !miss && len(seen) == len(ws.words)
	return match, err
}

// If all words are seen, and there are no not-words that force us to search
// till the end, we know we have a match.
func (ws WordSearch) isQuickHit(seen map[int]bool) bool {
	return len(seen) == len(ws.words) && len(ws.notWords) == 0
}

// search a part as text and/or its subparts, recursively. Once we know we have
// a miss, we stop (either due to not-word match or error). In case of
// non-miss, the caller checks if there was a hit.
func (ws WordSearch) matchPart(log *mlog.Log, p *message.Part, headerToo bool, seen map[int]bool) (miss bool, rerr error) { if headerToo { miss, err := ws.searchReader(log, p.HeaderReader(), seen) if miss || err != nil || ws.isQuickHit(seen) { return miss, err } } if len(p.Parts) == 0 { if p.MediaType != "TEXT" { // todo: for other types we could try to find a library for parsing and search in there too. return false, nil } tp := p.ReaderUTF8OrBinary() // todo: for html and perhaps other types, we could try to parse as text and filter on the text. miss, err := ws.searchReader(log, tp, seen) if miss || err != nil || ws.isQuickHit(seen) { return miss, err } } for _, pp := range p.Parts { if pp.Message != nil { if err := pp.SetMessageReaderAt(); err != nil { return false, err } pp = *pp.Message } miss, err := ws.matchPart(log, &pp, headerToo, seen) if miss || err != nil || ws.isQuickHit(seen) { return miss, err } } return false, nil } func (ws WordSearch) searchReader(log *mlog.Log, r io.Reader, seen map[int]bool) (miss bool, rerr error) { // We will be reading through the content, stopping as soon as we known an answer: // when all words have been seen and there are no "not words" (true), or one "not // word" has been seen (false). We use bytes.Contains to look for the words. We // advance our buffer in largish chunks, keeping the end of the buffer the size of // the largest word plus the max of an utf-8 character to account for words // spanning chunks. 
have := 0 for { n, err := io.ReadFull(r, ws.searchBuf[have:]) if n > 0 { have += n } if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { return true, err } if err == nil { copy(ws.keepBuf, ws.searchBuf[have-len(ws.keepBuf):]) } lower := toLower(ws.searchBuf[:have]) for i, w := range ws.words { if !seen[i] && bytes.Contains(lower, w) { seen[i] = true if len(seen) == len(ws.words) && len(ws.notWords) == 0 { return false, nil } } } for _, w := range ws.notWords { if bytes.Contains(lower, w) { return true, nil } } if err != nil { // Must be EOF or UnexpectedEOF now. break } copy(ws.searchBuf, ws.keepBuf) have = len(ws.keepBuf) } return false, nil } // in-place lower-casing, only allocating a new slice when lower-case would become // larger. we replace RuneError (0xfffd) by byte value 0, because it would often // increase size, but we assume no one wants to match it. func toLower(buf []byte) []byte { r := buf[:0] copied := false for i := 0; i < len(buf); { if buf[i] < 0x80 { b := buf[i] if b >= 'A' && b <= 'Z' { b += 0x20 } r = append(r, b) i++ continue } c, size := utf8.DecodeRune(buf[i:]) i += size nc := unicode.ToLower(c) if nc < 0 { continue } if c == utf8.RuneError { r = append(r, 0) continue } nsize := utf8.RuneLen(nc) // Take care not to overwrite the part of the buffer we still have to process. if !copied && len(r)+nsize > i { // eg Ⱥ 0x23a (2 bytes) to ⱥ 0x2c65 (3 bytes) copied = true nr := make([]byte, len(r), len(r)+nsize+len(buf)-i) copy(nr, r) nr = r } r = utf8.AppendRune(r, nc) } return r }