package store

import (
	"bytes"
	"io"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/mjl-/mox/message"
	"github.com/mjl-/mox/mlog"
)

// WordSearch holds context for a search, with scratch buffers to prevent
// allocations for each message.
type WordSearch struct {
	words, notWords    [][]byte
	searchBuf, keepBuf []byte
}

// PrepareWordSearch returns a search context that can be used to match multiple
// messages (after each other, not concurrently).
func PrepareWordSearch(words, notWords []string) WordSearch {
	var wl, nwl [][]byte
	for _, w := range words {
		wl = append(wl, []byte(strings.ToLower(w)))
	}
	for _, w := range notWords {
		nwl = append(nwl, []byte(strings.ToLower(w)))
	}

	keep := 0
	for _, w := range words {
		if len(w) > keep {
			keep = len(w)
		}
	}
	for _, w := range notWords {
		if len(w) > keep {
			keep = len(w)
		}
	}
	keep += 6 // Max utf-8 character size.
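
	// The chunk is presumably sized to be at least 8 times the keep window, so
	// the overlap re-scanned between chunks stays a small fraction of each read.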
	bufSize := 8 * 1024
	for bufSize/keep < 8 {
		bufSize *= 2
	}

	keepBuf := make([]byte, keep)
	searchBuf := make([]byte, bufSize)

	return WordSearch{wl, nwl, searchBuf, keepBuf}
}

// MatchPart returns whether the part/mail message p matches the search.
// The search terms are matched against content-transfer-decoded and
// charset-decoded bodies and optionally headers.
// HTML parts are currently treated as regular text, without parsing HTML.
func (ws WordSearch) MatchPart(log *mlog.Log, p *message.Part, headerToo bool) (bool, error) {
	seen := map[int]bool{}
	miss, err := ws.matchPart(log, p, headerToo, seen)
	match := err == nil && !miss && len(seen) == len(ws.words)
	return match, err
}
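
// Typical use (a sketch; log and part stand for an existing *mlog.Log and a
// parsed *message.Part, neither of which is defined in this file):
//
//	ws := PrepareWordSearch([]string{"invoice", "overdue"}, []string{"newsletter"})
//	match, err := ws.MatchPart(log, part, true)
//	if err != nil {
//		// handle the read/parse error
//	}
//	if match {
//		// all search words were found and no not-words matched
//	}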

// If all words are seen, and there are no not-words that force us to search
// till the end, we know we have a match.
func (ws WordSearch) isQuickHit(seen map[int]bool) bool {
	return len(seen) == len(ws.words) && len(ws.notWords) == 0
}

// search a part as text and/or its subparts, recursively. Once we know we have
// a miss, we stop (either due to not-word match or error). In case of
// non-miss, the caller checks if there was a hit.
func (ws WordSearch) matchPart(log *mlog.Log, p *message.Part, headerToo bool, seen map[int]bool) (miss bool, rerr error) {
	if headerToo {
		miss, err := ws.searchReader(log, p.HeaderReader(), seen)
		if miss || err != nil || ws.isQuickHit(seen) {
			return miss, err
		}
	}

	if len(p.Parts) == 0 {
		if p.MediaType != "TEXT" {
			// todo: for other types we could try to find a library for parsing and search in there too.
			return false, nil
		}
		tp := p.ReaderUTF8OrBinary()
		// todo: for html and perhaps other types, we could try to parse as text and filter on the text.
		miss, err := ws.searchReader(log, tp, seen)
		if miss || err != nil || ws.isQuickHit(seen) {
			return miss, err
		}
	}
	for _, pp := range p.Parts {
		if pp.Message != nil {
			if err := pp.SetMessageReaderAt(); err != nil {
				return false, err
			}
			pp = *pp.Message
		}
		miss, err := ws.matchPart(log, &pp, headerToo, seen)
		if miss || err != nil || ws.isQuickHit(seen) {
			return miss, err
		}
	}
	return false, nil
}

func (ws WordSearch) searchReader(log *mlog.Log, r io.Reader, seen map[int]bool) (miss bool, rerr error) {
	// We will be reading through the content, stopping as soon as we know an answer:
	// when all words have been seen and there are no "not words" (true), or one "not
	// word" has been seen (false). We use bytes.Contains to look for the words. We
	// advance our buffer in largish chunks, keeping the end of the buffer the size of
	// the largest word plus the maximum size of a utf-8 character, to account for
	// words spanning chunks.
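	// For example, if the keep buffer is 10 bytes, the final 10 bytes of each
	// completely filled chunk are carried over to the front of the next chunk,
	// so a word straddling the chunk boundary is still found.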

	have := 0
	for {
		n, err := io.ReadFull(r, ws.searchBuf[have:])
		if n > 0 {
			have += n
		}
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return true, err
		}
		if err == nil {
			copy(ws.keepBuf, ws.searchBuf[have-len(ws.keepBuf):])
		}

		lower := toLower(ws.searchBuf[:have])

		for i, w := range ws.words {
			if !seen[i] && bytes.Contains(lower, w) {
				seen[i] = true
				if len(seen) == len(ws.words) && len(ws.notWords) == 0 {
					return false, nil
				}
			}
		}
		for _, w := range ws.notWords {
			if bytes.Contains(lower, w) {
				return true, nil
			}
		}
		if err != nil {
			// Must be EOF or UnexpectedEOF now.
			break
		}
		copy(ws.searchBuf, ws.keepBuf)
		have = len(ws.keepBuf)
	}
	return false, nil
}

// in-place lower-casing, only allocating a new slice when lower-case would become
// larger. we replace RuneError (0xfffd) by byte value 0, because it would often
// increase size, but we assume no one wants to match it.
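// For example, "Hello" lower-cases to "hello" within the same buffer, while
// "Ⱥ" (2 bytes) lower-cases to "ⱥ" (3 bytes) and forces a copy.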
func toLower(buf []byte) []byte {
	r := buf[:0]
	copied := false
	for i := 0; i < len(buf); {
		if buf[i] < 0x80 {
			b := buf[i]
			if b >= 'A' && b <= 'Z' {
				b += 0x20
			}
			r = append(r, b)
			i++
			continue
		}
		c, size := utf8.DecodeRune(buf[i:])
		i += size
		nc := unicode.ToLower(c)
		if nc < 0 {
			continue
		}
		if c == utf8.RuneError {
			r = append(r, 0)
			continue
		}
		nsize := utf8.RuneLen(nc)
		// Take care not to overwrite the part of the buffer we still have to process.
		if !copied && len(r)+nsize > i {
			// eg Ⱥ 0x23a (2 bytes) to ⱥ 0x2c65 (3 bytes)
			copied = true
			nr := make([]byte, len(r), len(r)+nsize+len(buf)-i)
			copy(nr, r)
			r = nr
		}
		r = utf8.AppendRune(r, nc)
	}
	return r
}