From 21ed331c7d3bf1de9fc50dca72aadd1c5b6addd5 Mon Sep 17 00:00:00 2001
From: mizho
Date: Sat, 19 Oct 2024 19:09:09 +0900
Subject: [PATCH] Match Multipart mail

---
 message/part.go      |  5 +++++
 store/search.go      | 21 +++++++++++++++---
 store/search_test.go | 53 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/message/part.go b/message/part.go
index 4d158d7..c604fd8 100644
--- a/message/part.go
+++ b/message/part.go
@@ -245,6 +245,11 @@ func (p *Part) String() string {
 	return fmt.Sprintf("&Part{%s/%s offsets %d/%d/%d/%d lines %d decodedsize %d next %d last %d bound %q parts %v}", p.MediaType, p.MediaSubType, p.BoundaryOffset, p.HeaderOffset, p.BodyOffset, p.EndOffset, p.RawLineCount, p.DecodedSize, p.nextBoundOffset, p.lastBoundOffset, p.bound, p.Parts)
 }
 
+// GetBound returns the part's bound field converted to a string.
+func (p *Part) GetBound() string {
+	return string(p.bound)
+}
+
 // newPart parses a new part, which can be the top-level message.
 // offset is the bound offset for parts, and the start of message for top-level messages. parent indicates if this is a top-level message or sub-part.
 // If an error occurs, p's exported values can still be relevant. EnsurePart uses these values.
diff --git a/store/search.go b/store/search.go
index 1e2354f..66dd5c0 100644
--- a/store/search.go
+++ b/store/search.go
@@ -91,11 +91,26 @@ func (ws WordSearch) matchPart(log mlog.Log, p *message.Part, headerToo bool, se
 	}
 
 	if len(p.Parts) == 0 {
+		var tp io.Reader
 		if p.MediaType != "TEXT" {
-			// todo: for other types we could try to find a library for parsing and search in there too.
-			return false, nil
+			if p.MediaType == "MULTIPART" {
+				// Decode the raw multipart body into one reader. NOTE(review): decodeMultiPart is not added by this patch; confirm it exists in package store.
+				// todo: avoid loading all content into memory
+				content, err := io.ReadAll(p.RawReader())
+				if err != nil {
+					return false, err
+				}
+				tp, err = decodeMultiPart(string(content), p.GetBound())
+				if err != nil {
+					return false, err
+				}
+			} else {
+				// todo: for other types we could try to find a library for parsing and search in there too.
+				return false, nil
+			}
+		} else {
+			tp = p.ReaderUTF8OrBinary()
 		}
-		tp := p.ReaderUTF8OrBinary()
 		// todo: for html and perhaps other types, we could try to parse as text and filter on the text.
 		miss, err := ws.searchReader(log, tp, seen)
 		if miss || err != nil || ws.isQuickHit(seen) {
diff --git a/store/search_test.go b/store/search_test.go
index c573e4c..d4389cd 100644
--- a/store/search_test.go
+++ b/store/search_test.go
@@ -2,7 +2,14 @@ package store
 
 import (
 	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"strings"
 	"testing"
+
+	"github.com/mjl-/mox/message"
+	"github.com/mjl-/mox/mlog"
 )
 
 func TestSubjectMatch(t *testing.T) {
@@ -36,3 +43,49 @@ func TestSubjectMatch(t *testing.T) {
 		}
 	}
 }
+
+// TestMultipartMailDecode parses a raw multipart message read from disk and
+// checks that a word search for the term stored in the word file matches it.
+func TestMultipartMailDecode(t *testing.T) {
+	log := mlog.New("search", nil)
+
+	// Load raw mail file. NOTE(review): these paths resolve outside the repository; confirm the fixtures exist where tests run.
+	filePath := "../../data/mail_raw.txt" // multipart mail raw data
+	wordFilePath := "../../data/word.txt"
+
+	msgFile, err := os.Open(filePath)
+	if err != nil {
+		t.Fatalf("Failed to open file: %v", err)
+	}
+	defer msgFile.Close()
+
+	// Load the search word; surrounding whitespace is trimmed below.
+	wordFile, err := os.Open(wordFilePath)
+	if err != nil {
+		t.Fatalf("Failed to open file: %v", err)
+	}
+	defer wordFile.Close()
+	tmp, err := io.ReadAll(wordFile)
+	if err != nil {
+		t.Fatalf("Failed to load search word: %v", err)
+	}
+	searchWord := strings.TrimSpace(string(tmp))
+
+	// Parse mail
+	mr := FileMsgReader([]byte{}, msgFile)
+	p, err := message.Parse(log.Logger, false, mr)
+	if err != nil {
+		t.Fatalf("parsing message: %v", err)
+	}
+
+	// Match. A search error is reported separately from a plain non-match.
+	ws := PrepareWordSearch([]string{searchWord}, []string{})
+	ok, err := ws.MatchPart(log, &p, true)
+	if err != nil {
+		t.Fatalf("Match error: %v", err)
+	}
+	if !ok {
+		t.Fatalf("Match failed %s", ws.words)
+	}
+	log.Debug("Check match", slog.String("word", searchWord), slog.Bool("ok", ok))
+}