From 19550cc041aae9a0cc32041ff73059cc477cc490 Mon Sep 17 00:00:00 2001 From: Mechiel Lukkien Date: Tue, 1 Aug 2023 09:50:26 +0200 Subject: [PATCH] use Go's mail.ReadMessage instead of textproto.ReadMIMEHeader and decode RFC 2047 charsets in subject header when parsing message as the recent Go patch release showed, textproto.ReadMIMEHeader parses headers strictly, as http headers. that is too strict for email message headers: valid headers, e.g. with a slash in them, were rejected by textproto.ReadMIMEHeader. the functions in Go's mail package handle RFC 2047 charset-encoded words in address headers. it can do that because we tell it those headers are addresses, where such encodings are valid. but that encoding isn't valid in all places in all headers. for other cases, we must decode explicitly, such as for the subject header. with this change, some messages that could not be parsed before can now be parsed (where headers were previously rejected for being invalid). and the subject of parsed messages can now be properly decoded. you could run "mox ensureparsed -all " (while mox isn't running) to force reparsing all messages. mox needs a subcommand to reparse while running... it wasn't much of a problem before, because imap email clients typically do their own parsing (of headers, including subject decoding) again. but with the upcoming webmail client, any wrong parsing quickly reveals itself. 
--- message/part.go | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/message/part.go b/message/part.go index 974d124..5ded570 100644 --- a/message/part.go +++ b/message/part.go @@ -25,6 +25,8 @@ import ( "strings" "time" + "golang.org/x/text/encoding/ianaindex" + "github.com/mjl-/mox/mlog" "github.com/mjl-/mox/moxio" "github.com/mjl-/mox/moxvar" @@ -352,7 +354,32 @@ func (p *Part) HeaderReader() io.Reader { } func parseHeader(r io.Reader) (textproto.MIMEHeader, error) { - return textproto.NewReader(bufio.NewReader(r)).ReadMIMEHeader() + // We read using mail.ReadMessage instead of textproto.ReadMIMEHeaders because the + // first handles email messages properly, while the second only works for HTTP + // headers. + var zero textproto.MIMEHeader + msg, err := mail.ReadMessage(bufio.NewReader(r)) + if err != nil { + return zero, err + } + return textproto.MIMEHeader(msg.Header), nil +} + +var wordDecoder = mime.WordDecoder{ + CharsetReader: func(charset string, r io.Reader) (io.Reader, error) { + switch strings.ToLower(charset) { + case "", "us-ascii", "utf-8": + return r, nil + } + enc, _ := ianaindex.MIME.Encoding(charset) + if enc == nil { + enc, _ = ianaindex.IANA.Encoding(charset) + } + if enc == nil { + return r, fmt.Errorf("unknown charset %q", charset) + } + return enc.NewDecoder().Reader(r), nil + }, } func parseEnvelope(h mail.Header) (*Envelope, error) { @@ -369,9 +396,14 @@ func parseEnvelope(h mail.Header) (*Envelope, error) { date = time.Unix(date.Unix(), 0).UTC() } + subject := h.Get("Subject") + if s, err := wordDecoder.DecodeHeader(subject); err == nil { + subject = s + } + env := &Envelope{ date, - h.Get("Subject"), + subject, parseAddressList(h, "from"), parseAddressList(h, "sender"), parseAddressList(h, "reply-to"),