use Go's mail.ReadMessage instead of textproto.ReadMIMEHeader and decode RFC 2047 charsets in subject header when parsing message

as the recent Go patch release showed, textproto.ReadMIMEHeader parses headers
strictly, as http headers. that is too strict for email message headers: valid
headers, e.g. with a slash in them, were rejected by textproto.ReadMIMEHeader.

the functions in Go's mail package handle RFC 2047 charset-encoded words in
address headers. they can do that because we tell them those headers are
addresses, where such encodings are valid. but that encoding isn't valid in all
places in all headers. for other cases, we must decode explicitly, such as for
the subject header.

with this change, some messages that could not be parsed before can now be
parsed (where headers were previously rejected for being invalid). and the
subject of parsed messages can now be properly decoded. you could run "mox
ensureparsed -all <account>" (while mox isn't running) to force reparsing all
messages. mox needs a subcommand to reparse while running...

it wasn't much of a problem before, because imap email clients typically do
their own parsing (of headers, including subject decoding) again.  but with the
upcoming webmail client, any wrong parsing quickly reveals itself.
This commit is contained in:
Mechiel Lukkien 2023-08-01 09:50:26 +02:00
parent 3ef1f31359
commit 19550cc041
No known key found for this signature in database

View file

@ -25,6 +25,8 @@ import (
"strings" "strings"
"time" "time"
"golang.org/x/text/encoding/ianaindex"
"github.com/mjl-/mox/mlog" "github.com/mjl-/mox/mlog"
"github.com/mjl-/mox/moxio" "github.com/mjl-/mox/moxio"
"github.com/mjl-/mox/moxvar" "github.com/mjl-/mox/moxvar"
@ -352,7 +354,32 @@ func (p *Part) HeaderReader() io.Reader {
} }
// parseHeader reads and parses the header section of an email message from r
// and returns its fields as a textproto.MIMEHeader.
//
// Parsing is done with mail.ReadMessage rather than
// textproto.Reader.ReadMIMEHeader: the latter applies the strict validation
// meant for HTTP headers and rejects header lines that are valid in email
// messages, e.g. those with a slash in them.
//
// On error the returned header is nil and the error from mail.ReadMessage is
// returned unchanged.
func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {
	msg, err := mail.ReadMessage(bufio.NewReader(r))
	if err != nil {
		return nil, err
	}
	// mail.Header and textproto.MIMEHeader are both map[string][]string, so a
	// plain type conversion is enough; no copying takes place.
	return textproto.MIMEHeader(msg.Header), nil
}
var wordDecoder = mime.WordDecoder{
CharsetReader: func(charset string, r io.Reader) (io.Reader, error) {
switch strings.ToLower(charset) {
case "", "us-ascii", "utf-8":
return r, nil
}
enc, _ := ianaindex.MIME.Encoding(charset)
if enc == nil {
enc, _ = ianaindex.IANA.Encoding(charset)
}
if enc == nil {
return r, fmt.Errorf("unknown charset %q", charset)
}
return enc.NewDecoder().Reader(r), nil
},
} }
func parseEnvelope(h mail.Header) (*Envelope, error) { func parseEnvelope(h mail.Header) (*Envelope, error) {
@ -369,9 +396,14 @@ func parseEnvelope(h mail.Header) (*Envelope, error) {
date = time.Unix(date.Unix(), 0).UTC() date = time.Unix(date.Unix(), 0).UTC()
} }
subject := h.Get("Subject")
if s, err := wordDecoder.DecodeHeader(subject); err == nil {
subject = s
}
env := &Envelope{ env := &Envelope{
date, date,
h.Get("Subject"), subject,
parseAddressList(h, "from"), parseAddressList(h, "from"),
parseAddressList(h, "sender"), parseAddressList(h, "sender"),
parseAddressList(h, "reply-to"), parseAddressList(h, "reply-to"),