mirror of
https://github.com/mjl-/mox.git
synced 2025-01-01 03:13:48 +03:00
01adad62b2
message.Part now has a ReaderUTF8OrBinary() along with the existing Reader(). The new function returns a reader of decoded content. We now use it in a few places, including search. We only support the charsets in golang.org/x/text/encoding/ianaindex. Search has also been changed to not read the entire message into memory. Instead, we make one 8k buffer for reading and search in that, and we keep the buffer around for all messages. This saves quite a few allocations when searching large mailboxes.
335 lines
9.5 KiB
Go
335 lines
9.5 KiB
Go
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
|
|
|
// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
|
|
package encoding // import "golang.org/x/text/encoding"
|
|
|
|
import (
|
|
"errors"
|
|
"io"
|
|
"strconv"
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/text/encoding/internal/identifier"
|
|
"golang.org/x/text/transform"
|
|
)
|
|
|
|
// TODO:
// - There seems to be some inconsistency in when decoders return errors
//   and when not. Also documentation seems to suggest they shouldn't return
//   errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
//   normal form. Perhaps add an example of how users could prepare their
//   output.
|
|
|
|
// Encoding is a character set encoding that can be transformed to and from
|
|
// UTF-8.
|
|
type Encoding interface {
|
|
// NewDecoder returns a Decoder.
|
|
NewDecoder() *Decoder
|
|
|
|
// NewEncoder returns an Encoder.
|
|
NewEncoder() *Encoder
|
|
}
|
|
|
|
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
|
|
//
|
|
// Transforming source bytes that are not of that encoding will not result in an
|
|
// error per se. Each byte that cannot be transcoded will be represented in the
|
|
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
|
|
type Decoder struct {
|
|
transform.Transformer
|
|
|
|
// This forces external creators of Decoders to use names in struct
|
|
// initializers, allowing for future extendibility without having to break
|
|
// code.
|
|
_ struct{}
|
|
}
|
|
|
|
// Bytes converts the given encoded bytes to UTF-8. It returns the converted
|
|
// bytes or nil, err if any error occurred.
|
|
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
|
|
b, _, err := transform.Bytes(d, b)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return b, nil
|
|
}
|
|
|
|
// String converts the given encoded string to UTF-8. It returns the converted
|
|
// string or "", err if any error occurred.
|
|
func (d *Decoder) String(s string) (string, error) {
|
|
s, _, err := transform.String(d, s)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return s, nil
|
|
}
|
|
|
|
// Reader wraps another Reader to decode its bytes.
|
|
//
|
|
// The Decoder may not be used for any other operation as long as the returned
|
|
// Reader is in use.
|
|
func (d *Decoder) Reader(r io.Reader) io.Reader {
|
|
return transform.NewReader(r, d)
|
|
}
|
|
|
|
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
|
|
//
|
|
// Each rune that cannot be transcoded will result in an error. In this case,
|
|
// the transform will consume all source byte up to, not including the offending
|
|
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
|
|
// `\uFFFD`. To return early with an error instead, use transform.Chain to
|
|
// preprocess the data with a UTF8Validator.
|
|
type Encoder struct {
|
|
transform.Transformer
|
|
|
|
// This forces external creators of Encoders to use names in struct
|
|
// initializers, allowing for future extendibility without having to break
|
|
// code.
|
|
_ struct{}
|
|
}
|
|
|
|
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
|
|
// any error occurred.
|
|
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
|
|
b, _, err := transform.Bytes(e, b)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return b, nil
|
|
}
|
|
|
|
// String converts a string from UTF-8. It returns the converted string or
|
|
// "", err if any error occurred.
|
|
func (e *Encoder) String(s string) (string, error) {
|
|
s, _, err := transform.String(e, s)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return s, nil
|
|
}
|
|
|
|
// Writer wraps another Writer to encode its UTF-8 output.
|
|
//
|
|
// The Encoder may not be used for any other operation as long as the returned
|
|
// Writer is in use.
|
|
func (e *Encoder) Writer(w io.Writer) io.Writer {
|
|
return transform.NewWriter(w, e)
|
|
}
|
|
|
|
// ASCIISub is the ASCII substitute character (0x1A), as recommended by
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
|
|
|
|
// Nop is the nop encoding. Its transformed bytes are the same as the source
|
|
// bytes; it does not replace invalid UTF-8 sequences.
|
|
var Nop Encoding = nop{}
|
|
|
|
type nop struct{}
|
|
|
|
func (nop) NewDecoder() *Decoder {
|
|
return &Decoder{Transformer: transform.Nop}
|
|
}
|
|
func (nop) NewEncoder() *Encoder {
|
|
return &Encoder{Transformer: transform.Nop}
|
|
}
|
|
|
|
// Replacement is the replacement encoding. Decoding from the replacement
|
|
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
|
|
// the replacement encoding yields the same as the source bytes except that
|
|
// invalid UTF-8 is converted to '\uFFFD'.
|
|
//
|
|
// It is defined at http://encoding.spec.whatwg.org/#replacement
|
|
var Replacement Encoding = replacement{}
|
|
|
|
type replacement struct{}
|
|
|
|
func (replacement) NewDecoder() *Decoder {
|
|
return &Decoder{Transformer: replacementDecoder{}}
|
|
}
|
|
|
|
func (replacement) NewEncoder() *Encoder {
|
|
return &Encoder{Transformer: replacementEncoder{}}
|
|
}
|
|
|
|
func (replacement) ID() (mib identifier.MIB, other string) {
|
|
return identifier.Replacement, ""
|
|
}
|
|
|
|
type replacementDecoder struct{ transform.NopResetter }
|
|
|
|
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
if len(dst) < 3 {
|
|
return 0, 0, transform.ErrShortDst
|
|
}
|
|
if atEOF {
|
|
const fffd = "\ufffd"
|
|
dst[0] = fffd[0]
|
|
dst[1] = fffd[1]
|
|
dst[2] = fffd[2]
|
|
nDst = 3
|
|
}
|
|
return nDst, len(src), nil
|
|
}
|
|
|
|
type replacementEncoder struct{ transform.NopResetter }
|
|
|
|
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
r, size := rune(0), 0
|
|
|
|
for ; nSrc < len(src); nSrc += size {
|
|
r = rune(src[nSrc])
|
|
|
|
// Decode a 1-byte rune.
|
|
if r < utf8.RuneSelf {
|
|
size = 1
|
|
|
|
} else {
|
|
// Decode a multi-byte rune.
|
|
r, size = utf8.DecodeRune(src[nSrc:])
|
|
if size == 1 {
|
|
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
|
// handled above. We have invalid UTF-8 or we haven't seen the
|
|
// full character yet.
|
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
|
err = transform.ErrShortSrc
|
|
break
|
|
}
|
|
r = '\ufffd'
|
|
}
|
|
}
|
|
|
|
if nDst+utf8.RuneLen(r) > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
nDst += utf8.EncodeRune(dst[nDst:], r)
|
|
}
|
|
return nDst, nSrc, err
|
|
}
|
|
|
|
// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
|
|
// repertoire of the destination encoding with HTML escape sequences.
|
|
//
|
|
// This wrapper exists to comply to URL and HTML forms requiring a
|
|
// non-terminating legacy encoder. The produced sequences may lead to data
|
|
// loss as they are indistinguishable from legitimate input. To avoid this
|
|
// issue, use UTF-8 encodings whenever possible.
|
|
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
|
|
return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
|
|
}
|
|
|
|
// ReplaceUnsupported wraps encoders to replace source runes outside the
|
|
// repertoire of the destination encoding with an encoding-specific
|
|
// replacement.
|
|
//
|
|
// This wrapper is only provided for backwards compatibility and legacy
|
|
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
|
|
func ReplaceUnsupported(e *Encoder) *Encoder {
|
|
return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
|
|
}
|
|
|
|
type errorHandler struct {
|
|
*Encoder
|
|
handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
|
|
}
|
|
|
|
// repertoireError is the error an encoder returns for a rune that lies outside
// the repertoire of its target encoding. Replacement returns the byte to
// substitute for such a rune.
//
// TODO: consider making this error public in some form.
type repertoireError interface {
	Replacement() byte
}
|
|
|
|
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
|
|
for err != nil {
|
|
rerr, ok := err.(repertoireError)
|
|
if !ok {
|
|
return nDst, nSrc, err
|
|
}
|
|
r, sz := utf8.DecodeRune(src[nSrc:])
|
|
n, ok := h.handler(dst[nDst:], r, rerr)
|
|
if !ok {
|
|
return nDst, nSrc, transform.ErrShortDst
|
|
}
|
|
err = nil
|
|
nDst += n
|
|
if nSrc += sz; nSrc < len(src) {
|
|
var dn, sn int
|
|
dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
|
|
nDst += dn
|
|
nSrc += sn
|
|
}
|
|
}
|
|
return nDst, nSrc, err
|
|
}
|
|
|
|
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
|
|
buf := [8]byte{}
|
|
b := strconv.AppendUint(buf[:0], uint64(r), 10)
|
|
if n = len(b) + len("&#;"); n >= len(dst) {
|
|
return 0, false
|
|
}
|
|
dst[0] = '&'
|
|
dst[1] = '#'
|
|
dst[copy(dst[2:], b)+2] = ';'
|
|
return n, true
|
|
}
|
|
|
|
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
|
|
if len(dst) == 0 {
|
|
return 0, false
|
|
}
|
|
dst[0] = err.Replacement()
|
|
return 1, true
|
|
}
|
|
|
|
// ErrInvalidUTF8 is returned by a transformer that encountered invalid UTF-8
// in its input.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
|
|
|
|
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
|
|
// input byte that is not valid UTF-8.
|
|
var UTF8Validator transform.Transformer = utf8Validator{}
|
|
|
|
type utf8Validator struct{ transform.NopResetter }
|
|
|
|
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
n := len(src)
|
|
if n > len(dst) {
|
|
n = len(dst)
|
|
}
|
|
for i := 0; i < n; {
|
|
if c := src[i]; c < utf8.RuneSelf {
|
|
dst[i] = c
|
|
i++
|
|
continue
|
|
}
|
|
_, size := utf8.DecodeRune(src[i:])
|
|
if size == 1 {
|
|
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
|
// handled above. We have invalid UTF-8 or we haven't seen the
|
|
// full character yet.
|
|
err = ErrInvalidUTF8
|
|
if !atEOF && !utf8.FullRune(src[i:]) {
|
|
err = transform.ErrShortSrc
|
|
}
|
|
return i, i, err
|
|
}
|
|
if i+size > len(dst) {
|
|
return i, i, transform.ErrShortDst
|
|
}
|
|
for ; size > 0; size-- {
|
|
dst[i] = src[i]
|
|
i++
|
|
}
|
|
}
|
|
if len(src) > len(dst) {
|
|
err = transform.ErrShortDst
|
|
}
|
|
return n, n, err
|
|
}
|