mirror of
https://github.com/mjl-/mox.git
synced 2025-01-23 21:45:48 +03:00
01adad62b2
message.Part now has a ReaderUTF8OrBinary() along with the existing Reader(). the new function returns a reader of decoded content. we now use it in a few places, including search. we only support the charsets in golang.org/x/text/encoding/ianaindex. search has also been changed to not read the entire message in memory. instead, we make one 8k buffer for reading and search in that, and we keep the buffer around for all messages. saves quite some allocations when searching large mailboxes.
249 lines
6.4 KiB
Go
249 lines
6.4 KiB
Go
// Copyright 2013 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
//go:generate go run maketables.go
|
|
|
|
// Package charmap provides simple character encodings such as IBM Code Page 437
|
|
// and Windows 1252.
|
|
package charmap // import "golang.org/x/text/encoding/charmap"
|
|
|
|
import (
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/text/encoding"
|
|
"golang.org/x/text/encoding/internal"
|
|
"golang.org/x/text/encoding/internal/identifier"
|
|
"golang.org/x/text/transform"
|
|
)
|
|
|
|
// These encodings vary only in the way clients should interpret them. Their
|
|
// coded character set is identical and a single implementation can be shared.
|
|
var (
|
|
// ISO8859_6E is the ISO 8859-6E encoding.
|
|
ISO8859_6E encoding.Encoding = &iso8859_6E
|
|
|
|
// ISO8859_6I is the ISO 8859-6I encoding.
|
|
ISO8859_6I encoding.Encoding = &iso8859_6I
|
|
|
|
// ISO8859_8E is the ISO 8859-8E encoding.
|
|
ISO8859_8E encoding.Encoding = &iso8859_8E
|
|
|
|
// ISO8859_8I is the ISO 8859-8I encoding.
|
|
ISO8859_8I encoding.Encoding = &iso8859_8I
|
|
|
|
iso8859_6E = internal.Encoding{
|
|
Encoding: ISO8859_6,
|
|
Name: "ISO-8859-6E",
|
|
MIB: identifier.ISO88596E,
|
|
}
|
|
|
|
iso8859_6I = internal.Encoding{
|
|
Encoding: ISO8859_6,
|
|
Name: "ISO-8859-6I",
|
|
MIB: identifier.ISO88596I,
|
|
}
|
|
|
|
iso8859_8E = internal.Encoding{
|
|
Encoding: ISO8859_8,
|
|
Name: "ISO-8859-8E",
|
|
MIB: identifier.ISO88598E,
|
|
}
|
|
|
|
iso8859_8I = internal.Encoding{
|
|
Encoding: ISO8859_8,
|
|
Name: "ISO-8859-8I",
|
|
MIB: identifier.ISO88598I,
|
|
}
|
|
)
|
|
|
|
// All is a list of all defined encodings in this package.
|
|
var All []encoding.Encoding = listAll
|
|
|
|
// TODO: implement these encodings, in order of importance.
|
|
// ASCII, ISO8859_1: Rather common. Close to Windows 1252.
|
|
// ISO8859_9: Close to Windows 1254.
|
|
|
|
// utf8Enc stores the UTF-8 encoding of a single rune. Only the first len
// bytes of data are meaningful; the array is sized for encodings of up to
// three bytes.
type utf8Enc struct {
	len  uint8   // number of valid bytes in data
	data [3]byte // UTF-8 bytes of the rune, left-aligned
}
|
|
|
|
// Charmap is an 8-bit character set encoding.
|
|
type Charmap struct {
|
|
// name is the encoding's name.
|
|
name string
|
|
// mib is the encoding type of this encoder.
|
|
mib identifier.MIB
|
|
// asciiSuperset states whether the encoding is a superset of ASCII.
|
|
asciiSuperset bool
|
|
// low is the lower bound of the encoded byte for a non-ASCII rune. If
|
|
// Charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
|
|
low uint8
|
|
// replacement is the encoded replacement character.
|
|
replacement byte
|
|
// decode is the map from encoded byte to UTF-8.
|
|
decode [256]utf8Enc
|
|
// encoding is the map from runes to encoded bytes. Each entry is a
|
|
// uint32: the high 8 bits are the encoded byte and the low 24 bits are
|
|
// the rune. The table entries are sorted by ascending rune.
|
|
encode [256]uint32
|
|
}
|
|
|
|
// NewDecoder implements the encoding.Encoding interface.
|
|
func (m *Charmap) NewDecoder() *encoding.Decoder {
|
|
return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}}
|
|
}
|
|
|
|
// NewEncoder implements the encoding.Encoding interface.
|
|
func (m *Charmap) NewEncoder() *encoding.Encoder {
|
|
return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}}
|
|
}
|
|
|
|
// String returns the Charmap's name.
|
|
func (m *Charmap) String() string {
|
|
return m.name
|
|
}
|
|
|
|
// ID implements an internal interface.
|
|
func (m *Charmap) ID() (mib identifier.MIB, other string) {
|
|
return m.mib, ""
|
|
}
|
|
|
|
// charmapDecoder implements transform.Transformer by decoding to UTF-8.
|
|
type charmapDecoder struct {
|
|
transform.NopResetter
|
|
charmap *Charmap
|
|
}
|
|
|
|
func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
for i, c := range src {
|
|
if m.charmap.asciiSuperset && c < utf8.RuneSelf {
|
|
if nDst >= len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst] = c
|
|
nDst++
|
|
nSrc = i + 1
|
|
continue
|
|
}
|
|
|
|
decode := &m.charmap.decode[c]
|
|
n := int(decode.len)
|
|
if nDst+n > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
// It's 15% faster to avoid calling copy for these tiny slices.
|
|
for j := 0; j < n; j++ {
|
|
dst[nDst] = decode.data[j]
|
|
nDst++
|
|
}
|
|
nSrc = i + 1
|
|
}
|
|
return nDst, nSrc, err
|
|
}
|
|
|
|
// DecodeByte returns the Charmap's rune decoding of the byte b.
|
|
func (m *Charmap) DecodeByte(b byte) rune {
|
|
switch x := &m.decode[b]; x.len {
|
|
case 1:
|
|
return rune(x.data[0])
|
|
case 2:
|
|
return rune(x.data[0]&0x1f)<<6 | rune(x.data[1]&0x3f)
|
|
default:
|
|
return rune(x.data[0]&0x0f)<<12 | rune(x.data[1]&0x3f)<<6 | rune(x.data[2]&0x3f)
|
|
}
|
|
}
|
|
|
|
// charmapEncoder implements transform.Transformer by encoding from UTF-8.
|
|
type charmapEncoder struct {
|
|
transform.NopResetter
|
|
charmap *Charmap
|
|
}
|
|
|
|
func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
r, size := rune(0), 0
|
|
loop:
|
|
for nSrc < len(src) {
|
|
if nDst >= len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
r = rune(src[nSrc])
|
|
|
|
// Decode a 1-byte rune.
|
|
if r < utf8.RuneSelf {
|
|
if m.charmap.asciiSuperset {
|
|
nSrc++
|
|
dst[nDst] = uint8(r)
|
|
nDst++
|
|
continue
|
|
}
|
|
size = 1
|
|
|
|
} else {
|
|
// Decode a multi-byte rune.
|
|
r, size = utf8.DecodeRune(src[nSrc:])
|
|
if size == 1 {
|
|
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
|
// handled above. We have invalid UTF-8 or we haven't seen the
|
|
// full character yet.
|
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
|
err = transform.ErrShortSrc
|
|
} else {
|
|
err = internal.RepertoireError(m.charmap.replacement)
|
|
}
|
|
break
|
|
}
|
|
}
|
|
|
|
// Binary search in [low, high) for that rune in the m.charmap.encode table.
|
|
for low, high := int(m.charmap.low), 0x100; ; {
|
|
if low >= high {
|
|
err = internal.RepertoireError(m.charmap.replacement)
|
|
break loop
|
|
}
|
|
mid := (low + high) / 2
|
|
got := m.charmap.encode[mid]
|
|
gotRune := rune(got & (1<<24 - 1))
|
|
if gotRune < r {
|
|
low = mid + 1
|
|
} else if gotRune > r {
|
|
high = mid
|
|
} else {
|
|
dst[nDst] = byte(got >> 24)
|
|
nDst++
|
|
break
|
|
}
|
|
}
|
|
nSrc += size
|
|
}
|
|
return nDst, nSrc, err
|
|
}
|
|
|
|
// EncodeRune returns the Charmap's byte encoding of the rune r. ok is whether
|
|
// r is in the Charmap's repertoire. If not, b is set to the Charmap's
|
|
// replacement byte. This is often the ASCII substitute character '\x1a'.
|
|
func (m *Charmap) EncodeRune(r rune) (b byte, ok bool) {
|
|
if r < utf8.RuneSelf && m.asciiSuperset {
|
|
return byte(r), true
|
|
}
|
|
for low, high := int(m.low), 0x100; ; {
|
|
if low >= high {
|
|
return m.replacement, false
|
|
}
|
|
mid := (low + high) / 2
|
|
got := m.encode[mid]
|
|
gotRune := rune(got & (1<<24 - 1))
|
|
if gotRune < r {
|
|
low = mid + 1
|
|
} else if gotRune > r {
|
|
high = mid
|
|
} else {
|
|
return byte(got >> 24), true
|
|
}
|
|
}
|
|
}
|