mirror of
https://github.com/mjl-/mox.git
synced 2025-01-23 13:35:48 +03:00
01adad62b2
message.Part now has a ReaderUTF8OrBinary() along with the existing Reader(). the new function returns a reader of decoded content. we now use it in a few places, including search. we only support the charsets in golang.org/x/text/encoding/ianaindex. search has also been changed to not read the entire message in memory. instead, we make one 8k buffer for reading and search in that, and we keep the buffer around for all messages. saves quite some allocations when searching large mailboxes.
214 lines
6.5 KiB
Go
214 lines
6.5 KiB
Go
// Copyright 2015 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
//go:generate go run gen.go
|
|
|
|
// Package ianaindex maps names to Encodings as specified by the IANA registry.
|
|
// This includes both the MIME and IANA names.
|
|
//
|
|
// See http://www.iana.org/assignments/character-sets/character-sets.xhtml for
|
|
// more details.
|
|
package ianaindex
|
|
|
|
import (
|
|
"errors"
|
|
"sort"
|
|
"strings"
|
|
|
|
"golang.org/x/text/encoding"
|
|
"golang.org/x/text/encoding/charmap"
|
|
"golang.org/x/text/encoding/internal/identifier"
|
|
"golang.org/x/text/encoding/japanese"
|
|
"golang.org/x/text/encoding/korean"
|
|
"golang.org/x/text/encoding/simplifiedchinese"
|
|
"golang.org/x/text/encoding/traditionalchinese"
|
|
"golang.org/x/text/encoding/unicode"
|
|
)
|
|
|
|
// TODO: remove the "Status... incomplete" in the package doc comment.
|
|
// TODO: allow users to specify their own aliases?
|
|
// TODO: allow users to specify their own indexes?
|
|
// TODO: allow canonicalizing names
|
|
|
|
// NOTE: only use these top-level variables if we can get the linker to drop
|
|
// the indexes when they are not used. Make them a function or perhaps only
|
|
// support MIME otherwise.
|
|
|
|
var (
|
|
// MIME is an index to map MIME names.
|
|
MIME *Index = mime
|
|
|
|
// IANA is an index that supports all names and aliases using IANA names as
|
|
// the canonical identifier.
|
|
IANA *Index = iana
|
|
|
|
// MIB is an index that associates the MIB display name with an Encoding.
|
|
MIB *Index = mib
|
|
|
|
mime = &Index{mimeName, ianaToMIB, ianaAliases, encodings[:]}
|
|
iana = &Index{ianaName, ianaToMIB, ianaAliases, encodings[:]}
|
|
mib = &Index{mibName, ianaToMIB, ianaAliases, encodings[:]}
|
|
)
|
|
|
|
// Index maps names registered by IANA to Encodings.
|
|
// Currently different Indexes only differ in the names they return for
|
|
// encodings. In the future they may also differ in supported aliases.
|
|
type Index struct {
|
|
names func(i int) string
|
|
toMIB []identifier.MIB // Sorted slice of supported MIBs
|
|
alias map[string]int
|
|
enc []encoding.Encoding
|
|
}
|
|
|
|
// errInvalidName is returned by Encoding when the name is not a known IANA
// name or alias.
var errInvalidName = errors.New("ianaindex: invalid encoding name")

// errUnknown is returned by Name when the Encoding carries no usable MIB
// identifier.
var errUnknown = errors.New("ianaindex: unknown Encoding")

// errUnsupported is returned by Name when the Encoding's MIB is not part of
// the index.
var errUnsupported = errors.New("ianaindex: unsupported Encoding")
|
|
|
|
// Encoding returns an Encoding for IANA-registered names. Matching is
|
|
// case-insensitive.
|
|
//
|
|
// If the provided name doesn't match a IANA-registered charset, an error is
|
|
// returned. If the name matches a IANA-registered charset but isn't supported,
|
|
// a nil encoding and a nil error are returned.
|
|
func (x *Index) Encoding(name string) (encoding.Encoding, error) {
|
|
name = strings.TrimSpace(name)
|
|
// First try without lowercasing (possibly creating an allocation).
|
|
i, ok := x.alias[name]
|
|
if !ok {
|
|
i, ok = x.alias[strings.ToLower(name)]
|
|
if !ok {
|
|
return nil, errInvalidName
|
|
}
|
|
}
|
|
return x.enc[i], nil
|
|
}
|
|
|
|
// Name reports the canonical name of the given Encoding. It will return an
|
|
// error if the e is not associated with a known encoding scheme.
|
|
func (x *Index) Name(e encoding.Encoding) (string, error) {
|
|
id, ok := e.(identifier.Interface)
|
|
if !ok {
|
|
return "", errUnknown
|
|
}
|
|
mib, _ := id.ID()
|
|
if mib == 0 {
|
|
return "", errUnknown
|
|
}
|
|
v := findMIB(x.toMIB, mib)
|
|
if v == -1 {
|
|
return "", errUnsupported
|
|
}
|
|
return x.names(v), nil
|
|
}
|
|
|
|
// TODO: the coverage of this index is rather spotty. Allowing users to set
|
|
// encodings would allow:
|
|
// - users to increase coverage
|
|
// - allow a partially loaded set of encodings in case the user doesn't need
// them all.
|
|
// - write an OS-specific wrapper for supported encodings and set them.
|
|
// The exact definition of Set depends a bit on if and how we want to let users
|
|
// write their own Encoding implementations. Also, it is not possible yet to
|
|
// only partially load the encodings without doing some refactoring. Until this
|
|
// is solved, we might as well not support Set.
|
|
// // Set sets the e to be used for the encoding scheme identified by name. Only
|
|
// // canonical names may be used. An empty name assigns e to its internally
|
|
// // associated encoding scheme.
|
|
// func (x *Index) Set(name string, e encoding.Encoding) error {
|
|
// panic("TODO: implement")
|
|
// }
|
|
|
|
func findMIB(x []identifier.MIB, mib identifier.MIB) int {
|
|
i := sort.Search(len(x), func(i int) bool { return x[i] >= mib })
|
|
if i < len(x) && x[i] == mib {
|
|
return i
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// maxMIMENameLen is the largest value the leading byte of an ianaNames entry
// may have for that byte to be treated as a length/offset marker rather than
// as the first character of a name. It evaluates to 47, the byte value just
// below the character '0'. The official maximum is 40, but we leave some
// buffer.
const maxMIMENameLen = '0' - 1
|
|
|
|
func mimeName(x int) string {
|
|
n := ianaNames[x]
|
|
// See gen.go for a description of the encoding.
|
|
if n[0] <= maxMIMENameLen {
|
|
return n[1:n[0]]
|
|
}
|
|
return n
|
|
}
|
|
|
|
func ianaName(x int) string {
|
|
n := ianaNames[x]
|
|
// See gen.go for a description of the encoding.
|
|
if n[0] <= maxMIMENameLen {
|
|
return n[n[0]:]
|
|
}
|
|
return n
|
|
}
|
|
|
|
func mibName(x int) string {
|
|
return mibNames[x]
|
|
}
|
|
|
|
var encodings = [numIANA]encoding.Encoding{
|
|
enc3: asciiEnc,
|
|
enc106: unicode.UTF8,
|
|
enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM),
|
|
enc1013: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
|
|
enc1014: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
|
|
enc2028: charmap.CodePage037,
|
|
enc2011: charmap.CodePage437,
|
|
enc2009: charmap.CodePage850,
|
|
enc2010: charmap.CodePage852,
|
|
enc2046: charmap.CodePage855,
|
|
enc2089: charmap.CodePage858,
|
|
enc2048: charmap.CodePage860,
|
|
enc2013: charmap.CodePage862,
|
|
enc2050: charmap.CodePage863,
|
|
enc2052: charmap.CodePage865,
|
|
enc2086: charmap.CodePage866,
|
|
enc2102: charmap.CodePage1047,
|
|
enc2091: charmap.CodePage1140,
|
|
enc4: charmap.ISO8859_1,
|
|
enc5: charmap.ISO8859_2,
|
|
enc6: charmap.ISO8859_3,
|
|
enc7: charmap.ISO8859_4,
|
|
enc8: charmap.ISO8859_5,
|
|
enc9: charmap.ISO8859_6,
|
|
enc81: charmap.ISO8859_6E,
|
|
enc82: charmap.ISO8859_6I,
|
|
enc10: charmap.ISO8859_7,
|
|
enc11: charmap.ISO8859_8,
|
|
enc84: charmap.ISO8859_8E,
|
|
enc85: charmap.ISO8859_8I,
|
|
enc12: charmap.ISO8859_9,
|
|
enc13: charmap.ISO8859_10,
|
|
enc109: charmap.ISO8859_13,
|
|
enc110: charmap.ISO8859_14,
|
|
enc111: charmap.ISO8859_15,
|
|
enc112: charmap.ISO8859_16,
|
|
enc2084: charmap.KOI8R,
|
|
enc2088: charmap.KOI8U,
|
|
enc2027: charmap.Macintosh,
|
|
enc2109: charmap.Windows874,
|
|
enc2250: charmap.Windows1250,
|
|
enc2251: charmap.Windows1251,
|
|
enc2252: charmap.Windows1252,
|
|
enc2253: charmap.Windows1253,
|
|
enc2254: charmap.Windows1254,
|
|
enc2255: charmap.Windows1255,
|
|
enc2256: charmap.Windows1256,
|
|
enc2257: charmap.Windows1257,
|
|
enc2258: charmap.Windows1258,
|
|
enc18: japanese.EUCJP,
|
|
enc39: japanese.ISO2022JP,
|
|
enc17: japanese.ShiftJIS,
|
|
enc38: korean.EUCKR,
|
|
enc114: simplifiedchinese.GB18030,
|
|
enc113: simplifiedchinese.GBK,
|
|
enc2085: simplifiedchinese.HZGB2312,
|
|
enc2026: traditionalchinese.Big5,
|
|
}
|