mirror of
https://github.com/mjl-/mox.git
synced 2024-12-27 08:53:48 +03:00
01adad62b2
message.Part now has a ReaderUTF8OrBinary() along with the existing Reader(). the new function returns a reader of decoded content. we now use it in a few places, including search. we only support the charsets in golang.org/x/text/encoding/ianaindex. search has also been changed to not read the entire message in memory. instead, we make one 8k buffer for reading and search in that, and we keep the buffer around for all messages. saves quite some allocations when searching large mailboxes.
355 lines
8.5 KiB
Go
355 lines
8.5 KiB
Go
// Copyright 2014 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package runes provides transforms for UTF-8 encoded text.
|
|
package runes // import "golang.org/x/text/runes"
|
|
|
|
import (
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/text/transform"
|
|
)
|
|
|
|
// A Set is a collection of runes.
type Set interface {
	// Contains reports whether r is contained in the set.
	Contains(r rune) bool
}
|
|
|
|
// setFunc adapts a plain func(rune) bool so that it can be used as a Set.
type setFunc func(rune) bool

// Contains returns the result of calling s with r.
func (s setFunc) Contains(r rune) bool {
	return s(r)
}
|
|
|
|
// Note: using funcs here instead of wrapping types results in cleaner
// documentation and a smaller API.
|
|
|
|
// In creates a Set with a Contains method that returns true for all runes in
|
|
// the given RangeTable.
|
|
func In(rt *unicode.RangeTable) Set {
|
|
return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
|
|
}
|
|
|
|
// NotIn creates a Set with a Contains method that returns true for all runes not
|
|
// in the given RangeTable.
|
|
func NotIn(rt *unicode.RangeTable) Set {
|
|
return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
|
|
}
|
|
|
|
// Predicate creates a Set with a Contains method that returns f(r).
|
|
func Predicate(f func(rune) bool) Set {
|
|
return setFunc(f)
|
|
}
|
|
|
|
// Transformer implements the transform.Transformer interface.
|
|
type Transformer struct {
|
|
t transform.SpanningTransformer
|
|
}
|
|
|
|
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
return t.t.Transform(dst, src, atEOF)
|
|
}
|
|
|
|
func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
|
|
return t.t.Span(b, atEOF)
|
|
}
|
|
|
|
func (t Transformer) Reset() { t.t.Reset() }
|
|
|
|
// Bytes returns a new byte slice with the result of converting b using t. It
|
|
// calls Reset on t. It returns nil if any error was found. This can only happen
|
|
// if an error-producing Transformer is passed to If.
|
|
func (t Transformer) Bytes(b []byte) []byte {
|
|
b, _, err := transform.Bytes(t, b)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
return b
|
|
}
|
|
|
|
// String returns a string with the result of converting s using t. It calls
|
|
// Reset on t. It returns the empty string if any error was found. This can only
|
|
// happen if an error-producing Transformer is passed to If.
|
|
func (t Transformer) String(s string) string {
|
|
s, _, err := transform.String(t, s)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
return s
|
|
}
|
|
|
|
// TODO:
|
|
// - Copy: copying strings and bytes in whole-rune units.
|
|
// - Validation (maybe)
|
|
// - Well-formed-ness (maybe)
|
|
|
|
// runeErrorString is utf8.RuneError converted to a string. The Transform
// methods in this file write its bytes to dst in place of ill-formed input.
const runeErrorString = string(utf8.RuneError)
|
|
|
|
// Remove returns a Transformer that removes runes r for which s.Contains(r).
|
|
// Illegal input bytes are replaced by RuneError before being passed to f.
|
|
func Remove(s Set) Transformer {
|
|
if f, ok := s.(setFunc); ok {
|
|
// This little trick cuts the running time of BenchmarkRemove for sets
|
|
// created by Predicate roughly in half.
|
|
// TODO: special-case RangeTables as well.
|
|
return Transformer{remove(f)}
|
|
}
|
|
return Transformer{remove(s.Contains)}
|
|
}
|
|
|
|
// TODO: remove transform.RemoveFunc.
|
|
|
|
// remove is the transformer behind Remove. The function reports whether a
// rune must be dropped from the output.
type remove func(r rune) bool

// Reset does nothing.
func (remove) Reset() {}
|
|
|
|
// Span implements transform.Spanner.
|
|
func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
|
|
for r, size := rune(0), 0; n < len(src); {
|
|
if r = rune(src[n]); r < utf8.RuneSelf {
|
|
size = 1
|
|
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
|
|
// Invalid rune.
|
|
if !atEOF && !utf8.FullRune(src[n:]) {
|
|
err = transform.ErrShortSrc
|
|
} else {
|
|
err = transform.ErrEndOfSpan
|
|
}
|
|
break
|
|
}
|
|
if t(r) {
|
|
err = transform.ErrEndOfSpan
|
|
break
|
|
}
|
|
n += size
|
|
}
|
|
return
|
|
}
|
|
|
|
// Transform implements transform.Transformer.
|
|
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
for r, size := rune(0), 0; nSrc < len(src); {
|
|
if r = rune(src[nSrc]); r < utf8.RuneSelf {
|
|
size = 1
|
|
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
|
|
// Invalid rune.
|
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
|
err = transform.ErrShortSrc
|
|
break
|
|
}
|
|
// We replace illegal bytes with RuneError. Not doing so might
|
|
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
|
// The resulting byte sequence may subsequently contain runes
|
|
// for which t(r) is true that were passed unnoticed.
|
|
if !t(utf8.RuneError) {
|
|
if nDst+3 > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst+0] = runeErrorString[0]
|
|
dst[nDst+1] = runeErrorString[1]
|
|
dst[nDst+2] = runeErrorString[2]
|
|
nDst += 3
|
|
}
|
|
nSrc++
|
|
continue
|
|
}
|
|
if t(r) {
|
|
nSrc += size
|
|
continue
|
|
}
|
|
if nDst+size > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
for i := 0; i < size; i++ {
|
|
dst[nDst] = src[nSrc]
|
|
nDst++
|
|
nSrc++
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// Map returns a Transformer that maps the runes in the input using the given
|
|
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
|
|
// being passed to the mapping func.
|
|
func Map(mapping func(rune) rune) Transformer {
|
|
return Transformer{mapper(mapping)}
|
|
}
|
|
|
|
// mapper is the transformer behind Map. The function returns the replacement
// for each input rune.
type mapper func(rune) rune

// Reset does nothing.
func (mapper) Reset() {}
|
|
|
|
// Span implements transform.Spanner.
|
|
func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
|
|
for r, size := rune(0), 0; n < len(src); n += size {
|
|
if r = rune(src[n]); r < utf8.RuneSelf {
|
|
size = 1
|
|
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
|
|
// Invalid rune.
|
|
if !atEOF && !utf8.FullRune(src[n:]) {
|
|
err = transform.ErrShortSrc
|
|
} else {
|
|
err = transform.ErrEndOfSpan
|
|
}
|
|
break
|
|
}
|
|
if t(r) != r {
|
|
err = transform.ErrEndOfSpan
|
|
break
|
|
}
|
|
}
|
|
return n, err
|
|
}
|
|
|
|
// Transform implements transform.Transformer.
|
|
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
var replacement rune
|
|
var b [utf8.UTFMax]byte
|
|
|
|
for r, size := rune(0), 0; nSrc < len(src); {
|
|
if r = rune(src[nSrc]); r < utf8.RuneSelf {
|
|
if replacement = t(r); replacement < utf8.RuneSelf {
|
|
if nDst == len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst] = byte(replacement)
|
|
nDst++
|
|
nSrc++
|
|
continue
|
|
}
|
|
size = 1
|
|
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
|
|
// Invalid rune.
|
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
|
err = transform.ErrShortSrc
|
|
break
|
|
}
|
|
|
|
if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
|
|
if nDst+3 > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst+0] = runeErrorString[0]
|
|
dst[nDst+1] = runeErrorString[1]
|
|
dst[nDst+2] = runeErrorString[2]
|
|
nDst += 3
|
|
nSrc++
|
|
continue
|
|
}
|
|
} else if replacement = t(r); replacement == r {
|
|
if nDst+size > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
for i := 0; i < size; i++ {
|
|
dst[nDst] = src[nSrc]
|
|
nDst++
|
|
nSrc++
|
|
}
|
|
continue
|
|
}
|
|
|
|
n := utf8.EncodeRune(b[:], replacement)
|
|
|
|
if nDst+n > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
for i := 0; i < n; i++ {
|
|
dst[nDst] = b[i]
|
|
nDst++
|
|
}
|
|
nSrc += size
|
|
}
|
|
return
|
|
}
|
|
|
|
// ReplaceIllFormed returns a transformer that replaces all input bytes that are
|
|
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
|
|
func ReplaceIllFormed() Transformer {
|
|
return Transformer{&replaceIllFormed{}}
|
|
}
|
|
|
|
// replaceIllFormed is the transformer that ReplaceIllFormed wraps in a
// Transformer. Its only field is the embedded transform.NopResetter
// (presumably supplying a no-op Reset — confirm against the transform package).
type replaceIllFormed struct{ transform.NopResetter }
|
|
|
|
func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
|
|
for n < len(src) {
|
|
// ASCII fast path.
|
|
if src[n] < utf8.RuneSelf {
|
|
n++
|
|
continue
|
|
}
|
|
|
|
r, size := utf8.DecodeRune(src[n:])
|
|
|
|
// Look for a valid non-ASCII rune.
|
|
if r != utf8.RuneError || size != 1 {
|
|
n += size
|
|
continue
|
|
}
|
|
|
|
// Look for short source data.
|
|
if !atEOF && !utf8.FullRune(src[n:]) {
|
|
err = transform.ErrShortSrc
|
|
break
|
|
}
|
|
|
|
// We have an invalid rune.
|
|
err = transform.ErrEndOfSpan
|
|
break
|
|
}
|
|
return n, err
|
|
}
|
|
|
|
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
for nSrc < len(src) {
|
|
// ASCII fast path.
|
|
if r := src[nSrc]; r < utf8.RuneSelf {
|
|
if nDst == len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst] = r
|
|
nDst++
|
|
nSrc++
|
|
continue
|
|
}
|
|
|
|
// Look for a valid non-ASCII rune.
|
|
if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
|
|
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
nDst += size
|
|
nSrc += size
|
|
continue
|
|
}
|
|
|
|
// Look for short source data.
|
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
|
err = transform.ErrShortSrc
|
|
break
|
|
}
|
|
|
|
// We have an invalid rune.
|
|
if nDst+3 > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst+0] = runeErrorString[0]
|
|
dst[nDst+1] = runeErrorString[1]
|
|
dst[nDst+2] = runeErrorString[2]
|
|
nDst += 3
|
|
nSrc++
|
|
}
|
|
return nDst, nSrc, err
|
|
}
|