implement decoding charsets (other than ascii and utf-8) while reading textual message parts, and improve search

message.Part now has a ReaderUTF8OrBinary() along with the existing Reader().
the new function returns a reader of decoded content. we now use it in a few
places, including search. we only support the charsets in
golang.org/x/text/encoding/ianaindex.

search has also been changed to not read the entire message in memory. instead,
we make one 8k buffer for reading and search in that, and we keep the buffer
around for all messages. saves quite some allocations when searching large
mailboxes.
This commit is contained in:
Mechiel Lukkien 2023-07-28 22:15:23 +02:00
parent a31dfc573e
commit 01adad62b2
No known key found for this signature in database
34 changed files with 157887 additions and 64 deletions

View file

@ -76,7 +76,7 @@ func Parse(r io.ReaderAt) (*Message, *message.Part, error) {
}
}
m.Subject = part.Envelope.Subject
buf, err := io.ReadAll(p0.Reader())
buf, err := io.ReadAll(p0.ReaderUTF8OrBinary())
if err != nil {
return nil, nil, fmt.Errorf("reading human-readable text part: %v", err)
}

View file

@ -2,7 +2,6 @@ package imapserver
import (
"fmt"
"io"
"net/textproto"
"strings"
@ -77,6 +76,45 @@ func (c *conn) cmdxSearch(isUID bool, tag, cmd string, p *parser) {
c.searchResult = []store.UID{}
}
// We gather word and not-word searches from the top-level, turn them
// into a WordSearch for a more efficient search.
// todo optimize: also gather them out of AND searches.
var textWords, textNotWords, bodyWords, bodyNotWords []string
n := 0
for _, xsk := range sk.searchKeys {
switch xsk.op {
case "BODY":
bodyWords = append(bodyWords, xsk.astring)
continue
case "TEXT":
textWords = append(textWords, xsk.astring)
continue
case "NOT":
switch xsk.searchKey.op {
case "BODY":
bodyNotWords = append(bodyNotWords, xsk.searchKey.astring)
continue
case "TEXT":
textNotWords = append(textNotWords, xsk.searchKey.astring)
continue
}
}
sk.searchKeys[n] = xsk
n++
}
// We may be left with an empty but non-nil sk.searchKeys, which is important for
// matching.
sk.searchKeys = sk.searchKeys[:n]
var bodySearch, textSearch *store.WordSearch
if len(bodyWords) > 0 || len(bodyNotWords) > 0 {
ws := store.PrepareWordSearch(bodyWords, bodyNotWords)
bodySearch = &ws
}
if len(textWords) > 0 || len(textNotWords) > 0 {
ws := store.PrepareWordSearch(textWords, textNotWords)
textSearch = &ws
}
// Note: we only hold the account rlock for verifying the mailbox at the start.
c.account.RLock()
runlock := c.account.RUnlock
@ -109,7 +147,7 @@ func (c *conn) cmdxSearch(isUID bool, tag, cmd string, p *parser) {
if eargs == nil || max == 0 || len(eargs) != 1 {
for i, uid := range c.uids {
lastIndex = i
if match, modseq := c.searchMatch(tx, msgseq(i+1), uid, *sk, &expungeIssued); match {
if match, modseq := c.searchMatch(tx, msgseq(i+1), uid, *sk, bodySearch, textSearch, &expungeIssued); match {
uids = append(uids, uid)
if modseq > maxModSeq {
maxModSeq = modseq
@ -123,7 +161,7 @@ func (c *conn) cmdxSearch(isUID bool, tag, cmd string, p *parser) {
// And reverse search for MAX if we have only MAX or MAX combined with MIN.
if max == 1 && (len(eargs) == 1 || min+max == len(eargs)) {
for i := len(c.uids) - 1; i > lastIndex; i-- {
if match, modseq := c.searchMatch(tx, msgseq(i+1), c.uids[i], *sk, &expungeIssued); match {
if match, modseq := c.searchMatch(tx, msgseq(i+1), c.uids[i], *sk, bodySearch, textSearch, &expungeIssued); match {
uids = append(uids, c.uids[i])
if modseq > maxModSeq {
maxModSeq = modseq
@ -245,7 +283,7 @@ type search struct {
hasModseq bool
}
func (c *conn) searchMatch(tx *bstore.Tx, seq msgseq, uid store.UID, sk searchKey, expungeIssued *bool) (bool, store.ModSeq) {
func (c *conn) searchMatch(tx *bstore.Tx, seq msgseq, uid store.UID, sk searchKey, bodySearch, textSearch *store.WordSearch, expungeIssued *bool) (bool, store.ModSeq) {
s := search{c: c, tx: tx, seq: seq, uid: uid, expungeIssued: expungeIssued, hasModseq: sk.hasModseq()}
defer func() {
if s.mr != nil {
@ -254,26 +292,48 @@ func (c *conn) searchMatch(tx *bstore.Tx, seq msgseq, uid store.UID, sk searchKe
s.mr = nil
}
}()
return s.match(sk)
return s.match(sk, bodySearch, textSearch)
}
func (s *search) match(sk searchKey) (match bool, modseq store.ModSeq) {
func (s *search) match(sk searchKey, bodySearch, textSearch *store.WordSearch) (match bool, modseq store.ModSeq) {
// Instead of littering all the cases in match0 with calls to get modseq, we do it once
// here in case of a match.
defer func() {
if match && s.hasModseq {
if s.m.ID == 0 {
match = s.xloadMessage()
match = s.xensureMessage()
}
modseq = s.m.ModSeq
}
}()
match = s.match0(sk)
if match && bodySearch != nil {
if !s.xensurePart() {
match = false
return
}
var err error
match, err = bodySearch.MatchPart(s.c.log, s.p, false)
xcheckf(err, "search words in bodies")
}
if match && textSearch != nil {
if !s.xensurePart() {
match = false
return
}
var err error
match, err = textSearch.MatchPart(s.c.log, s.p, true)
xcheckf(err, "search words in headers and bodies")
}
return
}
func (s *search) xloadMessage() bool {
func (s *search) xensureMessage() bool {
if s.m.ID > 0 {
return true
}
q := bstore.QueryTx[store.Message](s.tx)
q.FilterNonzero(store.Message{MailboxID: s.c.mailboxID, UID: s.uid})
m, err := q.Get()
@ -287,9 +347,35 @@ func (s *search) xloadMessage() bool {
return true
}
// ensure message, reader and part are loaded. returns whether that was
// successful.
func (s *search) xensurePart() bool {
if s.mr != nil {
return s.p != nil
}
if !s.xensureMessage() {
return false
}
// Closed by searchMatch after all (recursive) search.match calls are finished.
s.mr = s.c.account.MessageReader(s.m)
if s.m.ParsedBuf == nil {
s.c.log.Error("missing parsed message")
return false
}
p, err := s.m.LoadPart(s.mr)
xcheckf(err, "load parsed message")
s.p = &p
return true
}
func (s *search) match0(sk searchKey) bool {
c := s.c
// Difference between sk.searchKeys nil and length 0 is important. Because we take
// out word/notword searches, the list may be empty but non-nil.
if sk.searchKeys != nil {
for _, ssk := range sk.searchKeys {
if !s.match0(ssk) {
@ -338,22 +424,9 @@ func (s *search) match0(sk searchKey) bool {
return sk.uidSet.containsUID(s.uid, c.uids, c.searchResult)
}
// Parsed message.
if s.mr == nil {
if !s.xloadMessage() {
return false
}
// Closed by searchMatch after all (recursive) search.match calls are finished.
s.mr = c.account.MessageReader(s.m)
if s.m.ParsedBuf == nil {
c.log.Error("missing parsed message")
} else {
p, err := s.m.LoadPart(s.mr)
xcheckf(err, "load parsed message")
s.p = &p
}
// Parsed part.
if !s.xensurePart() {
return false
}
// Parsed message, basic info.
@ -451,9 +524,13 @@ func (s *search) match0(sk searchKey) bool {
case "BCC":
return filterHeader("Bcc", sk.astring)
case "BODY", "TEXT":
// We gathered word/notword searches from the top-level, but we can also get them
// nested.
// todo optimize: handle deeper nested word/not-word searches more efficiently.
headerToo := sk.op == "TEXT"
lower := strings.ToLower(sk.astring)
return mailContains(c, s.uid, s.p, lower, headerToo)
match, err := store.PrepareWordSearch([]string{sk.astring}, nil).MatchPart(s.c.log, s.p, headerToo)
xcheckf(err, "word search")
return match
case "CC":
return filterHeader("Cc", sk.astring)
case "FROM":
@ -495,38 +572,3 @@ func (s *search) match0(sk searchKey) bool {
}
panic(serverError{fmt.Errorf("missing case for search key op %q", sk.op)})
}
// mailContains returns whether the mail message or part represented by p contains (case-insensitive) string lower.
// The (decoded) text bodies are tested for a match.
// If headerToo is set, the header part of the message is checked as well.
func mailContains(c *conn, uid store.UID, p *message.Part, lower string, headerToo bool) bool {
if headerToo && mailContainsReader(c, uid, p.HeaderReader(), lower) {
return true
}
if len(p.Parts) == 0 {
if p.MediaType != "TEXT" {
// todo: for types we could try to find a library for parsing and search in there too
return false
}
// todo: for html and perhaps other types, we could try to parse as text and filter on the text.
return mailContainsReader(c, uid, p.Reader(), lower)
}
for _, pp := range p.Parts {
headerToo = pp.MediaType == "MESSAGE" && (pp.MediaSubType == "RFC822" || pp.MediaSubType == "GLOBAL")
if mailContains(c, uid, &pp, lower, headerToo) {
return true
}
}
return false
}
func mailContainsReader(c *conn, uid store.UID, r io.Reader, lower string) bool {
// todo: match as we read
buf, err := io.ReadAll(r)
if err != nil {
c.log.Errorx("reading for search text match", err, mlog.Field("uid", uid))
return false
}
return strings.Contains(strings.ToLower(string(buf)), lower)
}

View file

@ -87,12 +87,12 @@ func (f *Filter) mailParse(p message.Part, metaWords, textWords, htmlWords map[s
ct := p.MediaType + "/" + p.MediaSubType
if ct == "TEXT/HTML" {
err := f.tokenizeHTML(p.Reader(), metaWords, htmlWords)
err := f.tokenizeHTML(p.ReaderUTF8OrBinary(), metaWords, htmlWords)
// log.Printf("html parsed, words %v", htmlWords)
return err
}
if ct == "" || strings.HasPrefix(ct, "TEXT/") {
err := f.tokenizeText(p.Reader(), textWords)
err := f.tokenizeText(p.ReaderUTF8OrBinary(), textWords)
// log.Printf("text parsed, words %v", textWords)
return err
}

View file

@ -26,6 +26,7 @@ import (
"time"
"github.com/mjl-/mox/mlog"
"github.com/mjl-/mox/moxio"
"github.com/mjl-/mox/moxvar"
"github.com/mjl-/mox/smtp"
)
@ -474,6 +475,14 @@ func (p *Part) Reader() io.Reader {
return p.bodyReader(p.RawReader())
}
// ReaderUTF8OrBinary returns a reader for the decode body content, transformed to
// utf-8 for known mime/iana encodings (only if they aren't us-ascii or utf-8
// already). For unknown or missing character sets/encodings, the original reader
// is returned.
func (p *Part) ReaderUTF8OrBinary() io.Reader {
return moxio.DecodeReader(p.ContentTypeParams["charset"], p.Reader())
}
func (p *Part) bodyReader(r io.Reader) io.Reader {
r = newDecoder(p.ContentTransferEncoding, r)
if p.MediaType == "TEXT" {

27
moxio/decode.go Normal file
View file

@ -0,0 +1,27 @@
package moxio
import (
"io"
"strings"
"golang.org/x/text/encoding/ianaindex"
)
// DecodeReader returns a reader that reads from r, decoding as charset. If
// charset is empty, us-ascii, utf-8 or unknown, the original reader is
// returned and no decoding takes place.
func DecodeReader(charset string, r io.Reader) io.Reader {
switch strings.ToLower(charset) {
case "", "us-ascii", "utf-8":
return r
}
enc, _ := ianaindex.MIME.Encoding(charset)
if enc == nil {
enc, _ = ianaindex.IANA.Encoding(charset)
}
// todo: ianaindex doesn't know all encodings, e.g. gb2312. should we transform them, with which code?
if enc == nil {
return r
}
return enc.NewDecoder().Reader(r)
}

195
store/search.go Normal file
View file

@ -0,0 +1,195 @@
package store
import (
"bytes"
"io"
"strings"
"unicode"
"unicode/utf8"
"github.com/mjl-/mox/message"
"github.com/mjl-/mox/mlog"
)
// WordSearch holds context for a search, with scratch buffers to prevent
// allocations for each message.
type WordSearch struct {
words, notWords [][]byte
searchBuf, keepBuf []byte
}
// PrepareWordSearch returns a search context that can be used to match multiple
// messages (after each other, not concurrently).
func PrepareWordSearch(words, notWords []string) WordSearch {
var wl, nwl [][]byte
for _, w := range words {
wl = append(wl, []byte(strings.ToLower(w)))
}
for _, w := range notWords {
nwl = append(nwl, []byte(strings.ToLower(w)))
}
keep := 0
for _, w := range words {
if len(w) > keep {
keep = len(w)
}
}
for _, w := range notWords {
if len(w) > keep {
keep = len(w)
}
}
keep += 6 // Max utf-8 character size.
bufSize := 8 * 1024
for bufSize/keep < 8 {
bufSize *= 2
}
keepBuf := make([]byte, keep)
searchBuf := make([]byte, bufSize)
return WordSearch{wl, nwl, searchBuf, keepBuf}
}
// MatchPart returns whether the part/mail message p matches the search.
// The search terms are matched against content-transfer-decoded and
// charset-decoded bodies and optionally headers.
// HTML parts are currently treated as regular text, without parsing HTML.
func (ws WordSearch) MatchPart(log *mlog.Log, p *message.Part, headerToo bool) (bool, error) {
seen := map[int]bool{}
miss, err := ws.matchPart(log, p, headerToo, seen)
match := err == nil && !miss && len(seen) == len(ws.words)
return match, err
}
// If all words are seen, and we there are no not-words that force us to search
// till the end, we know we have a match.
func (ws WordSearch) isQuickHit(seen map[int]bool) bool {
return len(seen) == len(ws.words) && len(ws.notWords) == 0
}
// search a part as text and/or its subparts, recursively. Once we know we have
// a miss, we stop (either due to not-word match or error). In case of
// non-miss, the caller checks if there was a hit.
func (ws WordSearch) matchPart(log *mlog.Log, p *message.Part, headerToo bool, seen map[int]bool) (miss bool, rerr error) {
if headerToo {
miss, err := ws.searchReader(log, p.HeaderReader(), seen)
if miss || err != nil || ws.isQuickHit(seen) {
return miss, err
}
}
if len(p.Parts) == 0 {
if p.MediaType != "TEXT" {
// todo: for other types we could try to find a library for parsing and search in there too.
return false, nil
}
tp := p.ReaderUTF8OrBinary()
// todo: for html and perhaps other types, we could try to parse as text and filter on the text.
miss, err := ws.searchReader(log, tp, seen)
if miss || err != nil || ws.isQuickHit(seen) {
return miss, err
}
}
for _, pp := range p.Parts {
if pp.Message != nil {
if err := pp.SetMessageReaderAt(); err != nil {
return false, err
}
pp = *pp.Message
}
miss, err := ws.matchPart(log, &pp, headerToo, seen)
if miss || err != nil || ws.isQuickHit(seen) {
return miss, err
}
}
return false, nil
}
func (ws WordSearch) searchReader(log *mlog.Log, r io.Reader, seen map[int]bool) (miss bool, rerr error) {
// We will be reading through the content, stopping as soon as we known an answer:
// when all words have been seen and there are no "not words" (true), or one "not
// word" has been seen (false). We use bytes.Contains to look for the words. We
// advance our buffer in largish chunks, keeping the end of the buffer the size of
// the largest word plus the max of an utf-8 character to account for words
// spanning chunks.
have := 0
for {
n, err := io.ReadFull(r, ws.searchBuf[have:])
if n > 0 {
have += n
}
if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
return true, err
}
if err == nil {
copy(ws.keepBuf, ws.searchBuf[have-len(ws.keepBuf):])
}
lower := toLower(ws.searchBuf[:have])
for i, w := range ws.words {
if !seen[i] && bytes.Contains(lower, w) {
seen[i] = true
if len(seen) == len(ws.words) && len(ws.notWords) == 0 {
return false, nil
}
}
}
for _, w := range ws.notWords {
if bytes.Contains(lower, w) {
return true, nil
}
}
if err != nil {
// Must be EOF or UnexpectedEOF now.
break
}
copy(ws.searchBuf, ws.keepBuf)
have = len(ws.keepBuf)
}
return false, nil
}
// in-place lower-casing, only allocating a new slice when lower-case would become
// larger. we replace RuneError (0xfffd) by byte value 0, because it would often
// increase size, but we assume no one wants to match it.
func toLower(buf []byte) []byte {
r := buf[:0]
copied := false
for i := 0; i < len(buf); {
if buf[i] < 0x80 {
b := buf[i]
if b >= 'A' && b <= 'Z' {
b += 0x20
}
r = append(r, b)
i++
continue
}
c, size := utf8.DecodeRune(buf[i:])
i += size
nc := unicode.ToLower(c)
if nc < 0 {
continue
}
if c == utf8.RuneError {
r = append(r, 0)
continue
}
nsize := utf8.RuneLen(nc)
// Take care not to overwrite the part of the buffer we still have to process.
if !copied && len(r)+nsize > i {
// eg Ⱥ 0x23a (2 bytes) to ⱥ 0x2c65 (3 bytes)
copied = true
nr := make([]byte, len(r), len(r)+nsize+len(buf)-i)
copy(nr, r)
nr = r
}
r = utf8.AppendRune(r, nc)
}
return r
}

249
vendor/golang.org/x/text/encoding/charmap/charmap.go generated vendored Normal file
View file

@ -0,0 +1,249 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run maketables.go
// Package charmap provides simple character encodings such as IBM Code Page 437
// and Windows 1252.
package charmap // import "golang.org/x/text/encoding/charmap"
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// These encodings vary only in the way clients should interpret them. Their
// coded character set is identical and a single implementation can be shared.
var (
// ISO8859_6E is the ISO 8859-6E encoding.
ISO8859_6E encoding.Encoding = &iso8859_6E
// ISO8859_6I is the ISO 8859-6I encoding.
ISO8859_6I encoding.Encoding = &iso8859_6I
// ISO8859_8E is the ISO 8859-8E encoding.
ISO8859_8E encoding.Encoding = &iso8859_8E
// ISO8859_8I is the ISO 8859-8I encoding.
ISO8859_8I encoding.Encoding = &iso8859_8I
iso8859_6E = internal.Encoding{
Encoding: ISO8859_6,
Name: "ISO-8859-6E",
MIB: identifier.ISO88596E,
}
iso8859_6I = internal.Encoding{
Encoding: ISO8859_6,
Name: "ISO-8859-6I",
MIB: identifier.ISO88596I,
}
iso8859_8E = internal.Encoding{
Encoding: ISO8859_8,
Name: "ISO-8859-8E",
MIB: identifier.ISO88598E,
}
iso8859_8I = internal.Encoding{
Encoding: ISO8859_8,
Name: "ISO-8859-8I",
MIB: identifier.ISO88598I,
}
)
// All is a list of all defined encodings in this package.
var All []encoding.Encoding = listAll
// TODO: implement these encodings, in order of importance.
// ASCII, ISO8859_1: Rather common. Close to Windows 1252.
// ISO8859_9: Close to Windows 1254.
// utf8Enc holds a rune's UTF-8 encoding in data[:len].
type utf8Enc struct {
len uint8
data [3]byte
}
// Charmap is an 8-bit character set encoding.
type Charmap struct {
// name is the encoding's name.
name string
// mib is the encoding type of this encoder.
mib identifier.MIB
// asciiSuperset states whether the encoding is a superset of ASCII.
asciiSuperset bool
// low is the lower bound of the encoded byte for a non-ASCII rune. If
// Charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
low uint8
// replacement is the encoded replacement character.
replacement byte
// decode is the map from encoded byte to UTF-8.
decode [256]utf8Enc
// encoding is the map from runes to encoded bytes. Each entry is a
// uint32: the high 8 bits are the encoded byte and the low 24 bits are
// the rune. The table entries are sorted by ascending rune.
encode [256]uint32
}
// NewDecoder implements the encoding.Encoding interface.
func (m *Charmap) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}}
}
// NewEncoder implements the encoding.Encoding interface.
func (m *Charmap) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}}
}
// String returns the Charmap's name.
func (m *Charmap) String() string {
return m.name
}
// ID implements an internal interface.
func (m *Charmap) ID() (mib identifier.MIB, other string) {
return m.mib, ""
}
// charmapDecoder implements transform.Transformer by decoding to UTF-8.
type charmapDecoder struct {
transform.NopResetter
charmap *Charmap
}
func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for i, c := range src {
if m.charmap.asciiSuperset && c < utf8.RuneSelf {
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = c
nDst++
nSrc = i + 1
continue
}
decode := &m.charmap.decode[c]
n := int(decode.len)
if nDst+n > len(dst) {
err = transform.ErrShortDst
break
}
// It's 15% faster to avoid calling copy for these tiny slices.
for j := 0; j < n; j++ {
dst[nDst] = decode.data[j]
nDst++
}
nSrc = i + 1
}
return nDst, nSrc, err
}
// DecodeByte returns the Charmap's rune decoding of the byte b.
func (m *Charmap) DecodeByte(b byte) rune {
switch x := &m.decode[b]; x.len {
case 1:
return rune(x.data[0])
case 2:
return rune(x.data[0]&0x1f)<<6 | rune(x.data[1]&0x3f)
default:
return rune(x.data[0]&0x0f)<<12 | rune(x.data[1]&0x3f)<<6 | rune(x.data[2]&0x3f)
}
}
// charmapEncoder implements transform.Transformer by encoding from UTF-8.
type charmapEncoder struct {
transform.NopResetter
charmap *Charmap
}
func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for nSrc < len(src) {
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
if m.charmap.asciiSuperset {
nSrc++
dst[nDst] = uint8(r)
nDst++
continue
}
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
} else {
err = internal.RepertoireError(m.charmap.replacement)
}
break
}
}
// Binary search in [low, high) for that rune in the m.charmap.encode table.
for low, high := int(m.charmap.low), 0x100; ; {
if low >= high {
err = internal.RepertoireError(m.charmap.replacement)
break loop
}
mid := (low + high) / 2
got := m.charmap.encode[mid]
gotRune := rune(got & (1<<24 - 1))
if gotRune < r {
low = mid + 1
} else if gotRune > r {
high = mid
} else {
dst[nDst] = byte(got >> 24)
nDst++
break
}
}
nSrc += size
}
return nDst, nSrc, err
}
// EncodeRune returns the Charmap's byte encoding of the rune r. ok is whether
// r is in the Charmap's repertoire. If not, b is set to the Charmap's
// replacement byte. This is often the ASCII substitute character '\x1a'.
func (m *Charmap) EncodeRune(r rune) (b byte, ok bool) {
if r < utf8.RuneSelf && m.asciiSuperset {
return byte(r), true
}
for low, high := int(m.low), 0x100; ; {
if low >= high {
return m.replacement, false
}
mid := (low + high) / 2
got := m.encode[mid]
gotRune := rune(got & (1<<24 - 1))
if gotRune < r {
low = mid + 1
} else if gotRune > r {
high = mid
} else {
return byte(got >> 24), true
}
}
}

7410
vendor/golang.org/x/text/encoding/charmap/tables.go generated vendored Normal file

File diff suppressed because it is too large Load diff

335
vendor/golang.org/x/text/encoding/encoding.go generated vendored Normal file
View file

@ -0,0 +1,335 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
package encoding // import "golang.org/x/text/encoding"
import (
"errors"
"io"
"strconv"
"unicode/utf8"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// TODO:
// - There seems to be some inconsistency in when decoders return errors
// and when not. Also documentation seems to suggest they shouldn't return
// errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
// normal form. Perhaps add an example how users could prepare their output.
// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
type Encoding interface {
// NewDecoder returns a Decoder.
NewDecoder() *Decoder
// NewEncoder returns an Encoder.
NewEncoder() *Encoder
}
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
transform.Transformer
// This forces external creators of Decoders to use names in struct
// initializers, allowing for future extendibility without having to break
// code.
_ struct{}
}
// Bytes converts the given encoded bytes to UTF-8. It returns the converted
// bytes or nil, err if any error occurred.
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
b, _, err := transform.Bytes(d, b)
if err != nil {
return nil, err
}
return b, nil
}
// String converts the given encoded string to UTF-8. It returns the converted
// string or "", err if any error occurred.
func (d *Decoder) String(s string) (string, error) {
s, _, err := transform.String(d, s)
if err != nil {
return "", err
}
return s, nil
}
// Reader wraps another Reader to decode its bytes.
//
// The Decoder may not be used for any other operation as long as the returned
// Reader is in use.
func (d *Decoder) Reader(r io.Reader) io.Reader {
return transform.NewReader(r, d)
}
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
//
// Each rune that cannot be transcoded will result in an error. In this case,
// the transform will consume all source byte up to, not including the offending
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
// `\uFFFD`. To return early with an error instead, use transform.Chain to
// preprocess the data with a UTF8Validator.
type Encoder struct {
transform.Transformer
// This forces external creators of Encoders to use names in struct
// initializers, allowing for future extendibility without having to break
// code.
_ struct{}
}
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
// any error occurred.
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
b, _, err := transform.Bytes(e, b)
if err != nil {
return nil, err
}
return b, nil
}
// String converts a string from UTF-8. It returns the converted string or
// "", err if any error occurred.
func (e *Encoder) String(s string) (string, error) {
s, _, err := transform.String(e, s)
if err != nil {
return "", err
}
return s, nil
}
// Writer wraps another Writer to encode its UTF-8 output.
//
// The Encoder may not be used for any other operation as long as the returned
// Writer is in use.
func (e *Encoder) Writer(w io.Writer) io.Writer {
return transform.NewWriter(w, e)
}
// ASCIISub is the ASCII substitute character, as recommended by
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
// Nop is the nop encoding. Its transformed bytes are the same as the source
// bytes; it does not replace invalid UTF-8 sequences.
var Nop Encoding = nop{}
type nop struct{}
func (nop) NewDecoder() *Decoder {
return &Decoder{Transformer: transform.Nop}
}
func (nop) NewEncoder() *Encoder {
return &Encoder{Transformer: transform.Nop}
}
// Replacement is the replacement encoding. Decoding from the replacement
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
// the replacement encoding yields the same as the source bytes except that
// invalid UTF-8 is converted to '\uFFFD'.
//
// It is defined at http://encoding.spec.whatwg.org/#replacement
var Replacement Encoding = replacement{}
type replacement struct{}
func (replacement) NewDecoder() *Decoder {
return &Decoder{Transformer: replacementDecoder{}}
}
func (replacement) NewEncoder() *Encoder {
return &Encoder{Transformer: replacementEncoder{}}
}
func (replacement) ID() (mib identifier.MIB, other string) {
return identifier.Replacement, ""
}
type replacementDecoder struct{ transform.NopResetter }
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if len(dst) < 3 {
return 0, 0, transform.ErrShortDst
}
if atEOF {
const fffd = "\ufffd"
dst[0] = fffd[0]
dst[1] = fffd[1]
dst[2] = fffd[2]
nDst = 3
}
return nDst, len(src), nil
}
type replacementEncoder struct{ transform.NopResetter }
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
r = '\ufffd'
}
}
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
// repertoire of the destination encoding with HTML escape sequences.
//
// This wrapper exists to comply to URL and HTML forms requiring a
// non-terminating legacy encoder. The produced sequences may lead to data
// loss as they are indistinguishable from legitimate input. To avoid this
// issue, use UTF-8 encodings whenever possible.
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
}
// ReplaceUnsupported wraps encoders to replace source runes outside the
// repertoire of the destination encoding with an encoding-specific
// replacement.
//
// This wrapper is only provided for backwards compatibility and legacy
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
func ReplaceUnsupported(e *Encoder) *Encoder {
return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
}
type errorHandler struct {
*Encoder
handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
}
// TODO: consider making this error public in some form.
type repertoireError interface {
Replacement() byte
}
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
for err != nil {
rerr, ok := err.(repertoireError)
if !ok {
return nDst, nSrc, err
}
r, sz := utf8.DecodeRune(src[nSrc:])
n, ok := h.handler(dst[nDst:], r, rerr)
if !ok {
return nDst, nSrc, transform.ErrShortDst
}
err = nil
nDst += n
if nSrc += sz; nSrc < len(src) {
var dn, sn int
dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
nDst += dn
nSrc += sn
}
}
return nDst, nSrc, err
}
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
buf := [8]byte{}
b := strconv.AppendUint(buf[:0], uint64(r), 10)
if n = len(b) + len("&#;"); n >= len(dst) {
return 0, false
}
dst[0] = '&'
dst[1] = '#'
dst[copy(dst[2:], b)+2] = ';'
return n, true
}
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
if len(dst) == 0 {
return 0, false
}
dst[0] = err.Replacement()
return 1, true
}
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
// input byte that is not valid UTF-8.
var UTF8Validator transform.Transformer = utf8Validator{}
type utf8Validator struct{ transform.NopResetter }
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
n := len(src)
if n > len(dst) {
n = len(dst)
}
for i := 0; i < n; {
if c := src[i]; c < utf8.RuneSelf {
dst[i] = c
i++
continue
}
_, size := utf8.DecodeRune(src[i:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
err = ErrInvalidUTF8
if !atEOF && !utf8.FullRune(src[i:]) {
err = transform.ErrShortSrc
}
return i, i, err
}
if i+size > len(dst) {
return i, i, transform.ErrShortDst
}
for ; size > 0; size-- {
dst[i] = src[i]
i++
}
}
if len(src) > len(dst) {
err = transform.ErrShortDst
}
return n, n, err
}

74
vendor/golang.org/x/text/encoding/ianaindex/ascii.go generated vendored Normal file
View file

@ -0,0 +1,74 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ianaindex
import (
"unicode"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
type asciiDecoder struct {
transform.NopResetter
}
func (d asciiDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for _, c := range src {
if c > unicode.MaxASCII {
r := unicode.ReplacementChar
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
nSrc++
continue
}
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = c
nDst++
nSrc++
}
return nDst, nSrc, err
}
type asciiEncoder struct {
transform.NopResetter
}
func (d asciiEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for _, c := range src {
if c > unicode.MaxASCII {
err = internal.RepertoireError(encoding.ASCIISub)
break
}
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = c
nDst++
nSrc++
}
return nDst, nSrc, err
}
var asciiEnc = &internal.Encoding{
Encoding: &internal.SimpleEncoding{
asciiDecoder{},
asciiEncoder{},
},
Name: "US-ASCII",
MIB: identifier.ASCII,
}

View file

@ -0,0 +1,214 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go
// Package ianaindex maps names to Encodings as specified by the IANA registry.
// This includes both the MIME and IANA names.
//
// See http://www.iana.org/assignments/character-sets/character-sets.xhtml for
// more details.
package ianaindex
import (
"errors"
"sort"
"strings"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/encoding/unicode"
)
// TODO: remove the "Status... incomplete" in the package doc comment.
// TODO: allow users to specify their own aliases?
// TODO: allow users to specify their own indexes?
// TODO: allow canonicalizing names
// NOTE: only use these top-level variables if we can get the linker to drop
// the indexes when they are not used. Make them a function or perhaps only
// support MIME otherwise.
var (
// MIME is an index to map MIME names.
MIME *Index = mime
// IANA is an index that supports all names and aliases using IANA names as
// the canonical identifier.
IANA *Index = iana
// MIB is an index that associates the MIB display name with an Encoding.
MIB *Index = mib
mime = &Index{mimeName, ianaToMIB, ianaAliases, encodings[:]}
iana = &Index{ianaName, ianaToMIB, ianaAliases, encodings[:]}
mib = &Index{mibName, ianaToMIB, ianaAliases, encodings[:]}
)
// Index maps names registered by IANA to Encodings.
// Currently different Indexes only differ in the names they return for
// encodings. In the future they may also differ in supported aliases.
type Index struct {
names func(i int) string
toMIB []identifier.MIB // Sorted slice of supported MIBs
alias map[string]int
enc []encoding.Encoding
}
var (
errInvalidName = errors.New("ianaindex: invalid encoding name")
errUnknown = errors.New("ianaindex: unknown Encoding")
errUnsupported = errors.New("ianaindex: unsupported Encoding")
)
// Encoding returns an Encoding for IANA-registered names. Matching is
// case-insensitive.
//
// If the provided name doesn't match a IANA-registered charset, an error is
// returned. If the name matches a IANA-registered charset but isn't supported,
// a nil encoding and a nil error are returned.
func (x *Index) Encoding(name string) (encoding.Encoding, error) {
name = strings.TrimSpace(name)
// First try without lowercasing (possibly creating an allocation).
i, ok := x.alias[name]
if !ok {
i, ok = x.alias[strings.ToLower(name)]
if !ok {
return nil, errInvalidName
}
}
return x.enc[i], nil
}
// Name reports the canonical name of the given Encoding. It will return an
// error if the e is not associated with a known encoding scheme.
func (x *Index) Name(e encoding.Encoding) (string, error) {
id, ok := e.(identifier.Interface)
if !ok {
return "", errUnknown
}
mib, _ := id.ID()
if mib == 0 {
return "", errUnknown
}
v := findMIB(x.toMIB, mib)
if v == -1 {
return "", errUnsupported
}
return x.names(v), nil
}
// TODO: the coverage of this index is rather spotty. Allowing users to set
// encodings would allow:
// - users to increase coverage
// - allow a partially loaded set of encodings in case the user doesn't need to
// them all.
// - write an OS-specific wrapper for supported encodings and set them.
// The exact definition of Set depends a bit on if and how we want to let users
// write their own Encoding implementations. Also, it is not possible yet to
// only partially load the encodings without doing some refactoring. Until this
// is solved, we might as well not support Set.
// // Set sets the e to be used for the encoding scheme identified by name. Only
// // canonical names may be used. An empty name assigns e to its internally
// // associated encoding scheme.
// func (x *Index) Set(name string, e encoding.Encoding) error {
// panic("TODO: implement")
// }
func findMIB(x []identifier.MIB, mib identifier.MIB) int {
i := sort.Search(len(x), func(i int) bool { return x[i] >= mib })
if i < len(x) && x[i] == mib {
return i
}
return -1
}
const maxMIMENameLen = '0' - 1 // officially 40, but we leave some buffer.
func mimeName(x int) string {
n := ianaNames[x]
// See gen.go for a description of the encoding.
if n[0] <= maxMIMENameLen {
return n[1:n[0]]
}
return n
}
func ianaName(x int) string {
n := ianaNames[x]
// See gen.go for a description of the encoding.
if n[0] <= maxMIMENameLen {
return n[n[0]:]
}
return n
}
func mibName(x int) string {
return mibNames[x]
}
var encodings = [numIANA]encoding.Encoding{
enc3: asciiEnc,
enc106: unicode.UTF8,
enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM),
enc1013: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
enc1014: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
enc2028: charmap.CodePage037,
enc2011: charmap.CodePage437,
enc2009: charmap.CodePage850,
enc2010: charmap.CodePage852,
enc2046: charmap.CodePage855,
enc2089: charmap.CodePage858,
enc2048: charmap.CodePage860,
enc2013: charmap.CodePage862,
enc2050: charmap.CodePage863,
enc2052: charmap.CodePage865,
enc2086: charmap.CodePage866,
enc2102: charmap.CodePage1047,
enc2091: charmap.CodePage1140,
enc4: charmap.ISO8859_1,
enc5: charmap.ISO8859_2,
enc6: charmap.ISO8859_3,
enc7: charmap.ISO8859_4,
enc8: charmap.ISO8859_5,
enc9: charmap.ISO8859_6,
enc81: charmap.ISO8859_6E,
enc82: charmap.ISO8859_6I,
enc10: charmap.ISO8859_7,
enc11: charmap.ISO8859_8,
enc84: charmap.ISO8859_8E,
enc85: charmap.ISO8859_8I,
enc12: charmap.ISO8859_9,
enc13: charmap.ISO8859_10,
enc109: charmap.ISO8859_13,
enc110: charmap.ISO8859_14,
enc111: charmap.ISO8859_15,
enc112: charmap.ISO8859_16,
enc2084: charmap.KOI8R,
enc2088: charmap.KOI8U,
enc2027: charmap.Macintosh,
enc2109: charmap.Windows874,
enc2250: charmap.Windows1250,
enc2251: charmap.Windows1251,
enc2252: charmap.Windows1252,
enc2253: charmap.Windows1253,
enc2254: charmap.Windows1254,
enc2255: charmap.Windows1255,
enc2256: charmap.Windows1256,
enc2257: charmap.Windows1257,
enc2258: charmap.Windows1258,
enc18: japanese.EUCJP,
enc39: japanese.ISO2022JP,
enc17: japanese.ShiftJIS,
enc38: korean.EUCKR,
enc114: simplifiedchinese.GB18030,
enc113: simplifiedchinese.GBK,
enc2085: simplifiedchinese.HZGB2312,
enc2026: traditionalchinese.Big5,
}

2355
vendor/golang.org/x/text/encoding/ianaindex/tables.go generated vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,81 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go
// Package identifier defines the contract between implementations of Encoding
// and Index by defining identifiers that uniquely identify standardized coded
// character sets (CCS) and character encoding schemes (CES), which we will
// together refer to as encodings, for which Encoding implementations provide
// converters to and from UTF-8. This package is typically only of concern to
// implementers of Indexes and Encodings.
//
// One part of the identifier is the MIB code, which is defined by IANA and
// uniquely identifies a CCS or CES. Each code is associated with data that
// references authorities, official documentation as well as aliases and MIME
// names.
//
// Not all CESs are covered by the IANA registry. The "other" string that is
// returned by ID can be used to identify other character sets or versions of
// existing ones.
//
// It is recommended that each package that provides a set of Encodings provide
// the All and Common variables to reference all supported encodings and
// commonly used subset. This allows Index implementations to include all
// available encodings without explicitly referencing or knowing about them.
package identifier
// Note: this package is internal, but could be made public if there is a need
// for writing third-party Indexes and Encodings.
// References:
// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt
// - http://www.iana.org/assignments/character-sets/character-sets.xhtml
// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib
// - http://www.ietf.org/rfc/rfc2978.txt
// - https://www.unicode.org/reports/tr22/
// - http://www.w3.org/TR/encoding/
// - https://encoding.spec.whatwg.org/
// - https://encoding.spec.whatwg.org/encodings.json
// - https://tools.ietf.org/html/rfc6657#section-5
// Interface can be implemented by Encodings to define the CCS or CES for which
// it implements conversions.
type Interface interface {
// ID returns an encoding identifier. Exactly one of the mib and other
// values should be non-zero.
//
// In the usual case it is only necessary to indicate the MIB code. The
// other string can be used to specify encodings for which there is no MIB,
// such as "x-mac-dingbat".
//
// The other string may only contain the characters a-z, A-Z, 0-9, - and _.
ID() (mib MIB, other string)
// NOTE: the restrictions on the encoding are to allow extending the syntax
// with additional information such as versions, vendors and other variants.
}
// A MIB identifies an encoding. It is derived from the IANA MIB codes and adds
// some identifiers for some encodings that are not covered by the IANA
// standard.
//
// See http://www.iana.org/assignments/ianacharset-mib.
type MIB uint16
// These additional MIB types are not defined in IANA. They are added because
// they are common and defined within the text repo.
const (
// Unofficial marks the start of encodings not registered by IANA.
Unofficial MIB = 10000 + iota
// Replacement is the WhatWG replacement encoding.
Replacement
// XUserDefined is the code for x-user-defined.
XUserDefined
// MacintoshCyrillic is the code for x-mac-cyrillic.
MacintoshCyrillic
)

File diff suppressed because it is too large Load diff

75
vendor/golang.org/x/text/encoding/internal/internal.go generated vendored Normal file
View file

@ -0,0 +1,75 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package internal contains code that is shared among encoding implementations.
package internal
import (
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// Encoding is an implementation of the Encoding interface that adds the String
// and ID methods to an existing encoding.
type Encoding struct {
encoding.Encoding
Name string
MIB identifier.MIB
}
// _ verifies that Encoding implements identifier.Interface.
var _ identifier.Interface = (*Encoding)(nil)
func (e *Encoding) String() string {
return e.Name
}
func (e *Encoding) ID() (mib identifier.MIB, other string) {
return e.MIB, ""
}
// SimpleEncoding is an Encoding that combines two Transformers.
type SimpleEncoding struct {
Decoder transform.Transformer
Encoder transform.Transformer
}
func (e *SimpleEncoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: e.Decoder}
}
func (e *SimpleEncoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: e.Encoder}
}
// FuncEncoding is an Encoding that combines two functions returning a new
// Transformer.
type FuncEncoding struct {
Decoder func() transform.Transformer
Encoder func() transform.Transformer
}
func (e FuncEncoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: e.Decoder()}
}
func (e FuncEncoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: e.Encoder()}
}
// A RepertoireError indicates a rune is not in the repertoire of a destination
// encoding. It is associated with an encoding-specific suggested replacement
// byte.
type RepertoireError byte
// Error implements the error interface.
func (r RepertoireError) Error() string {
return "encoding: rune not supported by encoding."
}
// Replacement returns the replacement string associated with this error.
func (r RepertoireError) Replacement() byte { return byte(r) }
var ErrASCIIReplacement = RepertoireError(encoding.ASCIISub)

12
vendor/golang.org/x/text/encoding/japanese/all.go generated vendored Normal file
View file

@ -0,0 +1,12 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"golang.org/x/text/encoding"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{EUCJP, ISO2022JP, ShiftJIS}

225
vendor/golang.org/x/text/encoding/japanese/eucjp.go generated vendored Normal file
View file

@ -0,0 +1,225 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// EUCJP is the EUC-JP encoding.
var EUCJP encoding.Encoding = &eucJP
var eucJP = internal.Encoding{
&internal.SimpleEncoding{eucJPDecoder{}, eucJPEncoder{}},
"EUC-JP",
identifier.EUCPkdFmtJapanese,
}
type eucJPDecoder struct{ transform.NopResetter }
// See https://encoding.spec.whatwg.org/#euc-jp-decoder.
func (eucJPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case c0 == 0x8e:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
switch {
case c1 < 0xa1:
r, size = utf8.RuneError, 1
case c1 > 0xdf:
r, size = utf8.RuneError, 2
if c1 == 0xff {
size = 1
}
default:
r, size = rune(c1)+(0xff61-0xa1), 2
}
case c0 == 0x8f:
if nSrc+2 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
if p := nSrc + 1; p < len(src) && 0xa1 <= src[p] && src[p] < 0xfe {
size = 2
}
break
}
c1 := src[nSrc+1]
if c1 < 0xa1 || 0xfe < c1 {
r, size = utf8.RuneError, 1
break
}
c2 := src[nSrc+2]
if c2 < 0xa1 || 0xfe < c2 {
r, size = utf8.RuneError, 2
break
}
r, size = utf8.RuneError, 3
if i := int(c1-0xa1)*94 + int(c2-0xa1); i < len(jis0212Decode) {
r = rune(jis0212Decode[i])
if r == 0 {
r = utf8.RuneError
}
}
case 0xa1 <= c0 && c0 <= 0xfe:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
if c1 < 0xa1 || 0xfe < c1 {
r, size = utf8.RuneError, 1
break
}
r, size = utf8.RuneError, 2
if i := int(c0-0xa1)*94 + int(c1-0xa1); i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
if r == 0 {
r = utf8.RuneError
}
}
default:
r, size = utf8.RuneError, 1
}
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type eucJPEncoder struct{ transform.NopResetter }
func (eucJPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2or3
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2or3
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2or3
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2or3
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2or3
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
goto write2
}
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2or3
}
}
err = internal.ErrASCIIReplacement
break
}
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2or3:
if r>>tableShift == jis0208 {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
} else {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = 0x8f
nDst++
}
dst[nDst+0] = 0xa1 + uint8(r>>codeShift)&codeMask
dst[nDst+1] = 0xa1 + uint8(r)&codeMask
nDst += 2
continue
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = 0x8e
dst[nDst+1] = uint8(r - (0xff61 - 0xa1))
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 6 {
panic("bad numEncodeTables")
}
}

299
vendor/golang.org/x/text/encoding/japanese/iso2022jp.go generated vendored Normal file
View file

@ -0,0 +1,299 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// ISO2022JP is the ISO-2022-JP encoding.
var ISO2022JP encoding.Encoding = &iso2022JP
var iso2022JP = internal.Encoding{
internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder},
"ISO-2022-JP",
identifier.ISO2022JP,
}
func iso2022JPNewDecoder() transform.Transformer {
return new(iso2022JPDecoder)
}
func iso2022JPNewEncoder() transform.Transformer {
return new(iso2022JPEncoder)
}
const (
asciiState = iota
katakanaState
jis0208State
jis0212State
)
const asciiEsc = 0x1b
type iso2022JPDecoder int
func (d *iso2022JPDecoder) Reset() {
*d = asciiState
}
func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
c0 := src[nSrc]
if c0 >= utf8.RuneSelf {
r, size = '\ufffd', 1
goto write
}
if c0 == asciiEsc {
if nSrc+2 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
// TODO: is it correct to only skip 1??
r, size = '\ufffd', 1
goto write
}
size = 3
c1 := src[nSrc+1]
c2 := src[nSrc+2]
switch {
case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42}
*d = jis0208State
continue
case c1 == '$' && c2 == '(': // 0x24 0x28
if nSrc+3 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
r, size = '\ufffd', 1
goto write
}
size = 4
if src[nSrc+3] == 'D' {
*d = jis0212State
continue
}
case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A}
*d = asciiState
continue
case c1 == '(' && c2 == 'I': // 0x28 0x49
*d = katakanaState
continue
}
r, size = '\ufffd', 1
goto write
}
switch *d {
case asciiState:
r, size = rune(c0), 1
case katakanaState:
if c0 < 0x21 || 0x60 <= c0 {
r, size = '\ufffd', 1
goto write
}
r, size = rune(c0)+(0xff61-0x21), 1
default:
if c0 == 0x0a {
*d = asciiState
r, size = rune(c0), 1
goto write
}
if nSrc+1 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
r, size = '\ufffd', 1
goto write
}
size = 2
c1 := src[nSrc+1]
i := int(c0-0x21)*94 + int(c1-0x21)
if *d == jis0208State && i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
} else if *d == jis0212State && i < len(jis0212Decode) {
r = rune(jis0212Decode[i])
} else {
r = '\ufffd'
goto write
}
if r == 0 {
r = '\ufffd'
}
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
return nDst, nSrc, transform.ErrShortDst
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type iso2022JPEncoder int
func (e *iso2022JPEncoder) Reset() {
*e = asciiState
}
func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
//
// http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
// is not used by the iso-2022-jp encoder due to lack of widespread support".
//
// TODO: do we have to special-case U+00A5 and U+203E, as per
// http://encoding.spec.whatwg.org/#iso-2022-jp
// Doing so would mean that "\u00a5" would not be preserved
// after an encode-decode round trip.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
goto writeKatakana
}
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
goto writeJIS
}
}
// Switch back to ASCII state in case of error so that an ASCII
// replacement character can be written in the correct state.
if *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
}
err = internal.ErrASCIIReplacement
break
}
if *e != asciiState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
writeJIS:
if *e != jis0208State {
if nDst+5 > len(dst) {
err = transform.ErrShortDst
break
}
*e = jis0208State
dst[nDst+0] = asciiEsc
dst[nDst+1] = '$'
dst[nDst+2] = 'B'
nDst += 3
} else if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask
dst[nDst+1] = 0x21 + uint8(r)&codeMask
nDst += 2
continue
writeKatakana:
if *e != katakanaState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = katakanaState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'I'
nDst += 3
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r - (0xff61 - 0x21))
nDst++
continue
}
if atEOF && err == nil && *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
} else {
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
}
}
return nDst, nSrc, err
}

189
vendor/golang.org/x/text/encoding/japanese/shiftjis.go generated vendored Normal file
View file

@ -0,0 +1,189 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// ShiftJIS is the Shift JIS encoding, also known as Code Page 932 and
// Windows-31J.
var ShiftJIS encoding.Encoding = &shiftJIS
var shiftJIS = internal.Encoding{
&internal.SimpleEncoding{shiftJISDecoder{}, shiftJISEncoder{}},
"Shift JIS",
identifier.ShiftJIS,
}
type shiftJISDecoder struct{ transform.NopResetter }
func (shiftJISDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case 0xa1 <= c0 && c0 < 0xe0:
r, size = rune(c0)+(0xff61-0xa1), 1
case (0x81 <= c0 && c0 < 0xa0) || (0xe0 <= c0 && c0 < 0xfd):
if c0 <= 0x9f {
c0 -= 0x70
} else {
c0 -= 0xb0
}
c0 = 2*c0 - 0x21
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = '\ufffd', 1
goto write
}
c1 := src[nSrc+1]
switch {
case c1 < 0x40:
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
goto write
case c1 < 0x7f:
c0--
c1 -= 0x40
case c1 == 0x7f:
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
goto write
case c1 < 0x9f:
c0--
c1 -= 0x41
case c1 < 0xfd:
c1 -= 0x9f
default:
r, size = '\ufffd', 2
goto write
}
r, size = '\ufffd', 2
if i := int(c0)*94 + int(c1); i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
if r == 0 {
r = '\ufffd'
}
}
case c0 == 0x80:
r, size = 0x80, 1
default:
r, size = '\ufffd', 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type shiftJISEncoder struct{ transform.NopResetter }
func (shiftJISEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break loop
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
goto write2
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
r -= 0xff61 - 0xa1
goto write1
}
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write1:
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2:
j1 := uint8(r>>codeShift) & codeMask
j2 := uint8(r) & codeMask
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break loop
}
if j1 <= 61 {
dst[nDst+0] = 129 + j1/2
} else {
dst[nDst+0] = 193 + j1/2
}
if j1&1 == 0 {
dst[nDst+1] = j2 + j2/63 + 64
} else {
dst[nDst+1] = j2 + 159
}
nDst += 2
continue
}
return nDst, nSrc, err
}

26971
vendor/golang.org/x/text/encoding/japanese/tables.go generated vendored Normal file

File diff suppressed because it is too large Load diff

177
vendor/golang.org/x/text/encoding/korean/euckr.go generated vendored Normal file
View file

@ -0,0 +1,177 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package korean
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{EUCKR}
// EUCKR is the EUC-KR encoding, also known as Code Page 949.
var EUCKR encoding.Encoding = &eucKR
var eucKR = internal.Encoding{
&internal.SimpleEncoding{eucKRDecoder{}, eucKREncoder{}},
"EUC-KR",
identifier.EUCKR,
}
type eucKRDecoder struct{ transform.NopResetter }
func (eucKRDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case 0x81 <= c0 && c0 < 0xff:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
size = 2
if c0 < 0xc7 {
r = 178 * rune(c0-0x81)
switch {
case 0x41 <= c1 && c1 < 0x5b:
r += rune(c1) - (0x41 - 0*26)
case 0x61 <= c1 && c1 < 0x7b:
r += rune(c1) - (0x61 - 1*26)
case 0x81 <= c1 && c1 < 0xff:
r += rune(c1) - (0x81 - 2*26)
default:
goto decError
}
} else if 0xa1 <= c1 && c1 < 0xff {
r = 178*(0xc7-0x81) + rune(c0-0xc7)*94 + rune(c1-0xa1)
} else {
goto decError
}
if int(r) < len(decode) {
r = rune(decode[r])
if r != 0 {
break
}
}
decError:
r = utf8.RuneError
if c1 < utf8.RuneSelf {
size = 1
}
default:
r, size = utf8.RuneError, 1
break
}
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type eucKREncoder struct{ transform.NopResetter }
func (eucKREncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2
}
case encode5Low <= r && r < encode5High:
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2
}
case encode6Low <= r && r < encode6High:
if r = rune(encode6[r-encode6Low]); r != 0 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 7 {
panic("bad numEncodeTables")
}
}

34152
vendor/golang.org/x/text/encoding/korean/tables.go generated vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,12 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"golang.org/x/text/encoding"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{GB18030, GBK, HZGB2312}

View file

@ -0,0 +1,273 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
var (
// GB18030 is the GB18030 encoding.
GB18030 encoding.Encoding = &gbk18030
// GBK is the GBK encoding. It encodes an extension of the GB2312 character set
// and is also known as Code Page 936.
GBK encoding.Encoding = &gbk
)
var gbk = internal.Encoding{
&internal.SimpleEncoding{
gbkDecoder{gb18030: false},
gbkEncoder{gb18030: false},
},
"GBK",
identifier.GBK,
}
var gbk18030 = internal.Encoding{
&internal.SimpleEncoding{
gbkDecoder{gb18030: true},
gbkEncoder{gb18030: true},
},
"GB18030",
identifier.GB18030,
}
type gbkDecoder struct {
transform.NopResetter
gb18030 bool
}
func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
// GBKs decoder is gb18030s decoder. https://encoding.spec.whatwg.org/#gbk-decoder
// If byte is 0x80, return code point U+20AC. https://encoding.spec.whatwg.org/#gb18030-decoder
case c0 == 0x80:
r, size = '€', 1
case c0 < 0xff:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
c1 := src[nSrc+1]
switch {
case 0x40 <= c1 && c1 < 0x7f:
c1 -= 0x40
case 0x80 <= c1 && c1 < 0xff:
c1 -= 0x41
case d.gb18030 && 0x30 <= c1 && c1 < 0x40:
if nSrc+3 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
// The second byte here is always ASCII, so we can set size
// to 1 in all cases.
r, size = utf8.RuneError, 1
goto write
}
c2 := src[nSrc+2]
if c2 < 0x81 || 0xff <= c2 {
r, size = utf8.RuneError, 1
goto write
}
c3 := src[nSrc+3]
if c3 < 0x30 || 0x3a <= c3 {
r, size = utf8.RuneError, 1
goto write
}
size = 4
r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
if r < 39420 {
i, j := 0, len(gb18030)
for i < j {
h := i + (j-i)/2
if r >= rune(gb18030[h][0]) {
i = h + 1
} else {
j = h
}
}
dec := &gb18030[i-1]
r += rune(dec[1]) - rune(dec[0])
goto write
}
r -= 189000
if 0 <= r && r < 0x100000 {
r += 0x10000
} else {
r, size = utf8.RuneError, 1
}
goto write
default:
r, size = utf8.RuneError, 1
goto write
}
r, size = '\ufffd', 2
if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
r = rune(decode[i])
if r == 0 {
r = '\ufffd'
}
}
default:
r, size = utf8.RuneError, 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type gbkEncoder struct {
transform.NopResetter
gb18030 bool
}
func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, r2, size := rune(0), rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
// GBKs encoder is gb18030s encoder with its _is GBK_ set to true. https://encoding.spec.whatwg.org/#gbk-encoder
// If _is GBK_ is true and code point is U+20AC, return byte 0x80. https://encoding.spec.whatwg.org/#gb18030-encoder
if !e.gb18030 && r == '€' {
r = 0x80
goto write1
}
if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
goto write2
}
}
if e.gb18030 {
if r < 0x10000 {
i, j := 0, len(gb18030)
for i < j {
h := i + (j-i)/2
if r >= rune(gb18030[h][1]) {
i = h + 1
} else {
j = h
}
}
dec := &gb18030[i-1]
r += rune(dec[0]) - rune(dec[1])
goto write4
} else if r < 0x110000 {
r += 189000 - 0x10000
goto write4
}
}
err = internal.ErrASCIIReplacement
break
}
write1:
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r2 >> 8)
dst[nDst+1] = uint8(r2)
nDst += 2
continue
write4:
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+3] = uint8(r%10 + 0x30)
r /= 10
dst[nDst+2] = uint8(r%126 + 0x81)
r /= 126
dst[nDst+1] = uint8(r%10 + 0x30)
r /= 10
dst[nDst+0] = uint8(r + 0x81)
nDst += 4
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 5 {
panic("bad numEncodeTables")
}
}

View file

@ -0,0 +1,245 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// HZGB2312 is the HZ-GB2312 encoding.
var HZGB2312 encoding.Encoding = &hzGB2312
var hzGB2312 = internal.Encoding{
internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
"HZ-GB2312",
identifier.HZGB2312,
}
func hzGB2312NewDecoder() transform.Transformer {
return new(hzGB2312Decoder)
}
func hzGB2312NewEncoder() transform.Transformer {
return new(hzGB2312Encoder)
}
const (
asciiState = iota
gbState
)
type hzGB2312Decoder int
func (d *hzGB2312Decoder) Reset() {
*d = asciiState
}
func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
c0 := src[nSrc]
if c0 >= utf8.RuneSelf {
r, size = utf8.RuneError, 1
goto write
}
if c0 == '~' {
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
size = 2
switch src[nSrc+1] {
case '{':
*d = gbState
continue
case '}':
*d = asciiState
continue
case '~':
if nDst >= len(dst) {
err = transform.ErrShortDst
break loop
}
dst[nDst] = '~'
nDst++
continue
case '\n':
continue
default:
r = utf8.RuneError
goto write
}
}
if *d == asciiState {
r, size = rune(c0), 1
} else {
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
size = 2
c1 := src[nSrc+1]
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
// error
} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
r = rune(decode[i])
if r != 0 {
goto write
}
}
if c1 > utf8.RuneSelf {
// Be consistent and always treat non-ASCII as a single error.
size = 1
}
r = utf8.RuneError
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type hzGB2312Encoder int
func (d *hzGB2312Encoder) Reset() {
*d = asciiState
}
func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
if r == '~' {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = '~'
dst[nDst+1] = '~'
nDst += 2
continue
} else if *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = '~'
dst[nDst+1] = '}'
nDst += 2
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst += 1
continue
}
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto writeGB
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto writeGB
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto writeGB
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto writeGB
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto writeGB
}
}
terminateInASCIIState:
// Switch back to ASCII state in case of error so that an ASCII
// replacement character can be written in the correct state.
if *e != asciiState {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = '~'
dst[nDst+1] = '}'
nDst += 2
}
err = internal.ErrASCIIReplacement
break
writeGB:
c0 := uint8(r>>8) - 0x80
c1 := uint8(r) - 0x80
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
goto terminateInASCIIState
}
if *e == asciiState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = gbState
dst[nDst+0] = '~'
dst[nDst+1] = '{'
nDst += 2
} else if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = c0
dst[nDst+1] = c1
nDst += 2
continue
}
// TODO: should one always terminate in ASCII state to make it safe to
// concatenate two HZ-GB2312-encoded strings?
return nDst, nSrc, err
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,199 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package traditionalchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{Big5}
// Big5 is the Big5 encoding, also known as Code Page 950.
var Big5 encoding.Encoding = &big5
var big5 = internal.Encoding{
&internal.SimpleEncoding{big5Decoder{}, big5Encoder{}},
"Big5",
identifier.Big5,
}
type big5Decoder struct{ transform.NopResetter }
func (big5Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size, s := rune(0), 0, ""
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case 0x81 <= c0 && c0 < 0xff:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
c1 := src[nSrc+1]
switch {
case 0x40 <= c1 && c1 < 0x7f:
c1 -= 0x40
case 0xa1 <= c1 && c1 < 0xff:
c1 -= 0x62
case c1 < 0x40:
r, size = utf8.RuneError, 1
goto write
default:
r, size = utf8.RuneError, 2
goto write
}
r, size = '\ufffd', 2
if i := int(c0-0x81)*157 + int(c1); i < len(decode) {
if 1133 <= i && i < 1167 {
// The two-rune special cases for LATIN CAPITAL / SMALL E WITH CIRCUMFLEX
// AND MACRON / CARON are from http://encoding.spec.whatwg.org/#big5
switch i {
case 1133:
s = "\u00CA\u0304"
goto writeStr
case 1135:
s = "\u00CA\u030C"
goto writeStr
case 1164:
s = "\u00EA\u0304"
goto writeStr
case 1166:
s = "\u00EA\u030C"
goto writeStr
}
}
r = rune(decode[i])
if r == 0 {
r = '\ufffd'
}
}
default:
r, size = utf8.RuneError, 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
continue loop
writeStr:
if nDst+len(s) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += copy(dst[nDst:], s)
continue loop
}
return nDst, nSrc, err
}
type big5Encoder struct{ transform.NopResetter }
func (big5Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
}
if r >= utf8.RuneSelf {
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2
}
case encode5Low <= r && r < encode5High:
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2
}
case encode6Low <= r && r < encode6High:
if r = rune(encode6[r-encode6Low]); r != 0 {
goto write2
}
case encode7Low <= r && r < encode7High:
if r = rune(encode7[r-encode7Low]); r != 0 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 8 {
panic("bad numEncodeTables")
}
}

File diff suppressed because it is too large Load diff

82
vendor/golang.org/x/text/encoding/unicode/override.go generated vendored Normal file
View file

@ -0,0 +1,82 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode
import (
"golang.org/x/text/transform"
)
// BOMOverride returns a new decoder transformer that is identical to fallback,
// except that the presence of a Byte Order Mark at the start of the input
// causes it to switch to the corresponding Unicode decoding. It will only
// consider BOMs for UTF-8, UTF-16BE, and UTF-16LE.
//
// This differs from using ExpectBOM by allowing a BOM to switch to UTF-8, not
// just UTF-16 variants, and allowing falling back to any encoding scheme.
//
// This technique is recommended by the W3C for use in HTML 5: "For
// compatibility with deployed content, the byte order mark (also known as BOM)
// is considered more authoritative than anything else."
// http://www.w3.org/TR/encoding/#specification-hooks
//
// Using BOMOverride is mostly intended for use cases where the first characters
// of a fallback encoding are known to not be a BOM, for example, for valid HTML
// and most encodings.
func BOMOverride(fallback transform.Transformer) transform.Transformer {
// TODO: possibly allow a variadic argument of unicode encodings to allow
// specifying details of which fallbacks are supported as well as
// specifying the details of the implementations. This would also allow for
// support for UTF-32, which should not be supported by default.
return &bomOverride{fallback: fallback}
}
type bomOverride struct {
fallback transform.Transformer
current transform.Transformer
}
func (d *bomOverride) Reset() {
d.current = nil
d.fallback.Reset()
}
var (
// TODO: we could use decode functions here, instead of allocating a new
// decoder on every NewDecoder as IgnoreBOM decoders can be stateless.
utf16le = UTF16(LittleEndian, IgnoreBOM)
utf16be = UTF16(BigEndian, IgnoreBOM)
)
const utf8BOM = "\ufeff"
func (d *bomOverride) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if d.current != nil {
return d.current.Transform(dst, src, atEOF)
}
if len(src) < 3 && !atEOF {
return 0, 0, transform.ErrShortSrc
}
d.current = d.fallback
bomSize := 0
if len(src) >= 2 {
if src[0] == 0xFF && src[1] == 0xFE {
d.current = utf16le.NewDecoder()
bomSize = 2
} else if src[0] == 0xFE && src[1] == 0xFF {
d.current = utf16be.NewDecoder()
bomSize = 2
} else if len(src) >= 3 &&
src[0] == utf8BOM[0] &&
src[1] == utf8BOM[1] &&
src[2] == utf8BOM[2] {
d.current = transform.Nop
bomSize = 3
}
}
if bomSize < len(src) {
nDst, nSrc, err = d.current.Transform(dst, src[bomSize:], atEOF)
}
return nDst, nSrc + bomSize, err
}

512
vendor/golang.org/x/text/encoding/unicode/unicode.go generated vendored Normal file
View file

@ -0,0 +1,512 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package unicode provides Unicode encodings such as UTF-16.
package unicode // import "golang.org/x/text/encoding/unicode"
import (
"bytes"
"errors"
"unicode/utf16"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/internal/utf8internal"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
)
// TODO: I think the Transformers really should return errors on unmatched
// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781,
// which leaves it open, but is suggested by WhatWG. It will allow for all error
// modes as defined by WhatWG: fatal, HTML and Replacement. This would require
// the introduction of some kind of error type for conveying the erroneous code
// point.
// UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks.
var UTF8 encoding.Encoding = utf8enc
// UTF8BOM is an UTF-8 encoding where the decoder strips a leading byte order
// mark while the encoder adds one.
//
// Some editors add a byte order mark as a signature to UTF-8 files. Although
// the byte order mark is not useful for detecting byte order in UTF-8, it is
// sometimes used as a convention to mark UTF-8-encoded files. This relies on
// the observation that the UTF-8 byte order mark is either an illegal or at
// least very unlikely sequence in any other character encoding.
var UTF8BOM encoding.Encoding = utf8bomEncoding{}
type utf8bomEncoding struct{}
func (utf8bomEncoding) String() string {
return "UTF-8-BOM"
}
func (utf8bomEncoding) ID() (identifier.MIB, string) {
return identifier.Unofficial, "x-utf8bom"
}
func (utf8bomEncoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{
Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()},
}
}
func (utf8bomEncoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: &utf8bomDecoder{}}
}
var utf8enc = &internal.Encoding{
&internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
"UTF-8",
identifier.UTF8,
}
type utf8bomDecoder struct {
checked bool
}
func (t *utf8bomDecoder) Reset() {
t.checked = false
}
func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if !t.checked {
if !atEOF && len(src) < len(utf8BOM) {
if len(src) == 0 {
return 0, 0, nil
}
return 0, 0, transform.ErrShortSrc
}
if bytes.HasPrefix(src, []byte(utf8BOM)) {
nSrc += len(utf8BOM)
src = src[len(utf8BOM):]
}
t.checked = true
}
nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
nSrc += n
return nDst, nSrc, err
}
type utf8bomEncoder struct {
written bool
t transform.Transformer
}
func (t *utf8bomEncoder) Reset() {
t.written = false
t.t.Reset()
}
func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if !t.written {
if len(dst) < len(utf8BOM) {
return nDst, 0, transform.ErrShortDst
}
nDst = copy(dst, utf8BOM)
t.written = true
}
n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
nDst += n
return nDst, nSrc, err
}
type utf8Decoder struct{ transform.NopResetter }
func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
var pSrc int // point from which to start copy in src
var accept utf8internal.AcceptRange
// The decoder can only make the input larger, not smaller.
n := len(src)
if len(dst) < n {
err = transform.ErrShortDst
n = len(dst)
atEOF = false
}
for nSrc < n {
c := src[nSrc]
if c < utf8.RuneSelf {
nSrc++
continue
}
first := utf8internal.First[c]
size := int(first & utf8internal.SizeMask)
if first == utf8internal.FirstInvalid {
goto handleInvalid // invalid starter byte
}
accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
if nSrc+size > n {
if !atEOF {
// We may stop earlier than necessary here if the short sequence
// has invalid bytes. Not checking for this simplifies the code
// and may avoid duplicate computations in certain conditions.
if err == nil {
err = transform.ErrShortSrc
}
break
}
// Determine the maximal subpart of an ill-formed subsequence.
switch {
case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]:
size = 1
case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]:
size = 2
default:
size = 3 // As we are short, the maximum is 3.
}
goto handleInvalid
}
if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {
size = 1
goto handleInvalid // invalid continuation byte
} else if size == 2 {
} else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {
size = 2
goto handleInvalid // invalid continuation byte
} else if size == 3 {
} else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {
size = 3
goto handleInvalid // invalid continuation byte
}
nSrc += size
continue
handleInvalid:
// Copy the scanned input so far.
nDst += copy(dst[nDst:], src[pSrc:nSrc])
// Append RuneError to the destination.
const runeError = "\ufffd"
if nDst+len(runeError) > len(dst) {
return nDst, nSrc, transform.ErrShortDst
}
nDst += copy(dst[nDst:], runeError)
// Skip the maximal subpart of an ill-formed subsequence according to
// the W3C standard way instead of the Go way. This Transform is
// probably the only place in the text repo where it is warranted.
nSrc += size
pSrc = nSrc
// Recompute the maximum source length.
if sz := len(dst) - nDst; sz < len(src)-nSrc {
err = transform.ErrShortDst
n = nSrc + sz
atEOF = false
}
}
return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
}
// UTF16 returns a UTF-16 Encoding for the given default endianness and byte
// order mark (BOM) policy.
//
// When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then
// neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect
// the endianness used for decoding, and will instead be output as their
// standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy
// is UseBOM or ExpectBOM a staring BOM is not written to the UTF-8 output.
// Instead, it overrides the default endianness e for the remainder of the
// transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not
// affect the endianness used, and will instead be output as their standard
// UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed
// with the default Endianness. For ExpectBOM, in that case, the transformation
// will return early with an ErrMissingBOM error.
//
// When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of
// the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not
// be inserted. The UTF-8 input does not need to contain a BOM.
//
// There is no concept of a 'native' endianness. If the UTF-16 data is produced
// and consumed in a greater context that implies a certain endianness, use
// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
//
// In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
// corresponds to "Where the precise type of the data stream is known... the
// BOM should not be used" and ExpectBOM corresponds to "A particular
// protocol... may require use of the BOM".
func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {
return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]}
}
// mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that
// some configurations map to the same MIB identifier. RFC 2781 has requirements
// and recommendations. Some of the "configurations" are merely recommendations,
// so multiple configurations could match.
var mibValue = map[Endianness][numBOMValues]identifier.MIB{
BigEndian: [numBOMValues]identifier.MIB{
IgnoreBOM: identifier.UTF16BE,
UseBOM: identifier.UTF16, // BigEnding default is preferred by RFC 2781.
// TODO: acceptBOM | strictBOM would map to UTF16BE as well.
},
LittleEndian: [numBOMValues]identifier.MIB{
IgnoreBOM: identifier.UTF16LE,
UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.
// TODO: acceptBOM | strictBOM would map to UTF16LE as well.
},
// ExpectBOM is not widely used and has no valid MIB identifier.
}
// All lists a configuration for each IANA-defined UTF-16 variant.
var All = []encoding.Encoding{
UTF8,
UTF16(BigEndian, UseBOM),
UTF16(BigEndian, IgnoreBOM),
UTF16(LittleEndian, IgnoreBOM),
}
// BOMPolicy is a UTF-16 encoding's byte order mark policy.
type BOMPolicy uint8
const (
writeBOM BOMPolicy = 0x01
acceptBOM BOMPolicy = 0x02
requireBOM BOMPolicy = 0x04
bomMask BOMPolicy = 0x07
// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
// map of an array of length 8 of a type that is also used as a key or value
// in another map). See golang.org/issue/11354.
// TODO: consider changing this value back to 8 if the use of 1.4.* has
// been minimized.
numBOMValues = 8 + 1
// IgnoreBOM means to ignore any byte order marks.
IgnoreBOM BOMPolicy = 0
// Common and RFC 2781-compliant interpretation for UTF-16BE/LE.
// UseBOM means that the UTF-16 form may start with a byte order mark, which
// will be used to override the default encoding.
UseBOM BOMPolicy = writeBOM | acceptBOM
// Common and RFC 2781-compliant interpretation for UTF-16.
// ExpectBOM means that the UTF-16 form must start with a byte order mark,
// which will be used to override the default encoding.
ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
// Used in Java as Unicode (not to be confused with Java's UTF-16) and
// ICU's UTF-16,version=1. Not compliant with RFC 2781.
// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
// (UnicodeBig and UnicodeLittle in Java)
// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
// acceptBOM | strictBOM (e.g. assigned to CheckBOM).
// This addition would be consistent with supporting ExpectBOM.
)
// Endianness is a UTF-16 encoding's default endianness.
type Endianness bool
const (
// BigEndian is UTF-16BE.
BigEndian Endianness = false
// LittleEndian is UTF-16LE.
LittleEndian Endianness = true
)
// ErrMissingBOM means that decoding UTF-16 input with ExpectBOM did not find a
// starting byte order mark.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
type utf16Encoding struct {
config
mib identifier.MIB
}
type config struct {
endianness Endianness
bomPolicy BOMPolicy
}
func (u utf16Encoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: &utf16Decoder{
initial: u.config,
current: u.config,
}}
}
func (u utf16Encoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: &utf16Encoder{
endianness: u.endianness,
initialBOMPolicy: u.bomPolicy,
currentBOMPolicy: u.bomPolicy,
}}
}
func (u utf16Encoding) ID() (mib identifier.MIB, other string) {
return u.mib, ""
}
func (u utf16Encoding) String() string {
e, b := "B", ""
if u.endianness == LittleEndian {
e = "L"
}
switch u.bomPolicy {
case ExpectBOM:
b = "Expect"
case UseBOM:
b = "Use"
case IgnoreBOM:
b = "Ignore"
}
return "UTF-16" + e + "E (" + b + " BOM)"
}
type utf16Decoder struct {
initial config
current config
}
func (u *utf16Decoder) Reset() {
u.current = u.initial
}
func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 {
return 0, 0, ErrMissingBOM
}
if len(src) == 0 {
return 0, 0, nil
}
if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 {
switch {
case src[0] == 0xfe && src[1] == 0xff:
u.current.endianness = BigEndian
nSrc = 2
case src[0] == 0xff && src[1] == 0xfe:
u.current.endianness = LittleEndian
nSrc = 2
default:
if u.current.bomPolicy&requireBOM != 0 {
return 0, 0, ErrMissingBOM
}
}
u.current.bomPolicy = IgnoreBOM
}
var r rune
var dSize, sSize int
for nSrc < len(src) {
if nSrc+1 < len(src) {
x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1])
if u.current.endianness == LittleEndian {
x = x>>8 | x<<8
}
r, sSize = rune(x), 2
if utf16.IsSurrogate(r) {
if nSrc+3 < len(src) {
x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3])
if u.current.endianness == LittleEndian {
x = x>>8 | x<<8
}
// Save for next iteration if it is not a high surrogate.
if isHighSurrogate(rune(x)) {
r, sSize = utf16.DecodeRune(r, rune(x)), 4
}
} else if !atEOF {
err = transform.ErrShortSrc
break
}
}
if dSize = utf8.RuneLen(r); dSize < 0 {
r, dSize = utf8.RuneError, 3
}
} else if atEOF {
// Single trailing byte.
r, dSize, sSize = utf8.RuneError, 3, 1
} else {
err = transform.ErrShortSrc
break
}
if nDst+dSize > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
nSrc += sSize
}
return nDst, nSrc, err
}
func isHighSurrogate(r rune) bool {
return 0xDC00 <= r && r <= 0xDFFF
}
type utf16Encoder struct {
endianness Endianness
initialBOMPolicy BOMPolicy
currentBOMPolicy BOMPolicy
}
func (u *utf16Encoder) Reset() {
u.currentBOMPolicy = u.initialBOMPolicy
}
func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if u.currentBOMPolicy&writeBOM != 0 {
if len(dst) < 2 {
return 0, 0, transform.ErrShortDst
}
dst[0], dst[1] = 0xfe, 0xff
u.currentBOMPolicy = IgnoreBOM
nDst = 2
}
r, size := rune(0), 0
for nSrc < len(src) {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
}
if r <= 0xffff {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
} else {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
r1, r2 := utf16.EncodeRune(r)
dst[nDst+0] = uint8(r1 >> 8)
dst[nDst+1] = uint8(r1)
dst[nDst+2] = uint8(r2 >> 8)
dst[nDst+3] = uint8(r2)
nDst += 4
}
nSrc += size
}
if u.endianness == LittleEndian {
for i := 0; i < nDst; i += 2 {
dst[i], dst[i+1] = dst[i+1], dst[i]
}
}
return nDst, nSrc, err
}

View file

@ -0,0 +1,87 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package utf8internal contains low-level utf8-related constants, tables, etc.
// that are used internally by the text package.
package utf8internal
// The default lowest and highest continuation byte.
const (
LoCB = 0x80 // 1000 0000
HiCB = 0xBF // 1011 1111
)
// Constants related to getting information of first bytes of UTF-8 sequences.
const (
// ASCII identifies a UTF-8 byte as ASCII.
ASCII = as
// FirstInvalid indicates a byte is invalid as a first byte of a UTF-8
// sequence.
FirstInvalid = xx
// SizeMask is a mask for the size bits. Use use x&SizeMask to get the size.
SizeMask = 7
// AcceptShift is the right-shift count for the first byte info byte to get
// the index into the AcceptRanges table. See AcceptRanges.
AcceptShift = 4
// The names of these constants are chosen to give nice alignment in the
// table below. The first nibble is an index into acceptRanges or F for
// special one-byte cases. The second nibble is the Rune length or the
// Status for the special one-byte case.
xx = 0xF1 // invalid: size 1
as = 0xF0 // ASCII: size 1
s1 = 0x02 // accept 0, size 2
s2 = 0x13 // accept 1, size 3
s3 = 0x03 // accept 0, size 3
s4 = 0x23 // accept 2, size 3
s5 = 0x34 // accept 3, size 4
s6 = 0x04 // accept 0, size 4
s7 = 0x44 // accept 4, size 4
)
// First is information about the first byte in a UTF-8 sequence.
var First = [256]uint8{
// 1 2 3 4 5 6 7 8 9 A B C D E F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
// 1 2 3 4 5 6 7 8 9 A B C D E F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
// AcceptRange gives the range of valid values for the second byte in a UTF-8
// sequence for any value for First that is not ASCII or FirstInvalid.
type AcceptRange struct {
Lo uint8 // lowest value for second byte.
Hi uint8 // highest value for second byte.
}
// AcceptRanges is a slice of AcceptRange values. For a given byte sequence b
//
// AcceptRanges[First[b[0]]>>AcceptShift]
//
// will give the value of AcceptRange for the multi-byte UTF-8 sequence starting
// at b[0].
var AcceptRanges = [...]AcceptRange{
0: {LoCB, HiCB},
1: {0xA0, HiCB},
2: {LoCB, 0x9F},
3: {0x90, HiCB},
4: {LoCB, 0x8F},
}

187
vendor/golang.org/x/text/runes/cond.go generated vendored Normal file
View file

@ -0,0 +1,187 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runes
import (
"unicode/utf8"
"golang.org/x/text/transform"
)
// Note: below we pass invalid UTF-8 to the tIn and tNotIn transformers as is.
// This is done for various reasons:
// - To retain the semantics of the Nop transformer: if input is passed to a Nop
// one would expect it to be unchanged.
// - It would be very expensive to pass a converted RuneError to a transformer:
// a transformer might need more source bytes after RuneError, meaning that
// the only way to pass it safely is to create a new buffer and manage the
// intermingling of RuneErrors and normal input.
// - Many transformers leave ill-formed UTF-8 as is, so this is not
// inconsistent. Generally ill-formed UTF-8 is only replaced if it is a
// logical consequence of the operation (as for Map) or if it otherwise would
// pose security concerns (as for Remove).
// - An alternative would be to return an error on ill-formed UTF-8, but this
// would be inconsistent with other operations.
// If returns a transformer that applies tIn to consecutive runes for which
// s.Contains(r) and tNotIn to consecutive runes for which !s.Contains(r). Reset
// is called on tIn and tNotIn at the start of each run. A Nop transformer will
// substitute a nil value passed to tIn or tNotIn. Invalid UTF-8 is translated
// to RuneError to determine which transformer to apply, but is passed as is to
// the respective transformer.
func If(s Set, tIn, tNotIn transform.Transformer) Transformer {
if tIn == nil && tNotIn == nil {
return Transformer{transform.Nop}
}
if tIn == nil {
tIn = transform.Nop
}
if tNotIn == nil {
tNotIn = transform.Nop
}
sIn, ok := tIn.(transform.SpanningTransformer)
if !ok {
sIn = dummySpan{tIn}
}
sNotIn, ok := tNotIn.(transform.SpanningTransformer)
if !ok {
sNotIn = dummySpan{tNotIn}
}
a := &cond{
tIn: sIn,
tNotIn: sNotIn,
f: s.Contains,
}
a.Reset()
return Transformer{a}
}
type dummySpan struct{ transform.Transformer }
func (d dummySpan) Span(src []byte, atEOF bool) (n int, err error) {
return 0, transform.ErrEndOfSpan
}
type cond struct {
tIn, tNotIn transform.SpanningTransformer
f func(rune) bool
check func(rune) bool // current check to perform
t transform.SpanningTransformer // current transformer to use
}
// Reset implements transform.Transformer.
func (t *cond) Reset() {
t.check = t.is
t.t = t.tIn
t.t.Reset() // notIn will be reset on first usage.
}
func (t *cond) is(r rune) bool {
if t.f(r) {
return true
}
t.check = t.isNot
t.t = t.tNotIn
t.tNotIn.Reset()
return false
}
func (t *cond) isNot(r rune) bool {
if !t.f(r) {
return true
}
t.check = t.is
t.t = t.tIn
t.tIn.Reset()
return false
}
// This implementation of Span doesn't help all too much, but it needs to be
// there to satisfy this package's Transformer interface.
// TODO: there are certainly room for improvements, though. For example, if
// t.t == transform.Nop (which will a common occurrence) it will save a bundle
// to special-case that loop.
func (t *cond) Span(src []byte, atEOF bool) (n int, err error) {
p := 0
for n < len(src) && err == nil {
// Don't process too much at a time as the Spanner that will be
// called on this block may terminate early.
const maxChunk = 4096
max := len(src)
if v := n + maxChunk; v < max {
max = v
}
atEnd := false
size := 0
current := t.t
for ; p < max; p += size {
r := rune(src[p])
if r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
if !atEOF && !utf8.FullRune(src[p:]) {
err = transform.ErrShortSrc
break
}
}
if !t.check(r) {
// The next rune will be the start of a new run.
atEnd = true
break
}
}
n2, err2 := current.Span(src[n:p], atEnd || (atEOF && p == len(src)))
n += n2
if err2 != nil {
return n, err2
}
// At this point either err != nil or t.check will pass for the rune at p.
p = n + size
}
return n, err
}
func (t *cond) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
p := 0
for nSrc < len(src) && err == nil {
// Don't process too much at a time, as the work might be wasted if the
// destination buffer isn't large enough to hold the result or a
// transform returns an error early.
const maxChunk = 4096
max := len(src)
if n := nSrc + maxChunk; n < len(src) {
max = n
}
atEnd := false
size := 0
current := t.t
for ; p < max; p += size {
r := rune(src[p])
if r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
if !atEOF && !utf8.FullRune(src[p:]) {
err = transform.ErrShortSrc
break
}
}
if !t.check(r) {
// The next rune will be the start of a new run.
atEnd = true
break
}
}
nDst2, nSrc2, err2 := current.Transform(dst[nDst:], src[nSrc:p], atEnd || (atEOF && p == len(src)))
nDst += nDst2
nSrc += nSrc2
if err2 != nil {
return nDst, nSrc, err2
}
// At this point either err != nil or t.check will pass for the rune at p.
p = nSrc + size
}
return nDst, nSrc, err
}

355
vendor/golang.org/x/text/runes/runes.go generated vendored Normal file
View file

@ -0,0 +1,355 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package runes provide transforms for UTF-8 encoded text.
package runes // import "golang.org/x/text/runes"
import (
"unicode"
"unicode/utf8"
"golang.org/x/text/transform"
)
// A Set is a collection of runes.
type Set interface {
// Contains returns true if r is contained in the set.
Contains(r rune) bool
}
type setFunc func(rune) bool
func (s setFunc) Contains(r rune) bool {
return s(r)
}
// Note: using funcs here instead of wrapping types result in cleaner
// documentation and a smaller API.
// In creates a Set with a Contains method that returns true for all runes in
// the given RangeTable.
func In(rt *unicode.RangeTable) Set {
return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
}
// NotIn creates a Set with a Contains method that returns true for all runes not
// in the given RangeTable.
func NotIn(rt *unicode.RangeTable) Set {
return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
}
// Predicate creates a Set with a Contains method that returns f(r).
func Predicate(f func(rune) bool) Set {
return setFunc(f)
}
// Transformer implements the transform.Transformer interface.
type Transformer struct {
t transform.SpanningTransformer
}
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
return t.t.Transform(dst, src, atEOF)
}
func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
return t.t.Span(b, atEOF)
}
func (t Transformer) Reset() { t.t.Reset() }
// Bytes returns a new byte slice with the result of converting b using t. It
// calls Reset on t. It returns nil if any error was found. This can only happen
// if an error-producing Transformer is passed to If.
func (t Transformer) Bytes(b []byte) []byte {
b, _, err := transform.Bytes(t, b)
if err != nil {
return nil
}
return b
}
// String returns a string with the result of converting s using t. It calls
// Reset on t. It returns the empty string if any error was found. This can only
// happen if an error-producing Transformer is passed to If.
func (t Transformer) String(s string) string {
s, _, err := transform.String(t, s)
if err != nil {
return ""
}
return s
}
// TODO:
// - Copy: copying strings and bytes in whole-rune units.
// - Validation (maybe)
// - Well-formed-ness (maybe)
const runeErrorString = string(utf8.RuneError)
// Remove returns a Transformer that removes runes r for which s.Contains(r).
// Illegal input bytes are replaced by RuneError before being passed to f.
func Remove(s Set) Transformer {
if f, ok := s.(setFunc); ok {
// This little trick cuts the running time of BenchmarkRemove for sets
// created by Predicate roughly in half.
// TODO: special-case RangeTables as well.
return Transformer{remove(f)}
}
return Transformer{remove(s.Contains)}
}
// TODO: remove transform.RemoveFunc.
type remove func(r rune) bool
func (remove) Reset() {}
// Span implements transform.Spanner.
func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
for r, size := rune(0), 0; n < len(src); {
if r = rune(src[n]); r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[n:]) {
err = transform.ErrShortSrc
} else {
err = transform.ErrEndOfSpan
}
break
}
if t(r) {
err = transform.ErrEndOfSpan
break
}
n += size
}
return
}
// Transform implements transform.Transformer.
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for r, size := rune(0), 0; nSrc < len(src); {
if r = rune(src[nSrc]); r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
// We replace illegal bytes with RuneError. Not doing so might
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
// The resulting byte sequence may subsequently contain runes
// for which t(r) is true that were passed unnoticed.
if !t(utf8.RuneError) {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = runeErrorString[0]
dst[nDst+1] = runeErrorString[1]
dst[nDst+2] = runeErrorString[2]
nDst += 3
}
nSrc++
continue
}
if t(r) {
nSrc += size
continue
}
if nDst+size > len(dst) {
err = transform.ErrShortDst
break
}
for i := 0; i < size; i++ {
dst[nDst] = src[nSrc]
nDst++
nSrc++
}
}
return
}
// Map returns a Transformer that maps the runes in the input using the given
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
// being passed to the mapping func.
func Map(mapping func(rune) rune) Transformer {
return Transformer{mapper(mapping)}
}
type mapper func(rune) rune
func (mapper) Reset() {}
// Span implements transform.Spanner.
func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
for r, size := rune(0), 0; n < len(src); n += size {
if r = rune(src[n]); r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[n:]) {
err = transform.ErrShortSrc
} else {
err = transform.ErrEndOfSpan
}
break
}
if t(r) != r {
err = transform.ErrEndOfSpan
break
}
}
return n, err
}
// Transform implements transform.Transformer.
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
var replacement rune
var b [utf8.UTFMax]byte
for r, size := rune(0), 0; nSrc < len(src); {
if r = rune(src[nSrc]); r < utf8.RuneSelf {
if replacement = t(r); replacement < utf8.RuneSelf {
if nDst == len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = byte(replacement)
nDst++
nSrc++
continue
}
size = 1
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = runeErrorString[0]
dst[nDst+1] = runeErrorString[1]
dst[nDst+2] = runeErrorString[2]
nDst += 3
nSrc++
continue
}
} else if replacement = t(r); replacement == r {
if nDst+size > len(dst) {
err = transform.ErrShortDst
break
}
for i := 0; i < size; i++ {
dst[nDst] = src[nSrc]
nDst++
nSrc++
}
continue
}
n := utf8.EncodeRune(b[:], replacement)
if nDst+n > len(dst) {
err = transform.ErrShortDst
break
}
for i := 0; i < n; i++ {
dst[nDst] = b[i]
nDst++
}
nSrc += size
}
return
}
// ReplaceIllFormed returns a transformer that replaces all input bytes that are
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
func ReplaceIllFormed() Transformer {
return Transformer{&replaceIllFormed{}}
}
type replaceIllFormed struct{ transform.NopResetter }
func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
for n < len(src) {
// ASCII fast path.
if src[n] < utf8.RuneSelf {
n++
continue
}
r, size := utf8.DecodeRune(src[n:])
// Look for a valid non-ASCII rune.
if r != utf8.RuneError || size != 1 {
n += size
continue
}
// Look for short source data.
if !atEOF && !utf8.FullRune(src[n:]) {
err = transform.ErrShortSrc
break
}
// We have an invalid rune.
err = transform.ErrEndOfSpan
break
}
return n, err
}
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for nSrc < len(src) {
// ASCII fast path.
if r := src[nSrc]; r < utf8.RuneSelf {
if nDst == len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = r
nDst++
nSrc++
continue
}
// Look for a valid non-ASCII rune.
if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
err = transform.ErrShortDst
break
}
nDst += size
nSrc += size
continue
}
// Look for short source data.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
// We have an invalid rune.
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = runeErrorString[0]
dst[nDst+1] = runeErrorString[1]
dst[nDst+2] = runeErrorString[2]
nDst += 3
nSrc++
}
return nDst, nSrc, err
}

12
vendor/modules.txt vendored
View file

@ -88,6 +88,18 @@ golang.org/x/sys/unix
golang.org/x/sys/windows
# golang.org/x/text v0.10.0
## explicit; go 1.17
golang.org/x/text/encoding
golang.org/x/text/encoding/charmap
golang.org/x/text/encoding/ianaindex
golang.org/x/text/encoding/internal
golang.org/x/text/encoding/internal/identifier
golang.org/x/text/encoding/japanese
golang.org/x/text/encoding/korean
golang.org/x/text/encoding/simplifiedchinese
golang.org/x/text/encoding/traditionalchinese
golang.org/x/text/encoding/unicode
golang.org/x/text/internal/utf8internal
golang.org/x/text/runes
golang.org/x/text/secure/bidirule
golang.org/x/text/transform
golang.org/x/text/unicode/bidi