You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							335 lines
						
					
					
						
							9.5 KiB
						
					
					
				
			
		
		
	
	
							335 lines
						
					
					
						
							9.5 KiB
						
					
					
				| // Copyright 2013 The Go Authors. All rights reserved.
 | |
| // Use of this source code is governed by a BSD-style
 | |
| // license that can be found in the LICENSE file.
 | |
| 
 | |
| // Package encoding defines an interface for character encodings, such as Shift
 | |
| // JIS and Windows 1252, that can convert to and from UTF-8.
 | |
| //
 | |
| // Encoding implementations are provided in other packages, such as
 | |
| // golang.org/x/text/encoding/charmap and
 | |
| // golang.org/x/text/encoding/japanese.
 | |
| package encoding // import "golang.org/x/text/encoding"
 | |
| 
 | |
| import (
 | |
| 	"errors"
 | |
| 	"io"
 | |
| 	"strconv"
 | |
| 	"unicode/utf8"
 | |
| 
 | |
| 	"golang.org/x/text/encoding/internal/identifier"
 | |
| 	"golang.org/x/text/transform"
 | |
| )
 | |
| 
 | |
| // TODO:
 | |
| // - There seems to be some inconsistency in when decoders return errors
 | |
| //   and when not. Also documentation seems to suggest they shouldn't return
 | |
| //   errors at all (except for UTF-16).
 | |
| // - Encoders seem to rely on or at least benefit from the input being in NFC
 | |
| //   normal form. Perhaps add an example how users could prepare their output.
 | |
| 
 | |
// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
//
// The two methods are the entry points for the two directions of conversion.
type Encoding interface {
	// NewDecoder returns a Decoder. A Decoder converts bytes to UTF-8.
	NewDecoder() *Decoder

	// NewEncoder returns an Encoder. An Encoder converts bytes from UTF-8.
	NewEncoder() *Encoder
}
 | |
| 
 | |
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
	// Transformer performs the actual conversion; its methods are promoted
	// onto Decoder through embedding.
	transform.Transformer

	// This forces external creators of Decoders to use names in struct
	// initializers, allowing for future extensibility without having to break
	// code.
	_ struct{}
}
 | |
| 
 | |
| // Bytes converts the given encoded bytes to UTF-8. It returns the converted
 | |
| // bytes or nil, err if any error occurred.
 | |
| func (d *Decoder) Bytes(b []byte) ([]byte, error) {
 | |
| 	b, _, err := transform.Bytes(d, b)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	return b, nil
 | |
| }
 | |
| 
 | |
| // String converts the given encoded string to UTF-8. It returns the converted
 | |
| // string or "", err if any error occurred.
 | |
| func (d *Decoder) String(s string) (string, error) {
 | |
| 	s, _, err := transform.String(d, s)
 | |
| 	if err != nil {
 | |
| 		return "", err
 | |
| 	}
 | |
| 	return s, nil
 | |
| }
 | |
| 
 | |
| // Reader wraps another Reader to decode its bytes.
 | |
| //
 | |
| // The Decoder may not be used for any other operation as long as the returned
 | |
| // Reader is in use.
 | |
| func (d *Decoder) Reader(r io.Reader) io.Reader {
 | |
| 	return transform.NewReader(r, d)
 | |
| }
 | |
| 
 | |
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
//
// Each rune that cannot be transcoded will result in an error. In this case,
// the transform will consume all source bytes up to, but not including, the
// offending rune. Source bytes that are not valid UTF-8 will be replaced by
// `\uFFFD`. To return early with an error instead, use transform.Chain to
// preprocess the data with a UTF8Validator.
type Encoder struct {
	// Transformer performs the actual conversion; its methods are promoted
	// onto Encoder through embedding.
	transform.Transformer

	// This forces external creators of Encoders to use names in struct
	// initializers, allowing for future extensibility without having to break
	// code.
	_ struct{}
}
 | |
| 
 | |
| // Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
 | |
| // any error occurred.
 | |
| func (e *Encoder) Bytes(b []byte) ([]byte, error) {
 | |
| 	b, _, err := transform.Bytes(e, b)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	return b, nil
 | |
| }
 | |
| 
 | |
| // String converts a string from UTF-8. It returns the converted string or
 | |
| // "", err if any error occurred.
 | |
| func (e *Encoder) String(s string) (string, error) {
 | |
| 	s, _, err := transform.String(e, s)
 | |
| 	if err != nil {
 | |
| 		return "", err
 | |
| 	}
 | |
| 	return s, nil
 | |
| }
 | |
| 
 | |
| // Writer wraps another Writer to encode its UTF-8 output.
 | |
| //
 | |
| // The Encoder may not be used for any other operation as long as the returned
 | |
| // Writer is in use.
 | |
| func (e *Encoder) Writer(w io.Writer) io.Writer {
 | |
| 	return transform.NewWriter(w, e)
 | |
| }
 | |
| 
 | |
// ASCIISub is the ASCII substitute character (byte value 0x1A), as
// recommended by http://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
 | |
| 
 | |
// Nop is the nop encoding. Its transformed bytes are the same as the source
// bytes; it does not replace invalid UTF-8 sequences.
//
// Both its decoder and its encoder are backed by transform.Nop.
var Nop Encoding = nop{}
 | |
| 
 | |
| type nop struct{}
 | |
| 
 | |
| func (nop) NewDecoder() *Decoder {
 | |
| 	return &Decoder{Transformer: transform.Nop}
 | |
| }
 | |
| func (nop) NewEncoder() *Encoder {
 | |
| 	return &Encoder{Transformer: transform.Nop}
 | |
| }
 | |
| 
 | |
// Replacement is the replacement encoding. Decoding from the replacement
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
// the replacement encoding yields the same as the source bytes except that
// invalid UTF-8 is converted to '\uFFFD'.
//
// It is defined at http://encoding.spec.whatwg.org/#replacement
var Replacement Encoding = replacement{}
 | |
| 
 | |
| type replacement struct{}
 | |
| 
 | |
| func (replacement) NewDecoder() *Decoder {
 | |
| 	return &Decoder{Transformer: replacementDecoder{}}
 | |
| }
 | |
| 
 | |
| func (replacement) NewEncoder() *Encoder {
 | |
| 	return &Encoder{Transformer: replacementEncoder{}}
 | |
| }
 | |
| 
 | |
| func (replacement) ID() (mib identifier.MIB, other string) {
 | |
| 	return identifier.Replacement, ""
 | |
| }
 | |
| 
 | |
| type replacementDecoder struct{ transform.NopResetter }
 | |
| 
 | |
| func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 | |
| 	if len(dst) < 3 {
 | |
| 		return 0, 0, transform.ErrShortDst
 | |
| 	}
 | |
| 	if atEOF {
 | |
| 		const fffd = "\ufffd"
 | |
| 		dst[0] = fffd[0]
 | |
| 		dst[1] = fffd[1]
 | |
| 		dst[2] = fffd[2]
 | |
| 		nDst = 3
 | |
| 	}
 | |
| 	return nDst, len(src), nil
 | |
| }
 | |
| 
 | |
| type replacementEncoder struct{ transform.NopResetter }
 | |
| 
 | |
| func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 | |
| 	r, size := rune(0), 0
 | |
| 
 | |
| 	for ; nSrc < len(src); nSrc += size {
 | |
| 		r = rune(src[nSrc])
 | |
| 
 | |
| 		// Decode a 1-byte rune.
 | |
| 		if r < utf8.RuneSelf {
 | |
| 			size = 1
 | |
| 
 | |
| 		} else {
 | |
| 			// Decode a multi-byte rune.
 | |
| 			r, size = utf8.DecodeRune(src[nSrc:])
 | |
| 			if size == 1 {
 | |
| 				// All valid runes of size 1 (those below utf8.RuneSelf) were
 | |
| 				// handled above. We have invalid UTF-8 or we haven't seen the
 | |
| 				// full character yet.
 | |
| 				if !atEOF && !utf8.FullRune(src[nSrc:]) {
 | |
| 					err = transform.ErrShortSrc
 | |
| 					break
 | |
| 				}
 | |
| 				r = '\ufffd'
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		if nDst+utf8.RuneLen(r) > len(dst) {
 | |
| 			err = transform.ErrShortDst
 | |
| 			break
 | |
| 		}
 | |
| 		nDst += utf8.EncodeRune(dst[nDst:], r)
 | |
| 	}
 | |
| 	return nDst, nSrc, err
 | |
| }
 | |
| 
 | |
| // HTMLEscapeUnsupported wraps encoders to replace source runes outside the
 | |
| // repertoire of the destination encoding with HTML escape sequences.
 | |
| //
 | |
| // This wrapper exists to comply to URL and HTML forms requiring a
 | |
| // non-terminating legacy encoder. The produced sequences may lead to data
 | |
| // loss as they are indistinguishable from legitimate input. To avoid this
 | |
| // issue, use UTF-8 encodings whenever possible.
 | |
| func HTMLEscapeUnsupported(e *Encoder) *Encoder {
 | |
| 	return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
 | |
| }
 | |
| 
 | |
| // ReplaceUnsupported wraps encoders to replace source runes outside the
 | |
| // repertoire of the destination encoding with an encoding-specific
 | |
| // replacement.
 | |
| //
 | |
| // This wrapper is only provided for backwards compatibility and legacy
 | |
| // handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
 | |
| func ReplaceUnsupported(e *Encoder) *Encoder {
 | |
| 	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
 | |
| }
 | |
| 
 | |
// errorHandler wraps an Encoder and uses handler to write a substitute for
// each rune the wrapped Encoder rejects with a repertoireError.
type errorHandler struct {
	// Encoder is the wrapped encoder whose Transform is retried after each
	// substitution.
	*Encoder
	// handler writes the substitute for rune r into dst. It returns the number
	// of bytes written and whether dst was large enough.
	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
}
 | |
| 
 | |
// repertoireError is the interface that errorHandler looks for on errors
// returned by the wrapped Encoder; errors that implement it are handled by
// substitution rather than returned.
//
// TODO: consider making this error public in some form.
type repertoireError interface {
	// Replacement returns the single byte that errorToReplacement writes in
	// place of the offending rune.
	Replacement() byte
}
 | |
| 
 | |
| func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 | |
| 	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
 | |
| 	for err != nil {
 | |
| 		rerr, ok := err.(repertoireError)
 | |
| 		if !ok {
 | |
| 			return nDst, nSrc, err
 | |
| 		}
 | |
| 		r, sz := utf8.DecodeRune(src[nSrc:])
 | |
| 		n, ok := h.handler(dst[nDst:], r, rerr)
 | |
| 		if !ok {
 | |
| 			return nDst, nSrc, transform.ErrShortDst
 | |
| 		}
 | |
| 		err = nil
 | |
| 		nDst += n
 | |
| 		if nSrc += sz; nSrc < len(src) {
 | |
| 			var dn, sn int
 | |
| 			dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
 | |
| 			nDst += dn
 | |
| 			nSrc += sn
 | |
| 		}
 | |
| 	}
 | |
| 	return nDst, nSrc, err
 | |
| }
 | |
| 
 | |
| func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
 | |
| 	buf := [8]byte{}
 | |
| 	b := strconv.AppendUint(buf[:0], uint64(r), 10)
 | |
| 	if n = len(b) + len("&#;"); n >= len(dst) {
 | |
| 		return 0, false
 | |
| 	}
 | |
| 	dst[0] = '&'
 | |
| 	dst[1] = '#'
 | |
| 	dst[copy(dst[2:], b)+2] = ';'
 | |
| 	return n, true
 | |
| }
 | |
| 
 | |
| func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
 | |
| 	if len(dst) == 0 {
 | |
| 		return 0, false
 | |
| 	}
 | |
| 	dst[0] = err.Replacement()
 | |
| 	return 1, true
 | |
| }
 | |
| 
 | |
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8. It is
// the error UTF8Validator returns for such input.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
 | |
| 
 | |
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
// input byte that is not valid UTF-8. Valid input is copied to the
// destination unchanged.
var UTF8Validator transform.Transformer = utf8Validator{}
 | |
| 
 | |
| type utf8Validator struct{ transform.NopResetter }
 | |
| 
 | |
| func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 | |
| 	n := len(src)
 | |
| 	if n > len(dst) {
 | |
| 		n = len(dst)
 | |
| 	}
 | |
| 	for i := 0; i < n; {
 | |
| 		if c := src[i]; c < utf8.RuneSelf {
 | |
| 			dst[i] = c
 | |
| 			i++
 | |
| 			continue
 | |
| 		}
 | |
| 		_, size := utf8.DecodeRune(src[i:])
 | |
| 		if size == 1 {
 | |
| 			// All valid runes of size 1 (those below utf8.RuneSelf) were
 | |
| 			// handled above. We have invalid UTF-8 or we haven't seen the
 | |
| 			// full character yet.
 | |
| 			err = ErrInvalidUTF8
 | |
| 			if !atEOF && !utf8.FullRune(src[i:]) {
 | |
| 				err = transform.ErrShortSrc
 | |
| 			}
 | |
| 			return i, i, err
 | |
| 		}
 | |
| 		if i+size > len(dst) {
 | |
| 			return i, i, transform.ErrShortDst
 | |
| 		}
 | |
| 		for ; size > 0; size-- {
 | |
| 			dst[i] = src[i]
 | |
| 			i++
 | |
| 		}
 | |
| 	}
 | |
| 	if len(src) > len(dst) {
 | |
| 		err = transform.ErrShortDst
 | |
| 	}
 | |
| 	return n, n, err
 | |
| }
 | |
| 
 |