You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							103 lines
						
					
					
						
							2.2 KiB
						
					
					
				
			
		
		
	
	
							103 lines
						
					
					
						
							2.2 KiB
						
					
					
				| package chardet
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| )
 | |
| 
 | |
// Byte order marks (BOMs) that the recognizers in this file look for at the
// very start of the input.
var (
	// utf16beBom is the UTF-16 big-endian byte order mark.
	utf16beBom = []byte{0xFE, 0xFF}
	// utf16leBom is the UTF-16 little-endian byte order mark.
	utf16leBom = []byte{0xFF, 0xFE}
	// utf32beBom is the UTF-32 big-endian byte order mark.
	utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF}
	// utf32leBom is the UTF-32 little-endian byte order mark. Its first two
	// bytes are identical to utf16leBom.
	utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00}
)
 | |
| 
 | |
// recognizerUtf16be is a stateless recognizer for UTF-16BE input.
type recognizerUtf16be struct{}
 | |
| 
 | |
| func newRecognizer_utf16be() *recognizerUtf16be {
 | |
| 	return &recognizerUtf16be{}
 | |
| }
 | |
| 
 | |
| func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
 | |
| 	output = recognizerOutput{
 | |
| 		Charset: "UTF-16BE",
 | |
| 	}
 | |
| 	if bytes.HasPrefix(input.raw, utf16beBom) {
 | |
| 		output.Confidence = 100
 | |
| 	}
 | |
| 	return
 | |
| }
 | |
| 
 | |
// recognizerUtf16le is a stateless recognizer for UTF-16LE input.
type recognizerUtf16le struct{}
 | |
| 
 | |
| func newRecognizer_utf16le() *recognizerUtf16le {
 | |
| 	return &recognizerUtf16le{}
 | |
| }
 | |
| 
 | |
| func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
 | |
| 	output = recognizerOutput{
 | |
| 		Charset: "UTF-16LE",
 | |
| 	}
 | |
| 	if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
 | |
| 		output.Confidence = 100
 | |
| 	}
 | |
| 	return
 | |
| }
 | |
| 
 | |
// recognizerUtf32 recognizes one byte order of UTF-32. The field order is
// significant: the constructors in this file build it with positional
// composite literals.
type recognizerUtf32 struct {
	// name is the charset name reported in the match output.
	name string
	// bom is the byte order mark looked for at the start of the input.
	bom []byte
	// decodeChar turns the first four bytes of its argument into a 32-bit
	// value; Match calls it only with slices of at least four bytes.
	decodeChar func(input []byte) uint32
}
 | |
| 
 | |
// decodeUtf32be assembles the first four bytes of input into a uint32, most
// significant byte first. It panics if input holds fewer than four bytes.
func decodeUtf32be(input []byte) uint32 {
	high := uint32(input[0])<<24 | uint32(input[1])<<16
	low := uint32(input[2])<<8 | uint32(input[3])
	return high | low
}
 | |
| 
 | |
// decodeUtf32le assembles the first four bytes of input into a uint32, least
// significant byte first. It panics if input holds fewer than four bytes.
func decodeUtf32le(input []byte) uint32 {
	high := uint32(input[3])<<24 | uint32(input[2])<<16
	low := uint32(input[1])<<8 | uint32(input[0])
	return high | low
}
 | |
| 
 | |
| func newRecognizer_utf32be() *recognizerUtf32 {
 | |
| 	return &recognizerUtf32{
 | |
| 		"UTF-32BE",
 | |
| 		utf32beBom,
 | |
| 		decodeUtf32be,
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func newRecognizer_utf32le() *recognizerUtf32 {
 | |
| 	return &recognizerUtf32{
 | |
| 		"UTF-32LE",
 | |
| 		utf32leBom,
 | |
| 		decodeUtf32le,
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
 | |
| 	output = recognizerOutput{
 | |
| 		Charset: r.name,
 | |
| 	}
 | |
| 	hasBom := bytes.HasPrefix(input.raw, r.bom)
 | |
| 	var numValid, numInvalid uint32
 | |
| 	for b := input.raw; len(b) >= 4; b = b[4:] {
 | |
| 		if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
 | |
| 			numInvalid++
 | |
| 		} else {
 | |
| 			numValid++
 | |
| 		}
 | |
| 	}
 | |
| 	if hasBom && numInvalid == 0 {
 | |
| 		output.Confidence = 100
 | |
| 	} else if hasBom && numValid > numInvalid*10 {
 | |
| 		output.Confidence = 80
 | |
| 	} else if numValid > 3 && numInvalid == 0 {
 | |
| 		output.Confidence = 100
 | |
| 	} else if numValid > 0 && numInvalid == 0 {
 | |
| 		output.Confidence = 80
 | |
| 	} else if numValid > numInvalid*10 {
 | |
| 		output.Confidence = 25
 | |
| 	}
 | |
| 	return
 | |
| }
 | |
| 
 |