Convert files to utf-8 for indexing (#7814)
	
		
	
				
					
				
			* Convert files to utf-8 for indexing
			* Move utf8 functions to modules/base
			* Bump repoIndexerLatestVersion to 3
			* Add tests for base/encoding.go
			* Changes to pass gosimple
			* Move UTF8 funcs into new modules/charset package
			tokarchuk/v1.17
							parent
							
								
									c2c35d169c
								
							
						
					
					
						commit
						5a44be627c
					
				| @ -0,0 +1,152 @@ | ||||
| // Copyright 2014 The Gogs Authors. All rights reserved.
 | ||||
| // Use of this source code is governed by a MIT-style
 | ||||
| // license that can be found in the LICENSE file.
 | ||||
| 
 | ||||
| package charset | ||||
| 
 | ||||
| import ( | ||||
| 	"bytes" | ||||
| 	"fmt" | ||||
| 	"unicode/utf8" | ||||
| 
 | ||||
| 	"code.gitea.io/gitea/modules/log" | ||||
| 	"code.gitea.io/gitea/modules/setting" | ||||
| 
 | ||||
| 	"github.com/gogits/chardet" | ||||
| 	"golang.org/x/net/html/charset" | ||||
| 	"golang.org/x/text/transform" | ||||
| ) | ||||
| 
 | ||||
// UTF8BOM holds the three bytes (EF BB BF) of the UTF-8 byte-order mark.
var UTF8BOM = []byte{0xEF, 0xBB, 0xBF}
| 
 | ||||
| // ToUTF8WithErr converts content to UTF8 encoding
 | ||||
| func ToUTF8WithErr(content []byte) (string, error) { | ||||
| 	charsetLabel, err := DetectEncoding(content) | ||||
| 	if err != nil { | ||||
| 		return "", err | ||||
| 	} else if charsetLabel == "UTF-8" { | ||||
| 		return string(RemoveBOMIfPresent(content)), nil | ||||
| 	} | ||||
| 
 | ||||
| 	encoding, _ := charset.Lookup(charsetLabel) | ||||
| 	if encoding == nil { | ||||
| 		return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel) | ||||
| 	} | ||||
| 
 | ||||
| 	// If there is an error, we concatenate the nicely decoded part and the
 | ||||
| 	// original left over. This way we won't lose data.
 | ||||
| 	result, n, err := transform.Bytes(encoding.NewDecoder(), content) | ||||
| 	if err != nil { | ||||
| 		result = append(result, content[n:]...) | ||||
| 	} | ||||
| 
 | ||||
| 	result = RemoveBOMIfPresent(result) | ||||
| 
 | ||||
| 	return string(result), err | ||||
| } | ||||
| 
 | ||||
| // ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible
 | ||||
| func ToUTF8WithFallback(content []byte) []byte { | ||||
| 	charsetLabel, err := DetectEncoding(content) | ||||
| 	if err != nil || charsetLabel == "UTF-8" { | ||||
| 		return RemoveBOMIfPresent(content) | ||||
| 	} | ||||
| 
 | ||||
| 	encoding, _ := charset.Lookup(charsetLabel) | ||||
| 	if encoding == nil { | ||||
| 		return content | ||||
| 	} | ||||
| 
 | ||||
| 	// If there is an error, we concatenate the nicely decoded part and the
 | ||||
| 	// original left over. This way we won't lose data.
 | ||||
| 	result, n, err := transform.Bytes(encoding.NewDecoder(), content) | ||||
| 	if err != nil { | ||||
| 		return append(result, content[n:]...) | ||||
| 	} | ||||
| 
 | ||||
| 	return RemoveBOMIfPresent(result) | ||||
| } | ||||
| 
 | ||||
| // ToUTF8 converts content to UTF8 encoding and ignore error
 | ||||
| func ToUTF8(content string) string { | ||||
| 	res, _ := ToUTF8WithErr([]byte(content)) | ||||
| 	return res | ||||
| } | ||||
| 
 | ||||
| // ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible
 | ||||
| func ToUTF8DropErrors(content []byte) []byte { | ||||
| 	charsetLabel, err := DetectEncoding(content) | ||||
| 	if err != nil || charsetLabel == "UTF-8" { | ||||
| 		return RemoveBOMIfPresent(content) | ||||
| 	} | ||||
| 
 | ||||
| 	encoding, _ := charset.Lookup(charsetLabel) | ||||
| 	if encoding == nil { | ||||
| 		return content | ||||
| 	} | ||||
| 
 | ||||
| 	// We ignore any non-decodable parts from the file.
 | ||||
| 	// Some parts might be lost
 | ||||
| 	var decoded []byte | ||||
| 	decoder := encoding.NewDecoder() | ||||
| 	idx := 0 | ||||
| 	for { | ||||
| 		result, n, err := transform.Bytes(decoder, content[idx:]) | ||||
| 		decoded = append(decoded, result...) | ||||
| 		if err == nil { | ||||
| 			break | ||||
| 		} | ||||
| 		decoded = append(decoded, ' ') | ||||
| 		idx = idx + n + 1 | ||||
| 		if idx >= len(content) { | ||||
| 			break | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return RemoveBOMIfPresent(decoded) | ||||
| } | ||||
| 
 | ||||
| // RemoveBOMIfPresent removes a UTF-8 BOM from a []byte
 | ||||
| func RemoveBOMIfPresent(content []byte) []byte { | ||||
| 	if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) { | ||||
| 		return content[3:] | ||||
| 	} | ||||
| 	return content | ||||
| } | ||||
| 
 | ||||
| // DetectEncoding detect the encoding of content
 | ||||
| func DetectEncoding(content []byte) (string, error) { | ||||
| 	if utf8.Valid(content) { | ||||
| 		log.Debug("Detected encoding: utf-8 (fast)") | ||||
| 		return "UTF-8", nil | ||||
| 	} | ||||
| 
 | ||||
| 	textDetector := chardet.NewTextDetector() | ||||
| 	var detectContent []byte | ||||
| 	if len(content) < 1024 { | ||||
| 		// Check if original content is valid
 | ||||
| 		if _, err := textDetector.DetectBest(content); err != nil { | ||||
| 			return "", err | ||||
| 		} | ||||
| 		times := 1024 / len(content) | ||||
| 		detectContent = make([]byte, 0, times*len(content)) | ||||
| 		for i := 0; i < times; i++ { | ||||
| 			detectContent = append(detectContent, content...) | ||||
| 		} | ||||
| 	} else { | ||||
| 		detectContent = content | ||||
| 	} | ||||
| 	result, err := textDetector.DetectBest(detectContent) | ||||
| 	if err != nil { | ||||
| 		return "", err | ||||
| 	} | ||||
| 	// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
 | ||||
| 	if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { | ||||
| 		log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) | ||||
| 		return setting.Repository.AnsiCharset, err | ||||
| 	} | ||||
| 
 | ||||
| 	log.Debug("Detected encoding: %s", result.Charset) | ||||
| 	return result.Charset, err | ||||
| } | ||||
| @ -0,0 +1,191 @@ | ||||
| // Copyright 2019 The Gitea Authors. All rights reserved.
 | ||||
| // Use of this source code is governed by a MIT-style
 | ||||
| // license that can be found in the LICENSE file.
 | ||||
| 
 | ||||
| package charset | ||||
| 
 | ||||
| import ( | ||||
| 	"testing" | ||||
| 
 | ||||
| 	"code.gitea.io/gitea/modules/setting" | ||||
| 
 | ||||
| 	"github.com/stretchr/testify/assert" | ||||
| ) | ||||
| 
 | ||||
| func TestRemoveBOMIfPresent(t *testing.T) { | ||||
| 	res := RemoveBOMIfPresent([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) | ||||
| 	assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) | ||||
| 
 | ||||
| 	res = RemoveBOMIfPresent([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) | ||||
| 	assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) | ||||
| } | ||||
| 
 | ||||
| func TestToUTF8WithErr(t *testing.T) { | ||||
| 	var res string | ||||
| 	var err error | ||||
| 
 | ||||
| 	res, err = ToUTF8WithErr([]byte{0x41, 0x42, 0x43}) | ||||
| 	assert.Equal(t, "ABC", res) | ||||
| 	assert.NoError(t, err) | ||||
| 
 | ||||
| 	res, err = ToUTF8WithErr([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) | ||||
| 	assert.Equal(t, "áéíóú", res) | ||||
| 	assert.NoError(t, err) | ||||
| 
 | ||||
| 	res, err = ToUTF8WithErr([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) | ||||
| 	assert.Equal(t, "áéíóú", res) | ||||
| 	assert.NoError(t, err) | ||||
| 
 | ||||
| 	// This test FAILS
 | ||||
| 	res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}) | ||||
| 	assert.Equal(t, "Hola, así cómo ños", res) | ||||
| 	assert.NoError(t, err) | ||||
| 
 | ||||
| 	res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}) | ||||
| 	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
 | ||||
| 	assert.Regexp(t, "^Hola, así cómo", res) | ||||
| 	assert.NoError(t, err) | ||||
| 
 | ||||
| 	res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}) | ||||
| 	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
 | ||||
| 	assert.Regexp(t, "^Hola, así cómo", res) | ||||
| 	assert.NoError(t, err) | ||||
| 
 | ||||
| 	// Japanese (Shift-JIS)
 | ||||
| 	res, err = ToUTF8WithErr([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}) | ||||
| 	assert.Equal(t, "日属秘ぞしちゅ。", res) | ||||
| 	assert.NoError(t, err) | ||||
| 
 | ||||
| 	res, err = ToUTF8WithErr([]byte{0x00, 0x00, 0x00, 0x00}) | ||||
| 	assert.Equal(t, "\x00\x00\x00\x00", res) | ||||
| 	assert.NoError(t, err) | ||||
| } | ||||
| 
 | ||||
| func TestToUTF8WithFallback(t *testing.T) { | ||||
| 	res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43}) | ||||
| 	assert.Equal(t, []byte("ABC"), res) | ||||
| 
 | ||||
| 	res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) | ||||
| 	assert.Equal(t, []byte("áéíóú"), res) | ||||
| 
 | ||||
| 	res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) | ||||
| 	assert.Equal(t, []byte("áéíóú"), res) | ||||
| 
 | ||||
| 	res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}) | ||||
| 	assert.Equal(t, []byte("Hola, así cómo ños"), res) | ||||
| 
 | ||||
| 	minmatch := []byte("Hola, así cómo ") | ||||
| 
 | ||||
| 	res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}) | ||||
| 	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
 | ||||
| 	assert.Equal(t, minmatch, res[0:len(minmatch)]) | ||||
| 
 | ||||
| 	res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}) | ||||
| 	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
 | ||||
| 	assert.Equal(t, minmatch, res[0:len(minmatch)]) | ||||
| 
 | ||||
| 	// Japanese (Shift-JIS)
 | ||||
| 	res = ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}) | ||||
| 	assert.Equal(t, []byte("日属秘ぞしちゅ。"), res) | ||||
| 
 | ||||
| 	res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00}) | ||||
| 	assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res) | ||||
| } | ||||
| 
 | ||||
| func TestToUTF8(t *testing.T) { | ||||
| 	res := ToUTF8("ABC") | ||||
| 	assert.Equal(t, "ABC", res) | ||||
| 
 | ||||
| 	res = ToUTF8("áéíóú") | ||||
| 	assert.Equal(t, "áéíóú", res) | ||||
| 
 | ||||
| 	// With utf-8 BOM
 | ||||
| 	res = ToUTF8("\ufeffáéíóú") | ||||
| 	assert.Equal(t, "áéíóú", res) | ||||
| 
 | ||||
| 	res = ToUTF8("Hola, así cómo ños") | ||||
| 	assert.Equal(t, "Hola, así cómo ños", res) | ||||
| 
 | ||||
| 	res = ToUTF8("Hola, así cómo \x07ños") | ||||
| 	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
 | ||||
| 	assert.Regexp(t, "^Hola, así cómo", res) | ||||
| 
 | ||||
| 	// This test FAILS
 | ||||
| 	// res = ToUTF8("Hola, así cómo \x81ños")
 | ||||
| 	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
 | ||||
| 	// assert.Regexp(t, "^Hola, así cómo", res)
 | ||||
| 
 | ||||
| 	// Japanese (Shift-JIS)
 | ||||
| 	res = ToUTF8("\x93\xFA\x91\xAE\x94\xE9\x82\xBC\x82\xB5\x82\xBF\x82\xE3\x81\x42") | ||||
| 	assert.Equal(t, "日属秘ぞしちゅ。", res) | ||||
| 
 | ||||
| 	res = ToUTF8("\x00\x00\x00\x00") | ||||
| 	assert.Equal(t, "\x00\x00\x00\x00", res) | ||||
| } | ||||
| 
 | ||||
| func TestToUTF8DropErrors(t *testing.T) { | ||||
| 	res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43}) | ||||
| 	assert.Equal(t, []byte("ABC"), res) | ||||
| 
 | ||||
| 	res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) | ||||
| 	assert.Equal(t, []byte("áéíóú"), res) | ||||
| 
 | ||||
| 	res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) | ||||
| 	assert.Equal(t, []byte("áéíóú"), res) | ||||
| 
 | ||||
| 	res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}) | ||||
| 	assert.Equal(t, []byte("Hola, así cómo ños"), res) | ||||
| 
 | ||||
| 	minmatch := []byte("Hola, así cómo ") | ||||
| 
 | ||||
| 	res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}) | ||||
| 	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
 | ||||
| 	assert.Equal(t, minmatch, res[0:len(minmatch)]) | ||||
| 
 | ||||
| 	res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}) | ||||
| 	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
 | ||||
| 	assert.Equal(t, minmatch, res[0:len(minmatch)]) | ||||
| 
 | ||||
| 	// Japanese (Shift-JIS)
 | ||||
| 	res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}) | ||||
| 	assert.Equal(t, []byte("日属秘ぞしちゅ。"), res) | ||||
| 
 | ||||
| 	res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00}) | ||||
| 	assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res) | ||||
| } | ||||
| 
 | ||||
| func TestDetectEncoding(t *testing.T) { | ||||
| 	testSuccess := func(b []byte, expected string) { | ||||
| 		encoding, err := DetectEncoding(b) | ||||
| 		assert.NoError(t, err) | ||||
| 		assert.Equal(t, expected, encoding) | ||||
| 	} | ||||
| 	// utf-8
 | ||||
| 	b := []byte("just some ascii") | ||||
| 	testSuccess(b, "UTF-8") | ||||
| 
 | ||||
| 	// utf-8-sig: "hey" (with BOM)
 | ||||
| 	b = []byte{0xef, 0xbb, 0xbf, 0x68, 0x65, 0x79} | ||||
| 	testSuccess(b, "UTF-8") | ||||
| 
 | ||||
| 	// utf-16: "hey<accented G>"
 | ||||
| 	b = []byte{0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x79, 0x00, 0xf4, 0x01} | ||||
| 	testSuccess(b, "UTF-16LE") | ||||
| 
 | ||||
| 	// iso-8859-1: d<accented e>cor<newline>
 | ||||
| 	b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a} | ||||
| 	encoding, err := DetectEncoding(b) | ||||
| 	assert.NoError(t, err) | ||||
| 	// due to a race condition in `chardet` library, it could either detect
 | ||||
| 	// "ISO-8859-1" or "IS0-8859-2" here. Technically either is correct, so
 | ||||
| 	// we accept either.
 | ||||
| 	assert.Contains(t, encoding, "ISO-8859") | ||||
| 
 | ||||
| 	setting.Repository.AnsiCharset = "placeholder" | ||||
| 	testSuccess(b, "placeholder") | ||||
| 
 | ||||
| 	// invalid bytes
 | ||||
| 	b = []byte{0xfa} | ||||
| 	_, err = DetectEncoding(b) | ||||
| 	assert.Error(t, err) | ||||
| } | ||||
					Loading…
					
					
				
		Reference in new issue