gitea/modules/charset/charset.go

// Copyright 2014 The Gogs Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package charset

import (
	"bytes"
	"fmt"
	"io"
	"strings"
	"unicode/utf8"

	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/util"

	"github.com/gogs/chardet"
	"golang.org/x/net/html/charset"
	"golang.org/x/text/transform"
)

// UTF8BOM is the UTF-8 byte-order mark, i.e. the three bytes EF BB BF.
// RemoveBOMIfPresent strips this sequence from the start of a byte slice.
var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}

// ToUTF8WithFallbackReader wraps rd in a reader that yields UTF-8 where that is possible.
//
// Up to 2048 bytes are read from rd and used to guess the encoding. The
// returned reader always replays those sniffed bytes before continuing with
// the unread remainder of rd, so no input is lost:
//   - if the sniffing read fails, detection fails, or the content is already
//     UTF-8, the bytes are passed through unchanged apart from a leading BOM,
//     which is dropped;
//   - if the detected charset has no known decoder, the bytes are passed
//     through completely untouched (a leading BOM is kept);
//   - otherwise the stream is decoded from the detected charset to UTF-8.
func ToUTF8WithFallbackReader(rd io.Reader) io.Reader {
	head := make([]byte, 2048)
	read, readErr := util.ReadAtMost(rd, head)
	head = head[:read]

	// rejoin glues an already-consumed prefix back in front of the rest of rd.
	rejoin := func(prefix []byte) io.Reader {
		return io.MultiReader(bytes.NewReader(prefix), rd)
	}

	if readErr != nil {
		return rejoin(RemoveBOMIfPresent(head))
	}

	label, detectErr := DetectEncoding(head)
	if detectErr != nil || label == "UTF-8" {
		return rejoin(RemoveBOMIfPresent(head))
	}

	enc, _ := charset.Lookup(label)
	if enc == nil {
		// No decoder for this label: hand the raw bytes back as they are.
		return rejoin(head)
	}

	source := rejoin(RemoveBOMIfPresent(head))
	return transform.NewReader(source, enc.NewDecoder())
}

// ToUTF8WithErr converts content to a UTF-8 string, reporting any problem it hits.
//
// Results by case:
//   - detection fails: "" and the detection error;
//   - content is already UTF-8: content without a leading BOM, nil error;
//   - the detected charset has no known decoder: content unchanged, plus an
//     "Unknown encoding" error;
//   - decoding fails part-way: the successfully decoded prefix followed by the
//     undecoded remainder of content, plus the decoding error.
func ToUTF8WithErr(content []byte) (string, error) {
	label, detectErr := DetectEncoding(content)
	switch {
	case detectErr != nil:
		return "", detectErr
	case label == "UTF-8":
		return string(RemoveBOMIfPresent(content)), nil
	}

	enc, _ := charset.Lookup(label)
	if enc == nil {
		return string(content), fmt.Errorf("Unknown encoding: %s", label)
	}

	decoded, consumed, decodeErr := transform.Bytes(enc.NewDecoder(), content)
	if decodeErr != nil {
		// Keep the bytes that could not be decoded rather than dropping them,
		// so as little data as possible is lost.
		decoded = append(decoded, content[consumed:]...)
	}

	return string(RemoveBOMIfPresent(decoded)), decodeErr
}

// ToUTF8WithFallback is the in-memory counterpart of ToUTF8WithFallbackReader:
// it detects the encoding of content and converts it to UTF-8 if possible.
// Any error raised while reading the converted stream is ignored and whatever
// was read up to that point is returned.
func ToUTF8WithFallback(content []byte) []byte {
	converted := ToUTF8WithFallbackReader(bytes.NewReader(content))
	out, _ := io.ReadAll(converted)
	return out
}

// ToUTF8 is the string convenience form of ToUTF8WithErr: it returns the
// converted string and discards the error.
func ToUTF8(content string) string {
	converted, _ := ToUTF8WithErr([]byte(content))
	return converted
}

// ToUTF8DropErrors converts content to UTF-8 where possible, replacing input
// that cannot be decoded instead of failing.
//
// If detection fails or the content is reported as UTF-8, content is returned
// with only a leading BOM removed. If the detected charset has no known
// decoder, content is returned unchanged. Otherwise content is decoded piece
// by piece: each time the decoder reports an error, a single space is emitted,
// one input byte is skipped, and decoding resumes after it.
func ToUTF8DropErrors(content []byte) []byte {
	label, detectErr := DetectEncoding(content)
	if detectErr != nil || label == "UTF-8" {
		return RemoveBOMIfPresent(content)
	}

	enc, _ := charset.Lookup(label)
	if enc == nil {
		return content
	}

	// Undecodable parts of the input are dropped, so some data may be lost.
	var out []byte
	dec := enc.NewDecoder()
	for pos := 0; ; {
		chunk, consumed, decodeErr := transform.Bytes(dec, content[pos:])
		out = append(out, chunk...)
		if decodeErr == nil {
			break
		}

		// Stand in a space for the offending byte and step over it.
		out = append(out, ' ')
		if pos += consumed + 1; pos >= len(content) {
			break
		}
	}

	return RemoveBOMIfPresent(out)
}

// RemoveBOMIfPresent returns content without its leading UTF-8 byte-order
// mark. Content that does not start with the mark is returned unchanged. The
// result shares its backing array with content; nothing is copied.
func RemoveBOMIfPresent(content []byte) []byte {
	return bytes.TrimPrefix(content, UTF8BOM)
}

// trimIncompleteTrailingRune returns content without a trailing UTF-8 sequence
// that has been cut short, e.g. because content is only the first part of a
// larger file. A lead byte that is followed by fewer continuation bytes than
// it announces is removed together with those continuation bytes. Content
// that ends in a complete rune, or in bytes that cannot be the beginning of
// any valid UTF-8 sequence, is returned unchanged.
func trimIncompleteTrailingRune(content []byte) []byte {
	// A sequence is at most utf8.UTFMax bytes long, so an incomplete one is at
	// most utf8.UTFMax-1 bytes: only that many trailing bytes need inspecting.
	for i := len(content) - 1; i >= 0 && len(content)-i < utf8.UTFMax; i-- {
		if !utf8.RuneStart(content[i]) {
			continue // continuation byte: keep walking back to its lead byte
		}
		if !utf8.FullRune(content[i:]) {
			return content[:i]
		}
		return content
	}
	return content
}

// DetectEncoding guesses the character encoding of content and returns its label.
//
// Content that is valid UTF-8, ignoring a character truncated at the very end,
// is reported as "UTF-8" without consulting the detector. Anything else is
// passed to chardet; samples shorter than 1024 bytes are repeated to roughly
// that size first. Among the candidates sharing the top confidence, the one
// with the lowest score in setting.Repository.DetectedCharsetScore wins.
//
// When setting.Repository.AnsiCharset is configured it is returned instead of
// any non-UTF-8 result, and also when chardet cannot detect anything. An error
// is returned only when detection fails and no such fallback applies.
func DetectEncoding(content []byte) (string, error) {
	// First we check if the content represents valid utf8 content excepting a
	// truncated character at the end. Decoding every rune would work but is not
	// the cheapest option, so only the tail is inspected before validating.
	if utf8.Valid(trimIncompleteTrailingRune(content)) {
		log.Debug("Detected encoding: utf-8 (fast)")
		return "UTF-8", nil
	}

	textDetector := chardet.NewTextDetector()
	var detectContent []byte
	if len(content) < 1024 {
		// Check if original content is valid
		if _, err := textDetector.DetectBest(content); err != nil {
			return "", err
		}
		// len(content) cannot be zero here: empty content is valid UTF-8 and
		// has already been returned by the fast path above.
		detectContent = bytes.Repeat(content, 1024/len(content))
	} else {
		detectContent = content
	}

	// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
	results, err := textDetector.DetectAll(detectContent)
	if err != nil {
		if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
			log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
			return setting.Repository.AnsiCharset, nil
		}
		return "", err
	}

	topConfidence := results[0].Confidence
	topResult := results[0]
	priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
	for _, result := range results {
		// As results are sorted in confidence order - if we have a different confidence
		// we know it's less than the current confidence and can break out of the loop early
		if result.Confidence != topConfidence {
			break
		}

		// Otherwise check if this result has a better (lower) score in
		// DetectedCharsetScore than our current top guess
		resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
		if resultHas && (!has || resultPriority < priority) {
			topResult = result
			priority = resultPriority
			has = true
		}
	}

	// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
	if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
		log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
		return setting.Repository.AnsiCharset, nil
	}

	log.Debug("Detected encoding: %s", topResult.Charset)
	return topResult.Charset, nil
}
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`// Copyright 2014 The Gogs Authors. All rights reserved.`
			`// Use of this source code is governed by a MIT-style`
			`// license that can be found in the LICENSE file.`

			`package charset`

			`import (`
			`"bytes"`
			`"fmt"`
Refactor renders (#15175) * Refactor renders * Some performance optimization * Fix comment * Transform reader * Fix csv test * Fix test * Fix tests * Improve optimaziation * Fix test * Fix test * Detect file encoding with reader * Improve optimaziation * reduce memory usage * improve code * fix build * Fix test * Fix for go1.15 * Fix render * Fix comment * Fix lint * Fix test * Don't use NormalEOF when unnecessary * revert change on util.go * Apply suggestions from code review Co-authored-by: zeripath <art27@cantab.net> * rename function * Take NormalEOF back Co-authored-by: zeripath <art27@cantab.net> 4 years ago			`"io"`
Fix chardet test and add ordering option (#11621) * Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io> 4 years ago			`"strings"`
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`"unicode/utf8"`

			`"code.gitea.io/gitea/modules/log"`
			`"code.gitea.io/gitea/modules/setting"`
Read expected buffer size (#17409) * Read expected buffer size. * Changed name. 3 years ago			`"code.gitea.io/gitea/modules/util"`
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago
deps: update and fix chardet import (#9351) 5 years ago			`"github.com/gogs/chardet"`
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`"golang.org/x/net/html/charset"`
			`"golang.org/x/text/transform"`
			`)`

			`// UTF8BOM is the utf-8 byte-order marker`
			`var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}`

Refactor renders (#15175) * Refactor renders * Some performance optimization * Fix comment * Transform reader * Fix csv test * Fix test * Fix tests * Improve optimaziation * Fix test * Fix test * Detect file encoding with reader * Improve optimaziation * reduce memory usage * improve code * fix build * Fix test * Fix for go1.15 * Fix render * Fix comment * Fix lint * Fix test * Don't use NormalEOF when unnecessary * revert change on util.go * Apply suggestions from code review Co-authored-by: zeripath <art27@cantab.net> * rename function * Take NormalEOF back Co-authored-by: zeripath <art27@cantab.net> 4 years ago			`// ToUTF8WithFallbackReader detects the encoding of content and coverts to UTF-8 reader if possible`
			`func ToUTF8WithFallbackReader(rd io.Reader) io.Reader {`
format with gofumpt (#18184) * gofumpt -w -l . * gofumpt -w -l -extra . * Add linter * manual fix * change make fmt 3 years ago			`buf := make([]byte, 2048)`
Read expected buffer size (#17409) * Read expected buffer size. * Changed name. 3 years ago			`n, err := util.ReadAtMost(rd, buf)`
Refactor renders (#15175) * Refactor renders * Some performance optimization * Fix comment * Transform reader * Fix csv test * Fix test * Fix tests * Improve optimaziation * Fix test * Fix test * Detect file encoding with reader * Improve optimaziation * reduce memory usage * improve code * fix build * Fix test * Fix for go1.15 * Fix render * Fix comment * Fix lint * Fix test * Don't use NormalEOF when unnecessary * revert change on util.go * Apply suggestions from code review Co-authored-by: zeripath <art27@cantab.net> * rename function * Take NormalEOF back Co-authored-by: zeripath <art27@cantab.net> 4 years ago			`if err != nil {`
Read expected buffer size (#17409) * Read expected buffer size. * Changed name. 3 years ago			`return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd)`
Refactor renders (#15175) * Refactor renders * Some performance optimization * Fix comment * Transform reader * Fix csv test * Fix test * Fix tests * Improve optimaziation * Fix test * Fix test * Detect file encoding with reader * Improve optimaziation * reduce memory usage * improve code * fix build * Fix test * Fix for go1.15 * Fix render * Fix comment * Fix lint * Fix test * Don't use NormalEOF when unnecessary * revert change on util.go * Apply suggestions from code review Co-authored-by: zeripath <art27@cantab.net> * rename function * Take NormalEOF back Co-authored-by: zeripath <art27@cantab.net> 4 years ago			`}`

			`charsetLabel, err := DetectEncoding(buf[:n])`
			`if err != nil \|\| charsetLabel == "UTF-8" {`
			`return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd)`
			`}`

			`encoding, _ := charset.Lookup(charsetLabel)`
			`if encoding == nil {`
			`return io.MultiReader(bytes.NewReader(buf[:n]), rd)`
			`}`

			`return transform.NewReader(`
			`io.MultiReader(`
			`bytes.NewReader(RemoveBOMIfPresent(buf[:n])),`
			`rd,`
			`),`
			`encoding.NewDecoder(),`
			`)`
			`}`

Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`// ToUTF8WithErr converts content to UTF8 encoding`
			`func ToUTF8WithErr(content []byte) (string, error) {`
			`charsetLabel, err := DetectEncoding(content)`
			`if err != nil {`
			`return "", err`
			`} else if charsetLabel == "UTF-8" {`
			`return string(RemoveBOMIfPresent(content)), nil`
			`}`

			`encoding, _ := charset.Lookup(charsetLabel)`
			`if encoding == nil {`
			`return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel)`
			`}`

			`// If there is an error, we concatenate the nicely decoded part and the`
Fix utf8 tests (#8192) * Prevent compiler environment from making the tests fail * Remove unused function * Pass lint 5 years ago			`// original left over. This way we won't lose much data.`
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`result, n, err := transform.Bytes(encoding.NewDecoder(), content)`
			`if err != nil {`
			`result = append(result, content[n:]...)`
			`}`

			`result = RemoveBOMIfPresent(result)`

			`return string(result), err`
			`}`

			`// ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible`
			`func ToUTF8WithFallback(content []byte) []byte {`
refactor: move from io/ioutil to io and os package (#17109) The io/ioutil package has been deprecated as of Go 1.16, see https://golang.org/doc/go1.16#ioutil. This commit replaces the existing io/ioutil functions with their new definitions in io and os packages. Signed-off-by: Eng Zer Jun <engzerjun@gmail.com> Co-authored-by: techknowlogick <techknowlogick@gitea.io> 3 years ago			`bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content)))`
Refactor renders (#15175) * Refactor renders * Some performance optimization * Fix comment * Transform reader * Fix csv test * Fix test * Fix tests * Improve optimaziation * Fix test * Fix test * Detect file encoding with reader * Improve optimaziation * reduce memory usage * improve code * fix build * Fix test * Fix for go1.15 * Fix render * Fix comment * Fix lint * Fix test * Don't use NormalEOF when unnecessary * revert change on util.go * Apply suggestions from code review Co-authored-by: zeripath <art27@cantab.net> * rename function * Take NormalEOF back Co-authored-by: zeripath <art27@cantab.net> 4 years ago			`return bs`
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`}`

			`// ToUTF8 converts content to UTF8 encoding and ignore error`
			`func ToUTF8(content string) string {`
			`res, _ := ToUTF8WithErr([]byte(content))`
			`return res`
			`}`

			`// ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible`
			`func ToUTF8DropErrors(content []byte) []byte {`
			`charsetLabel, err := DetectEncoding(content)`
			`if err != nil \|\| charsetLabel == "UTF-8" {`
			`return RemoveBOMIfPresent(content)`
			`}`

			`encoding, _ := charset.Lookup(charsetLabel)`
			`if encoding == nil {`
			`return content`
			`}`

			`// We ignore any non-decodable parts from the file.`
			`// Some parts might be lost`
			`var decoded []byte`
			`decoder := encoding.NewDecoder()`
			`idx := 0`
			`for {`
			`result, n, err := transform.Bytes(decoder, content[idx:])`
			`decoded = append(decoded, result...)`
			`if err == nil {`
			`break`
			`}`
			`decoded = append(decoded, ' ')`
			`idx = idx + n + 1`
			`if idx >= len(content) {`
			`break`
			`}`
			`}`

			`return RemoveBOMIfPresent(decoded)`
			`}`

			`// RemoveBOMIfPresent removes a UTF-8 BOM from a []byte`
			`func RemoveBOMIfPresent(content []byte) []byte {`
			`if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) {`
			`return content[3:]`
			`}`
			`return content`
			`}`

			`// DetectEncoding detect the encoding of content`
			`func DetectEncoding(content []byte) (string, error) {`
Detect truncated utf-8 characters at the end of content as still representing utf-8 (#19773) Our character detection algorithm can potentially incorrectly detect utf-8 as iso-8859-x if there is a truncated character at the end of the partially read file. This PR changes the detection algorithm to truncated utf8 characters at the end of the buffer. Fix #19743 Signed-off-by: Andrew Thornton <art27@cantab.net> 2 years ago			`// First we check if the content represents valid utf8 content excepting a truncated character at the end.`

			`// Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do`
			`// instead we walk backwards from the end to trim off a the incomplete character`
			`toValidate := content`
			`end := len(toValidate) - 1`

			`if end < 0 {`
			`// no-op`
			`} else if toValidate[end]>>5 == 0b110 {`
			`// Incomplete 1 byte extension e.g. © <c2><a9> which has been truncated to <c2>`
			`toValidate = toValidate[:end]`
			`} else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 {`
			`// Incomplete 2 byte extension e.g. ⛔ <e2><9b><94> which has been truncated to <e2><9b>`
			`toValidate = toValidate[:end-1]`
			`} else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 {`
			`// Incomplete 3 byte extension e.g. 💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92>`
			`toValidate = toValidate[:end-2]`
			`}`
			`if utf8.Valid(toValidate) {`
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`log.Debug("Detected encoding: utf-8 (fast)")`
			`return "UTF-8", nil`
			`}`

			`textDetector := chardet.NewTextDetector()`
			`var detectContent []byte`
			`if len(content) < 1024 {`
			`// Check if original content is valid`
			`if _, err := textDetector.DetectBest(content); err != nil {`
			`return "", err`
			`}`
			`times := 1024 / len(content)`
			`detectContent = make([]byte, 0, times*len(content))`
			`for i := 0; i < times; i++ {`
			`detectContent = append(detectContent, content...)`
			`}`
			`} else {`
			`detectContent = content`
			`}`
Fix chardet test and add ordering option (#11621) * Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io> 4 years ago
			`// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break`
			`results, err := textDetector.DetectAll(detectContent)`
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`if err != nil {`
Fix chardet test and add ordering option (#11621) * Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io> 4 years ago			`if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {`
			`log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)`
			`return setting.Repository.AnsiCharset, nil`
			`}`
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`return "", err`
			`}`
Fix chardet test and add ordering option (#11621) * Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io> 4 years ago
			`topConfidence := results[0].Confidence`
			`topResult := results[0]`
			`priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]`
			`for _, result := range results {`
			`// As results are sorted in confidence order - if we have a different confidence`
			`// we know it's less than the current confidence and can break out of the loop early`
			`if result.Confidence != topConfidence {`
			`break`
			`}`

			`// Otherwise check if this results is earlier in the DetectedCharsetOrder than our current top guesss`
			`resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]`
			`if resultHas && (!has \|\| resultPriority < priority) {`
			`topResult = result`
			`priority = resultPriority`
			`has = true`
			`}`
			`}`

Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument`
Fix chardet test and add ordering option (#11621) * Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io> 4 years ago			`if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {`
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)`
			`return setting.Repository.AnsiCharset, err`
			`}`

Fix chardet test and add ordering option (#11621) * Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io> 4 years ago			`log.Debug("Detected encoding: %s", topResult.Charset)`
			`return topResult.Charset, err`
Convert files to utf-8 for indexing (#7814) * Convert files to utf-8 for indexing * Move utf8 functions to modules/base * Bump repoIndexerLatestVersion to 3 * Add tests for base/encoding.go * Changes to pass gosimple * Move UTF8 funcs into new modules/charset package 5 years ago			`}`