@ -5,6 +5,7 @@
package charset
package charset
import (
import (
"strings"
"testing"
"testing"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/setting"
@ -12,6 +13,22 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/assert"
)
)
// resetDefaultCharsetsOrder rebuilds setting.Repository.DetectedCharsetScore
// from setting.Repository.DetectedCharsetsOrder so that each test starts from
// a known charset ranking.
//
// Every configured name is canonicalised (surrounding whitespace trimmed,
// then lower-cased) and given a score equal to its position among the
// distinct canonical names seen so far; later duplicates keep the score of
// their first occurrence.
func resetDefaultCharsetsOrder() {
	// Install the fresh map first and fill it through a local alias.
	scores := map[string]int{}
	setting.Repository.DetectedCharsetScore = scores

	for _, name := range setting.Repository.DetectedCharsetsOrder {
		key := strings.ToLower(strings.TrimSpace(name))
		if _, seen := scores[key]; seen {
			continue
		}
		// The number of entries already stored is the next free rank.
		rank := len(scores)
		scores[key] = rank
	}
}
func TestRemoveBOMIfPresent ( t * testing . T ) {
func TestRemoveBOMIfPresent ( t * testing . T ) {
res := RemoveBOMIfPresent ( [ ] byte { 0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba } )
res := RemoveBOMIfPresent ( [ ] byte { 0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba } )
assert . Equal ( t , [ ] byte { 0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba } , res )
assert . Equal ( t , [ ] byte { 0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba } , res )
@ -21,6 +38,7 @@ func TestRemoveBOMIfPresent(t *testing.T) {
}
}
func TestToUTF8WithErr ( t * testing . T ) {
func TestToUTF8WithErr ( t * testing . T ) {
resetDefaultCharsetsOrder ( )
var res string
var res string
var err error
var err error
@ -76,6 +94,7 @@ func TestToUTF8WithErr(t *testing.T) {
}
}
func TestToUTF8WithFallback ( t * testing . T ) {
func TestToUTF8WithFallback ( t * testing . T ) {
resetDefaultCharsetsOrder ( )
// "ABC"
// "ABC"
res := ToUTF8WithFallback ( [ ] byte { 0x41 , 0x42 , 0x43 } )
res := ToUTF8WithFallback ( [ ] byte { 0x41 , 0x42 , 0x43 } )
assert . Equal ( t , [ ] byte { 0x41 , 0x42 , 0x43 } , res )
assert . Equal ( t , [ ] byte { 0x41 , 0x42 , 0x43 } , res )
@ -116,7 +135,7 @@ func TestToUTF8WithFallback(t *testing.T) {
}
}
func TestToUTF8 ( t * testing . T ) {
func TestToUTF8 ( t * testing . T ) {
resetDefaultCharsetsOrder ( )
// Note: golang compiler seems to behave differently depending on the current
// Note: golang compiler seems to behave differently depending on the current
// locale, so some conversions might behave differently. For that reason, we don't
// locale, so some conversions might behave differently. For that reason, we don't
// depend on particular conversions but in expected behaviors.
// depend on particular conversions but in expected behaviors.
@ -165,6 +184,7 @@ func TestToUTF8(t *testing.T) {
}
}
func TestToUTF8DropErrors ( t * testing . T ) {
func TestToUTF8DropErrors ( t * testing . T ) {
resetDefaultCharsetsOrder ( )
// "ABC"
// "ABC"
res := ToUTF8DropErrors ( [ ] byte { 0x41 , 0x42 , 0x43 } )
res := ToUTF8DropErrors ( [ ] byte { 0x41 , 0x42 , 0x43 } )
assert . Equal ( t , [ ] byte { 0x41 , 0x42 , 0x43 } , res )
assert . Equal ( t , [ ] byte { 0x41 , 0x42 , 0x43 } , res )
@ -204,6 +224,7 @@ func TestToUTF8DropErrors(t *testing.T) {
}
}
func TestDetectEncoding ( t * testing . T ) {
func TestDetectEncoding ( t * testing . T ) {
resetDefaultCharsetsOrder ( )
testSuccess := func ( b [ ] byte , expected string ) {
testSuccess := func ( b [ ] byte , expected string ) {
encoding , err := DetectEncoding ( b )
encoding , err := DetectEncoding ( b )
assert . NoError ( t , err )
assert . NoError ( t , err )
@ -225,10 +246,7 @@ func TestDetectEncoding(t *testing.T) {
b = [ ] byte { 0x44 , 0xe9 , 0x63 , 0x6f , 0x72 , 0x0a }
b = [ ] byte { 0x44 , 0xe9 , 0x63 , 0x6f , 0x72 , 0x0a }
encoding , err := DetectEncoding ( b )
encoding , err := DetectEncoding ( b )
assert . NoError ( t , err )
assert . NoError ( t , err )
// due to a race condition in `chardet` library, it could either detect
assert . Contains ( t , encoding , "ISO-8859-1" )
// "ISO-8859-1" or "IS0-8859-2" here. Technically either is correct, so
// we accept either.
assert . Contains ( t , encoding , "ISO-8859" )
old := setting . Repository . AnsiCharset
old := setting . Repository . AnsiCharset
setting . Repository . AnsiCharset = "placeholder"
setting . Repository . AnsiCharset = "placeholder"