Add warning for BIDI characters in page renders and in diffs (#17562)
Fix #17514. Given the comments I've adjusted this somewhat. The number of characters detected has been increased and now includes things like the use of the combining character U+0300 to make à instead of the precomposed à, and non-breaking spaces. There is a button which can be used to escape the content to show these characters. Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Gwyneth Morgan <gwymor@tilde.club> Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com> tokarchuk/v1.17
parent
ee60f27aec
commit
21ed4fd8da
@ -0,0 +1,230 @@ |
||||
// Copyright 2021 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package charset |
||||
|
||||
import ( |
||||
"bytes" |
||||
"fmt" |
||||
"io" |
||||
"strings" |
||||
"unicode" |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/unicode/bidi" |
||||
) |
||||
|
||||
// EscapeStatus represents the findings of the unicode escaper.
type EscapeStatus struct {
	// Escaped is set when at least one rune or byte sequence was rewritten in the output.
	Escaped bool
	// HasError is set when reading the input or writing the output failed.
	HasError bool
	// HasBadRunes is set when bytes that do not decode as UTF-8 were found.
	HasBadRunes bool
	// HasControls is set when a rune of the unicode category C was escaped.
	HasControls bool
	// HasSpaces is set when a space rune other than '\n', '\r', '\t' or ' ' was escaped.
	HasSpaces bool
	// HasMarks is set when a rune of the unicode category M was escaped.
	HasMarks bool
	// HasBIDI is set when a BIDI control rune was escaped.
	HasBIDI bool
	// BadBIDI is set when a line has BIDI controls and left-to-right script but no right-to-left script.
	BadBIDI bool
	// HasRTLScript is set when a rune of BIDI class R or AL was seen.
	HasRTLScript bool
	// HasLTRScript is set when a rune of BIDI class L was seen.
	HasLTRScript bool
}

// Or combines two EscapeStatus structs into one in which every flag is the
// logical OR of the corresponding flags of the two inputs. Neither input is modified.
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
	return EscapeStatus{
		Escaped:      status.Escaped || other.Escaped,
		HasError:     status.HasError || other.HasError,
		HasBadRunes:  status.HasBadRunes || other.HasBadRunes,
		HasControls:  status.HasControls || other.HasControls,
		HasSpaces:    status.HasSpaces || other.HasSpaces,
		HasMarks:     status.HasMarks || other.HasMarks,
		HasBIDI:      status.HasBIDI || other.HasBIDI,
		BadBIDI:      status.BadBIDI || other.BadBIDI,
		HasRTLScript: status.HasRTLScript || other.HasRTLScript,
		HasLTRScript: status.HasLTRScript || other.HasLTRScript,
	}
}
||||
|
||||
// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string
|
||||
func EscapeControlString(text string) (EscapeStatus, string) { |
||||
sb := &strings.Builder{} |
||||
escaped, _ := EscapeControlReader(strings.NewReader(text), sb) |
||||
return escaped, sb.String() |
||||
} |
||||
|
||||
// EscapeControlBytes escapes the unicode control sequences a provided []byte and returns the findings as an EscapeStatus and the escaped []byte
|
||||
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) { |
||||
buf := &bytes.Buffer{} |
||||
escaped, _ := EscapeControlReader(bytes.NewReader(text), buf) |
||||
return escaped, buf.Bytes() |
||||
} |
||||
|
||||
// EscapeControlReader escapes the unicode control sequences a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error
|
||||
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) { |
||||
buf := make([]byte, 4096) |
||||
readStart := 0 |
||||
var n int |
||||
var writePos int |
||||
|
||||
lineHasBIDI := false |
||||
lineHasRTLScript := false |
||||
lineHasLTRScript := false |
||||
|
||||
readingloop: |
||||
for err == nil { |
||||
n, err = text.Read(buf[readStart:]) |
||||
bs := buf[:n+readStart] |
||||
i := 0 |
||||
|
||||
for i < len(bs) { |
||||
r, size := utf8.DecodeRune(bs[i:]) |
||||
// Now handle the codepoints
|
||||
switch { |
||||
case r == utf8.RuneError: |
||||
if writePos < i { |
||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
writePos = i |
||||
} |
||||
// runes can be at most 4 bytes - so...
|
||||
if len(bs)-i <= 3 { |
||||
// if not request more data
|
||||
copy(buf, bs[i:]) |
||||
readStart = n - i |
||||
writePos = 0 |
||||
continue readingloop |
||||
} |
||||
// this is a real broken rune
|
||||
escaped.HasBadRunes = true |
||||
escaped.Escaped = true |
||||
if err = writeBroken(output, bs[i:i+size]); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
writePos += size |
||||
case r == '\n': |
||||
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { |
||||
escaped.BadBIDI = true |
||||
} |
||||
lineHasBIDI = false |
||||
lineHasRTLScript = false |
||||
lineHasLTRScript = false |
||||
|
||||
case r == '\r' || r == '\t' || r == ' ': |
||||
// These are acceptable control characters and space characters
|
||||
case unicode.IsSpace(r): |
||||
escaped.HasSpaces = true |
||||
escaped.Escaped = true |
||||
if writePos < i { |
||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
} |
||||
if err = writeEscaped(output, r); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
writePos = i + size |
||||
case unicode.Is(unicode.Bidi_Control, r): |
||||
escaped.Escaped = true |
||||
escaped.HasBIDI = true |
||||
if writePos < i { |
||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
} |
||||
lineHasBIDI = true |
||||
if err = writeEscaped(output, r); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
writePos = i + size |
||||
case unicode.Is(unicode.C, r): |
||||
escaped.Escaped = true |
||||
escaped.HasControls = true |
||||
if writePos < i { |
||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
} |
||||
if err = writeEscaped(output, r); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
writePos = i + size |
||||
case unicode.Is(unicode.M, r): |
||||
escaped.Escaped = true |
||||
escaped.HasMarks = true |
||||
if writePos < i { |
||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
} |
||||
if err = writeEscaped(output, r); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
writePos = i + size |
||||
default: |
||||
p, _ := bidi.Lookup(bs[i : i+size]) |
||||
c := p.Class() |
||||
if c == bidi.R || c == bidi.AL { |
||||
lineHasRTLScript = true |
||||
escaped.HasRTLScript = true |
||||
} else if c == bidi.L { |
||||
lineHasLTRScript = true |
||||
escaped.HasLTRScript = true |
||||
} |
||||
} |
||||
i += size |
||||
} |
||||
if n > 0 { |
||||
// we read something...
|
||||
// write everything unwritten
|
||||
if writePos < i { |
||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
} |
||||
|
||||
// reset the starting positions for the next read
|
||||
readStart = 0 |
||||
writePos = 0 |
||||
} |
||||
} |
||||
if readStart > 0 { |
||||
// this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round
|
||||
escaped.Escaped = true |
||||
escaped.HasBadRunes = true |
||||
if err = writeBroken(output, buf[:readStart]); err != nil { |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
} |
||||
if err == io.EOF { |
||||
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { |
||||
escaped.BadBIDI = true |
||||
} |
||||
err = nil |
||||
return |
||||
} |
||||
escaped.HasError = true |
||||
return |
||||
} |
||||
|
||||
// writeBroken writes the bytes of an undecodable sequence to output as
// upper-case hexadecimal wrapped in a "broken-code-point" span.
//
// The angle brackets surrounding the hex digits are written as HTML entities:
// a literal "<FF>" inside the markup would be parsed as a tag by the browser
// and the broken bytes would never be displayed.
func writeBroken(output io.Writer, bs []byte) (err error) {
	_, err = fmt.Fprintf(output, `<span class="broken-code-point">&lt;%X&gt;</span>`, bs)
	return err
}
||||
|
||||
// writeEscaped writes the rune r to output wrapped in an "escaped-code-point"
// span whose data-escaped attribute holds the code point as [U+XXXX]; the rune
// itself is kept inside a nested "char" span.
func writeEscaped(output io.Writer, r rune) error {
	const wrapper = `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`
	_, err := fmt.Fprintf(output, wrapper, r, r)
	return err
}
@ -0,0 +1,17 @@ |
||||
{{/* Dismissible banner about escaped unicode: an error message when .EscapeStatus.BadBIDI is set, otherwise a warning message when .EscapeStatus.Escaped is set; nothing is rendered when neither flag is set. */}}
{{if .EscapeStatus.BadBIDI}}
	<div class="ui error message unicode-escape-prompt">
		<span class="close icon hide-panel button" data-panel-closest=".message">{{svg "octicon-x" 16 "close inside"}}</span>
		<div class="header">
			{{$.root.i18n.Tr "repo.bidi_bad_header"}}
		</div>
		<p>{{$.root.i18n.Tr "repo.bidi_bad_description" | Str2html}}</p>
	</div>
{{else if .EscapeStatus.Escaped}}
	<div class="ui warning message unicode-escape-prompt">
		<span class="close icon hide-panel button" data-panel-closest=".message">{{svg "octicon-x" 16 "close inside"}}</span>
		<div class="header">
			{{$.root.i18n.Tr "repo.unicode_header"}}
		</div>
		<p>{{$.root.i18n.Tr "repo.unicode_description" | Str2html}}</p>
	</div>
{{end}}
@ -0,0 +1,28 @@ |
||||
/**
 * Registers delegated click handlers on the document for the unicode escape
 * controls of a file view:
 *  - `a.escape-button` adds the `unicode-escaped` class to the file view,
 *    hides the clicked element and shows its sibling `a.unescape-button`;
 *  - `a.unescape-button` does the reverse;
 *  - `a.toggle-escape-button` flips the class and swaps the visibility of all
 *    escape / unescape buttons found inside the enclosing file content.
 */
export function initUnicodeEscapeButton() {
  const containerSelector = '.file-content, .non-diff-file-content';
  const viewSelector = '.file-code, .file-view';
  const escapedClass = 'unicode-escaped';

  // Builds the handler shared by the dedicated escape / unescape buttons;
  // `escape` selects the direction of the switch.
  const makeSwitchHandler = (escape) => (e) => {
    e.preventDefault();
    const $clicked = $(e.target);
    const $views = $clicked.parents(containerSelector).find(viewSelector);
    if (escape) {
      $views.addClass(escapedClass);
    } else {
      $views.removeClass(escapedClass);
    }
    // only the clicked element and its direct counterpart change visibility
    $clicked.hide();
    $clicked.siblings(escape ? 'a.unescape-button' : 'a.escape-button').show();
  };

  $(document).on('click', 'a.escape-button', makeSwitchHandler(true));
  $(document).on('click', 'a.unescape-button', makeSwitchHandler(false));

  $(document).on('click', 'a.toggle-escape-button', (e) => {
    e.preventDefault();
    const $container = $(e.target).parents(containerSelector);
    const $views = $container.find(viewSelector);

    if ($views.hasClass(escapedClass)) {
      // currently escaped: go back to the raw view
      $views.removeClass(escapedClass);
      $container.find('a.unescape-button').hide();
      $container.find('a.escape-button').show();
      return;
    }

    // currently raw: switch to the escaped view
    $views.addClass(escapedClass);
    $container.find('a.unescape-button').show();
    $container.find('a.escape-button').hide();
  });
}
Loading…
Reference in new issue