Add warning for BIDI characters in page renders and in diffs (#17562)
	
		
	
				
					
				
			Fix #17514 Given the comments I've adjusted this somewhat. The numbers of characters detected are increased and include things like the use of U+300 to make à instead of à and non-breaking spaces. There is a button which can be used to escape the content to show it. Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Gwyneth Morgan <gwymor@tilde.club> Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>tokarchuk/v1.17
							parent
							
								
									ee60f27aec
								
							
						
					
					
						commit
						21ed4fd8da
					
				@ -0,0 +1,230 @@ | 
				
			||||
// Copyright 2021 The Gitea Authors. All rights reserved.
 | 
				
			||||
// Use of this source code is governed by a MIT-style
 | 
				
			||||
// license that can be found in the LICENSE file.
 | 
				
			||||
 | 
				
			||||
package charset | 
				
			||||
 | 
				
			||||
import ( | 
				
			||||
	"bytes" | 
				
			||||
	"fmt" | 
				
			||||
	"io" | 
				
			||||
	"strings" | 
				
			||||
	"unicode" | 
				
			||||
	"unicode/utf8" | 
				
			||||
 | 
				
			||||
	"golang.org/x/text/unicode/bidi" | 
				
			||||
) | 
				
			||||
 | 
				
			||||
// EscapeStatus represents the findings of the unicode escaper
 | 
				
			||||
type EscapeStatus struct { | 
				
			||||
	Escaped      bool | 
				
			||||
	HasError     bool | 
				
			||||
	HasBadRunes  bool | 
				
			||||
	HasControls  bool | 
				
			||||
	HasSpaces    bool | 
				
			||||
	HasMarks     bool | 
				
			||||
	HasBIDI      bool | 
				
			||||
	BadBIDI      bool | 
				
			||||
	HasRTLScript bool | 
				
			||||
	HasLTRScript bool | 
				
			||||
} | 
				
			||||
 | 
				
			||||
// Or combines two EscapeStatus structs into one representing the conjunction of the two
 | 
				
			||||
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus { | 
				
			||||
	st := status | 
				
			||||
	st.Escaped = st.Escaped || other.Escaped | 
				
			||||
	st.HasError = st.HasError || other.HasError | 
				
			||||
	st.HasBadRunes = st.HasBadRunes || other.HasBadRunes | 
				
			||||
	st.HasControls = st.HasControls || other.HasControls | 
				
			||||
	st.HasSpaces = st.HasSpaces || other.HasSpaces | 
				
			||||
	st.HasMarks = st.HasMarks || other.HasMarks | 
				
			||||
	st.HasBIDI = st.HasBIDI || other.HasBIDI | 
				
			||||
	st.BadBIDI = st.BadBIDI || other.BadBIDI | 
				
			||||
	st.HasRTLScript = st.HasRTLScript || other.HasRTLScript | 
				
			||||
	st.HasLTRScript = st.HasLTRScript || other.HasLTRScript | 
				
			||||
	return st | 
				
			||||
} | 
				
			||||
 | 
				
			||||
// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string
 | 
				
			||||
func EscapeControlString(text string) (EscapeStatus, string) { | 
				
			||||
	sb := &strings.Builder{} | 
				
			||||
	escaped, _ := EscapeControlReader(strings.NewReader(text), sb) | 
				
			||||
	return escaped, sb.String() | 
				
			||||
} | 
				
			||||
 | 
				
			||||
// EscapeControlBytes escapes the unicode control sequences  a provided []byte and returns the findings as an EscapeStatus and the escaped []byte
 | 
				
			||||
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) { | 
				
			||||
	buf := &bytes.Buffer{} | 
				
			||||
	escaped, _ := EscapeControlReader(bytes.NewReader(text), buf) | 
				
			||||
	return escaped, buf.Bytes() | 
				
			||||
} | 
				
			||||
 | 
				
			||||
// EscapeControlReader escapes the unicode control sequences  a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error
 | 
				
			||||
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) { | 
				
			||||
	buf := make([]byte, 4096) | 
				
			||||
	readStart := 0 | 
				
			||||
	var n int | 
				
			||||
	var writePos int | 
				
			||||
 | 
				
			||||
	lineHasBIDI := false | 
				
			||||
	lineHasRTLScript := false | 
				
			||||
	lineHasLTRScript := false | 
				
			||||
 | 
				
			||||
readingloop: | 
				
			||||
	for err == nil { | 
				
			||||
		n, err = text.Read(buf[readStart:]) | 
				
			||||
		bs := buf[:n+readStart] | 
				
			||||
		i := 0 | 
				
			||||
 | 
				
			||||
		for i < len(bs) { | 
				
			||||
			r, size := utf8.DecodeRune(bs[i:]) | 
				
			||||
			// Now handle the codepoints
 | 
				
			||||
			switch { | 
				
			||||
			case r == utf8.RuneError: | 
				
			||||
				if writePos < i { | 
				
			||||
					if _, err = output.Write(bs[writePos:i]); err != nil { | 
				
			||||
						escaped.HasError = true | 
				
			||||
						return | 
				
			||||
					} | 
				
			||||
					writePos = i | 
				
			||||
				} | 
				
			||||
				// runes can be at most 4 bytes - so...
 | 
				
			||||
				if len(bs)-i <= 3 { | 
				
			||||
					// if not request more data
 | 
				
			||||
					copy(buf, bs[i:]) | 
				
			||||
					readStart = n - i | 
				
			||||
					writePos = 0 | 
				
			||||
					continue readingloop | 
				
			||||
				} | 
				
			||||
				// this is a real broken rune
 | 
				
			||||
				escaped.HasBadRunes = true | 
				
			||||
				escaped.Escaped = true | 
				
			||||
				if err = writeBroken(output, bs[i:i+size]); err != nil { | 
				
			||||
					escaped.HasError = true | 
				
			||||
					return | 
				
			||||
				} | 
				
			||||
				writePos += size | 
				
			||||
			case r == '\n': | 
				
			||||
				if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { | 
				
			||||
					escaped.BadBIDI = true | 
				
			||||
				} | 
				
			||||
				lineHasBIDI = false | 
				
			||||
				lineHasRTLScript = false | 
				
			||||
				lineHasLTRScript = false | 
				
			||||
 | 
				
			||||
			case r == '\r' || r == '\t' || r == ' ': | 
				
			||||
				// These are acceptable control characters and space characters
 | 
				
			||||
			case unicode.IsSpace(r): | 
				
			||||
				escaped.HasSpaces = true | 
				
			||||
				escaped.Escaped = true | 
				
			||||
				if writePos < i { | 
				
			||||
					if _, err = output.Write(bs[writePos:i]); err != nil { | 
				
			||||
						escaped.HasError = true | 
				
			||||
						return | 
				
			||||
					} | 
				
			||||
				} | 
				
			||||
				if err = writeEscaped(output, r); err != nil { | 
				
			||||
					escaped.HasError = true | 
				
			||||
					return | 
				
			||||
				} | 
				
			||||
				writePos = i + size | 
				
			||||
			case unicode.Is(unicode.Bidi_Control, r): | 
				
			||||
				escaped.Escaped = true | 
				
			||||
				escaped.HasBIDI = true | 
				
			||||
				if writePos < i { | 
				
			||||
					if _, err = output.Write(bs[writePos:i]); err != nil { | 
				
			||||
						escaped.HasError = true | 
				
			||||
						return | 
				
			||||
					} | 
				
			||||
				} | 
				
			||||
				lineHasBIDI = true | 
				
			||||
				if err = writeEscaped(output, r); err != nil { | 
				
			||||
					escaped.HasError = true | 
				
			||||
					return | 
				
			||||
				} | 
				
			||||
				writePos = i + size | 
				
			||||
			case unicode.Is(unicode.C, r): | 
				
			||||
				escaped.Escaped = true | 
				
			||||
				escaped.HasControls = true | 
				
			||||
				if writePos < i { | 
				
			||||
					if _, err = output.Write(bs[writePos:i]); err != nil { | 
				
			||||
						escaped.HasError = true | 
				
			||||
						return | 
				
			||||
					} | 
				
			||||
				} | 
				
			||||
				if err = writeEscaped(output, r); err != nil { | 
				
			||||
					escaped.HasError = true | 
				
			||||
					return | 
				
			||||
				} | 
				
			||||
				writePos = i + size | 
				
			||||
			case unicode.Is(unicode.M, r): | 
				
			||||
				escaped.Escaped = true | 
				
			||||
				escaped.HasMarks = true | 
				
			||||
				if writePos < i { | 
				
			||||
					if _, err = output.Write(bs[writePos:i]); err != nil { | 
				
			||||
						escaped.HasError = true | 
				
			||||
						return | 
				
			||||
					} | 
				
			||||
				} | 
				
			||||
				if err = writeEscaped(output, r); err != nil { | 
				
			||||
					escaped.HasError = true | 
				
			||||
					return | 
				
			||||
				} | 
				
			||||
				writePos = i + size | 
				
			||||
			default: | 
				
			||||
				p, _ := bidi.Lookup(bs[i : i+size]) | 
				
			||||
				c := p.Class() | 
				
			||||
				if c == bidi.R || c == bidi.AL { | 
				
			||||
					lineHasRTLScript = true | 
				
			||||
					escaped.HasRTLScript = true | 
				
			||||
				} else if c == bidi.L { | 
				
			||||
					lineHasLTRScript = true | 
				
			||||
					escaped.HasLTRScript = true | 
				
			||||
				} | 
				
			||||
			} | 
				
			||||
			i += size | 
				
			||||
		} | 
				
			||||
		if n > 0 { | 
				
			||||
			// we read something...
 | 
				
			||||
			// write everything unwritten
 | 
				
			||||
			if writePos < i { | 
				
			||||
				if _, err = output.Write(bs[writePos:i]); err != nil { | 
				
			||||
					escaped.HasError = true | 
				
			||||
					return | 
				
			||||
				} | 
				
			||||
			} | 
				
			||||
 | 
				
			||||
			// reset the starting positions for the next read
 | 
				
			||||
			readStart = 0 | 
				
			||||
			writePos = 0 | 
				
			||||
		} | 
				
			||||
	} | 
				
			||||
	if readStart > 0 { | 
				
			||||
		// this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round
 | 
				
			||||
		escaped.Escaped = true | 
				
			||||
		escaped.HasBadRunes = true | 
				
			||||
		if err = writeBroken(output, buf[:readStart]); err != nil { | 
				
			||||
			escaped.HasError = true | 
				
			||||
			return | 
				
			||||
		} | 
				
			||||
	} | 
				
			||||
	if err == io.EOF { | 
				
			||||
		if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { | 
				
			||||
			escaped.BadBIDI = true | 
				
			||||
		} | 
				
			||||
		err = nil | 
				
			||||
		return | 
				
			||||
	} | 
				
			||||
	escaped.HasError = true | 
				
			||||
	return | 
				
			||||
} | 
				
			||||
 | 
				
			||||
func writeBroken(output io.Writer, bs []byte) (err error) { | 
				
			||||
	_, err = fmt.Fprintf(output, `<span class="broken-code-point"><%X></span>`, bs) | 
				
			||||
	return | 
				
			||||
} | 
				
			||||
 | 
				
			||||
func writeEscaped(output io.Writer, r rune) (err error) { | 
				
			||||
	_, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r) | 
				
			||||
	return | 
				
			||||
} | 
				
			||||
@ -0,0 +1,17 @@ | 
				
			||||
{{if .EscapeStatus.BadBIDI}} | 
				
			||||
<div class="ui error message unicode-escape-prompt"> | 
				
			||||
	<span class="close icon hide-panel button" data-panel-closest=".message">{{svg "octicon-x" 16 "close inside"}}</span> | 
				
			||||
	<div class="header"> | 
				
			||||
		{{$.root.i18n.Tr "repo.bidi_bad_header"}} | 
				
			||||
	</div> | 
				
			||||
	<p>{{$.root.i18n.Tr "repo.bidi_bad_description" | Str2html}}</p> | 
				
			||||
</div> | 
				
			||||
{{else if .EscapeStatus.Escaped}} | 
				
			||||
<div class="ui warning message unicode-escape-prompt"> | 
				
			||||
	<span class="close icon hide-panel button" data-panel-closest=".message">{{svg "octicon-x" 16 "close inside"}}</span> | 
				
			||||
	<div class="header"> | 
				
			||||
		{{$.root.i18n.Tr "repo.unicode_header"}} | 
				
			||||
	</div> | 
				
			||||
	<p>{{$.root.i18n.Tr "repo.unicode_description" | Str2html}}</p> | 
				
			||||
</div> | 
				
			||||
{{end}} | 
				
			||||
@ -0,0 +1,28 @@ | 
				
			||||
export function initUnicodeEscapeButton() { | 
				
			||||
  $(document).on('click', 'a.escape-button', (e) => { | 
				
			||||
    e.preventDefault(); | 
				
			||||
    $(e.target).parents('.file-content, .non-diff-file-content').find('.file-code, .file-view').addClass('unicode-escaped'); | 
				
			||||
    $(e.target).hide(); | 
				
			||||
    $(e.target).siblings('a.unescape-button').show(); | 
				
			||||
  }); | 
				
			||||
  $(document).on('click', 'a.unescape-button', (e) => { | 
				
			||||
    e.preventDefault(); | 
				
			||||
    $(e.target).parents('.file-content, .non-diff-file-content').find('.file-code, .file-view').removeClass('unicode-escaped'); | 
				
			||||
    $(e.target).hide(); | 
				
			||||
    $(e.target).siblings('a.escape-button').show(); | 
				
			||||
  }); | 
				
			||||
  $(document).on('click', 'a.toggle-escape-button', (e) => { | 
				
			||||
    e.preventDefault(); | 
				
			||||
    const fileContent = $(e.target).parents('.file-content, .non-diff-file-content'); | 
				
			||||
    const fileView = fileContent.find('.file-code, .file-view'); | 
				
			||||
    if (fileView.hasClass('unicode-escaped')) { | 
				
			||||
      fileView.removeClass('unicode-escaped'); | 
				
			||||
      fileContent.find('a.unescape-button').hide(); | 
				
			||||
      fileContent.find('a.escape-button').show(); | 
				
			||||
    } else { | 
				
			||||
      fileView.addClass('unicode-escaped'); | 
				
			||||
      fileContent.find('a.unescape-button').show(); | 
				
			||||
      fileContent.find('a.escape-button').hide(); | 
				
			||||
    } | 
				
			||||
  }); | 
				
			||||
} | 
				
			||||
					Loading…
					
					
				
		Reference in new issue