Switch Unicode Escaping to a VSCode-like system (#19990)
This PR rewrites the invisible Unicode detection algorithm to more closely match that of the Monaco editor on the system. It provides a technique for detecting ambiguous characters and relaxes the detection of combining marks. In addition, control characters are detected as invisible in this implementation, whereas they are not in Monaco, but this is related to font issues. Close #19913 Signed-off-by: Andrew Thornton <art27@cantab.net> tokarchuk/v1.18
parent
11dc6df5be
commit
99efa02edf
@ -0,0 +1,54 @@ |
|||||||
|
// This file is generated by modules/charset/ambiguous/generate.go DO NOT EDIT
|
||||||
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package charset |
||||||
|
|
||||||
|
import ( |
||||||
|
"sort" |
||||||
|
"strings" |
||||||
|
"unicode" |
||||||
|
|
||||||
|
"code.gitea.io/gitea/modules/translation" |
||||||
|
) |
||||||
|
|
||||||
|
// AmbiguousTablesForLocale provides the table of ambiguous characters for this locale.
|
||||||
|
func AmbiguousTablesForLocale(locale translation.Locale) []*AmbiguousTable { |
||||||
|
key := locale.Language() |
||||||
|
var table *AmbiguousTable |
||||||
|
var ok bool |
||||||
|
for len(key) > 0 { |
||||||
|
if table, ok = AmbiguousCharacters[key]; ok { |
||||||
|
break |
||||||
|
} |
||||||
|
idx := strings.LastIndexAny(key, "-_") |
||||||
|
if idx < 0 { |
||||||
|
key = "" |
||||||
|
} else { |
||||||
|
key = key[:idx] |
||||||
|
} |
||||||
|
} |
||||||
|
if table == nil { |
||||||
|
table = AmbiguousCharacters["_default"] |
||||||
|
} |
||||||
|
|
||||||
|
return []*AmbiguousTable{ |
||||||
|
table, |
||||||
|
AmbiguousCharacters["_common"], |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func isAmbiguous(r rune, confusableTo *rune, tables ...*AmbiguousTable) bool { |
||||||
|
for _, table := range tables { |
||||||
|
if !unicode.Is(table.RangeTable, r) { |
||||||
|
continue |
||||||
|
} |
||||||
|
i := sort.Search(len(table.Confusable), func(i int) bool { |
||||||
|
return table.Confusable[i] >= r |
||||||
|
}) |
||||||
|
(*confusableTo) = table.With[i] |
||||||
|
return true |
||||||
|
} |
||||||
|
return false |
||||||
|
} |
File diff suppressed because one or more lines are too long
@ -0,0 +1,178 @@ |
|||||||
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package main |
||||||
|
|
||||||
|
import ( |
||||||
|
"bytes" |
||||||
|
"flag" |
||||||
|
"fmt" |
||||||
|
"go/format" |
||||||
|
"os" |
||||||
|
"sort" |
||||||
|
"text/template" |
||||||
|
"unicode" |
||||||
|
|
||||||
|
"code.gitea.io/gitea/modules/json" |
||||||
|
|
||||||
|
"golang.org/x/text/unicode/rangetable" |
||||||
|
) |
||||||
|
|
||||||
|
// ambiguous.json provides a one to one mapping of ambiguous characters to other characters
|
||||||
|
// See https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json
|
||||||
|
|
||||||
|
// AmbiguousTable holds the confusable characters read from the input JSON for
// one locale, in the shape the generator template writes out.
type AmbiguousTable struct {
	// Confusable lists the ambiguous runes; main fills it from pairs sorted
	// ascending by Confusable.
	Confusable []rune
	// With is parallel to Confusable: With[i] is the rune paired with
	// Confusable[i].
	With []rune
	// Locale is the key this table had in the JSON object.
	Locale string
	// RangeTable is built from Confusable with rangetable.New.
	RangeTable *unicode.RangeTable
}
||||||
|
|
||||||
|
// RunePair is one (confusable, replacement) entry from the input data; main
// uses a slice of these so both runes stay together while sorting.
type RunePair struct {
	// Confusable is the ambiguous rune (the even-indexed value of each pair).
	Confusable rune
	// With is the rune it is paired with (the odd-indexed value).
	With rune
}
||||||
|
|
||||||
|
// verbose is set by the -v flag and gates the output of verbosef.
var verbose bool
||||||
|
|
||||||
|
func main() { |
||||||
|
flag.Usage = func() { |
||||||
|
fmt.Fprintf(os.Stderr, `%s: Generate AmbiguousCharacter |
||||||
|
|
||||||
|
Usage: %[1]s [-v] [-o output.go] ambiguous.json |
||||||
|
`, os.Args[0]) |
||||||
|
flag.PrintDefaults() |
||||||
|
} |
||||||
|
|
||||||
|
output := "" |
||||||
|
flag.BoolVar(&verbose, "v", false, "verbose output") |
||||||
|
flag.StringVar(&output, "o", "ambiguous_gen.go", "file to output to") |
||||||
|
flag.Parse() |
||||||
|
input := flag.Arg(0) |
||||||
|
if input == "" { |
||||||
|
input = "ambiguous.json" |
||||||
|
} |
||||||
|
|
||||||
|
bs, err := os.ReadFile(input) |
||||||
|
if err != nil { |
||||||
|
fatalf("Unable to read: %s Err: %v", input, err) |
||||||
|
} |
||||||
|
|
||||||
|
var unwrapped string |
||||||
|
if err := json.Unmarshal(bs, &unwrapped); err != nil { |
||||||
|
fatalf("Unable to unwrap content in: %s Err: %v", input, err) |
||||||
|
} |
||||||
|
|
||||||
|
fromJSON := map[string][]uint32{} |
||||||
|
if err := json.Unmarshal([]byte(unwrapped), &fromJSON); err != nil { |
||||||
|
fatalf("Unable to unmarshal content in: %s Err: %v", input, err) |
||||||
|
} |
||||||
|
|
||||||
|
tables := make([]*AmbiguousTable, 0, len(fromJSON)) |
||||||
|
for locale, chars := range fromJSON { |
||||||
|
table := &AmbiguousTable{Locale: locale} |
||||||
|
table.Confusable = make([]rune, 0, len(chars)/2) |
||||||
|
table.With = make([]rune, 0, len(chars)/2) |
||||||
|
pairs := make([]RunePair, len(chars)/2) |
||||||
|
for i := 0; i < len(chars); i += 2 { |
||||||
|
pairs[i/2].Confusable, pairs[i/2].With = rune(chars[i]), rune(chars[i+1]) |
||||||
|
} |
||||||
|
sort.Slice(pairs, func(i, j int) bool { |
||||||
|
return pairs[i].Confusable < pairs[j].Confusable |
||||||
|
}) |
||||||
|
for _, pair := range pairs { |
||||||
|
table.Confusable = append(table.Confusable, pair.Confusable) |
||||||
|
table.With = append(table.With, pair.With) |
||||||
|
} |
||||||
|
table.RangeTable = rangetable.New(table.Confusable...) |
||||||
|
tables = append(tables, table) |
||||||
|
} |
||||||
|
sort.Slice(tables, func(i, j int) bool { |
||||||
|
return tables[i].Locale < tables[j].Locale |
||||||
|
}) |
||||||
|
data := map[string]interface{}{ |
||||||
|
"Tables": tables, |
||||||
|
} |
||||||
|
|
||||||
|
if err := runTemplate(generatorTemplate, output, &data); err != nil { |
||||||
|
fatalf("Unable to run template: %v", err) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func runTemplate(t *template.Template, filename string, data interface{}) error { |
||||||
|
buf := bytes.NewBuffer(nil) |
||||||
|
if err := t.Execute(buf, data); err != nil { |
||||||
|
return fmt.Errorf("unable to execute template: %w", err) |
||||||
|
} |
||||||
|
bs, err := format.Source(buf.Bytes()) |
||||||
|
if err != nil { |
||||||
|
verbosef("Bad source:\n%s", buf.String()) |
||||||
|
return fmt.Errorf("unable to format source: %w", err) |
||||||
|
} |
||||||
|
file, err := os.Create(filename) |
||||||
|
if err != nil { |
||||||
|
return fmt.Errorf("failed to create file %s because %w", filename, err) |
||||||
|
} |
||||||
|
defer file.Close() |
||||||
|
_, err = file.Write(bs) |
||||||
|
if err != nil { |
||||||
|
return fmt.Errorf("unable to write generated source: %w", err) |
||||||
|
} |
||||||
|
return nil |
||||||
|
} |
||||||
|
|
||||||
|
// generatorTemplate renders the generated Go source: the AmbiguousTable type
// and the AmbiguousCharacters map with one entry per element of .Tables.
// main passes it to runTemplate, which runs the rendered text through
// go/format before writing it out.
var generatorTemplate = template.Must(template.New("ambiguousTemplate").Parse(`// This file is generated by modules/charset/ambiguous/generate.go DO NOT EDIT
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package charset

import "unicode"

// This file is generated from https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json

// AmbiguousTable matches a confusable rune with its partner for the Locale
type AmbiguousTable struct {
	Confusable []rune
	With []rune
	Locale string
	RangeTable *unicode.RangeTable
}

// AmbiguousCharacters provides a map by locale name to the confusable characters in that locale
var AmbiguousCharacters = map[string]*AmbiguousTable{
	{{range .Tables}}{{printf "%q:" .Locale}} {
			Confusable: []rune{ {{range .Confusable}}{{.}},{{end}} },
			With: []rune{ {{range .With}}{{.}},{{end}} },
			Locale: {{printf "%q" .Locale}},
			RangeTable: &unicode.RangeTable{
				R16: []unicode.Range16{
			{{range .RangeTable.R16 }}		{Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
			{{end}}	},
				R32: []unicode.Range32{
			{{range .RangeTable.R32}}		{Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
			{{end}}	},
				LatinOffset: {{.RangeTable.LatinOffset}},
			},
		},
	{{end}}
}

`))
||||||
|
|
||||||
|
// logf writes the formatted message to standard error, terminated by a
// newline.
func logf(format string, args ...interface{}) {
	line := format + "\n"
	fmt.Fprintf(os.Stderr, line, args...)
}
||||||
|
|
||||||
|
func verbosef(format string, args ...interface{}) { |
||||||
|
if verbose { |
||||||
|
logf(format, args...) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func fatalf(format string, args ...interface{}) { |
||||||
|
logf("fatal: "+format+"\n", args...) |
||||||
|
os.Exit(1) |
||||||
|
} |
File diff suppressed because one or more lines are too long
@ -0,0 +1,32 @@ |
|||||||
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package charset |
||||||
|
|
||||||
|
import ( |
||||||
|
"sort" |
||||||
|
"testing" |
||||||
|
"unicode" |
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert" |
||||||
|
) |
||||||
|
|
||||||
|
func TestAmbiguousCharacters(t *testing.T) { |
||||||
|
for locale, ambiguous := range AmbiguousCharacters { |
||||||
|
assert.Equal(t, locale, ambiguous.Locale) |
||||||
|
assert.Equal(t, len(ambiguous.Confusable), len(ambiguous.With)) |
||||||
|
assert.True(t, sort.SliceIsSorted(ambiguous.Confusable, func(i, j int) bool { |
||||||
|
return ambiguous.Confusable[i] < ambiguous.Confusable[j] |
||||||
|
})) |
||||||
|
|
||||||
|
for _, confusable := range ambiguous.Confusable { |
||||||
|
assert.True(t, unicode.Is(ambiguous.RangeTable, confusable)) |
||||||
|
i := sort.Search(len(ambiguous.Confusable), func(j int) bool { |
||||||
|
return ambiguous.Confusable[j] >= confusable |
||||||
|
}) |
||||||
|
found := i < len(ambiguous.Confusable) && ambiguous.Confusable[i] == confusable |
||||||
|
assert.True(t, found, "%c is not in %d", confusable, i) |
||||||
|
} |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,44 @@ |
|||||||
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package charset |
||||||
|
|
||||||
|
import ( |
||||||
|
"bytes" |
||||||
|
"io" |
||||||
|
) |
||||||
|
|
||||||
|
// BreakWriter wraps an io.Writer and emits "<br>" in place of every '\n'
// written through it.
type BreakWriter struct {
	io.Writer
}

// Write forwards bs to the wrapped writer, substituting "<br>" for each '\n'.
//
// The returned count is expressed in bytes of bs consumed: each newline counts
// as one byte regardless of the four bytes written for it. On the first error
// from the wrapped writer, the count so far and that error are returned.
func (b *BreakWriter) Write(bs []byte) (n int, err error) {
	rest := bs
	for len(rest) > 0 {
		nl := bytes.IndexByte(rest, '\n')
		if nl < 0 {
			// no further newlines: the tail goes out unchanged
			written, werr := b.Writer.Write(rest)
			return n + written, werr
		}

		if nl > 0 {
			written, werr := b.Writer.Write(rest[:nl])
			n += written
			if werr != nil {
				return n, werr
			}
		}

		if _, werr := b.Writer.Write([]byte("<br>")); werr != nil {
			return n, werr
		}
		n++ // account for the consumed '\n'
		rest = rest[nl+1:]
	}

	return n, nil
}
@ -0,0 +1,69 @@ |
|||||||
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package charset |
||||||
|
|
||||||
|
import ( |
||||||
|
"strings" |
||||||
|
"testing" |
||||||
|
) |
||||||
|
|
||||||
|
func TestBreakWriter_Write(t *testing.T) { |
||||||
|
tests := []struct { |
||||||
|
name string |
||||||
|
kase string |
||||||
|
expect string |
||||||
|
wantErr bool |
||||||
|
}{ |
||||||
|
{ |
||||||
|
name: "noline", |
||||||
|
kase: "abcdefghijklmnopqrstuvwxyz", |
||||||
|
expect: "abcdefghijklmnopqrstuvwxyz", |
||||||
|
}, |
||||||
|
{ |
||||||
|
name: "endline", |
||||||
|
kase: "abcdefghijklmnopqrstuvwxyz\n", |
||||||
|
expect: "abcdefghijklmnopqrstuvwxyz<br>", |
||||||
|
}, |
||||||
|
{ |
||||||
|
name: "startline", |
||||||
|
kase: "\nabcdefghijklmnopqrstuvwxyz", |
||||||
|
expect: "<br>abcdefghijklmnopqrstuvwxyz", |
||||||
|
}, |
||||||
|
{ |
||||||
|
name: "onlyline", |
||||||
|
kase: "\n\n\n", |
||||||
|
expect: "<br><br><br>", |
||||||
|
}, |
||||||
|
{ |
||||||
|
name: "empty", |
||||||
|
kase: "", |
||||||
|
expect: "", |
||||||
|
}, |
||||||
|
{ |
||||||
|
name: "midline", |
||||||
|
kase: "\nabc\ndefghijkl\nmnopqrstuvwxy\nz", |
||||||
|
expect: "<br>abc<br>defghijkl<br>mnopqrstuvwxy<br>z", |
||||||
|
}, |
||||||
|
} |
||||||
|
for _, tt := range tests { |
||||||
|
t.Run(tt.name, func(t *testing.T) { |
||||||
|
buf := &strings.Builder{} |
||||||
|
b := &BreakWriter{ |
||||||
|
Writer: buf, |
||||||
|
} |
||||||
|
n, err := b.Write([]byte(tt.kase)) |
||||||
|
if (err != nil) != tt.wantErr { |
||||||
|
t.Errorf("BreakWriter.Write() error = %v, wantErr %v", err, tt.wantErr) |
||||||
|
return |
||||||
|
} |
||||||
|
if n != len(tt.kase) { |
||||||
|
t.Errorf("BreakWriter.Write() = %v, want %v", n, len(tt.kase)) |
||||||
|
} |
||||||
|
if buf.String() != tt.expect { |
||||||
|
t.Errorf("BreakWriter.Write() wrote %q, want %v", buf.String(), tt.expect) |
||||||
|
} |
||||||
|
}) |
||||||
|
} |
||||||
|
} |
@ -1,236 +1,58 @@ |
|||||||
// Copyright 2021 The Gitea Authors. All rights reserved.
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
//go:generate go run invisible/generate.go -v -o ./invisible_gen.go
|
||||||
|
|
||||||
|
//go:generate go run ambiguous/generate.go -v -o ./ambiguous_gen.go ambiguous/ambiguous.json
|
||||||
|
|
||||||
package charset |
package charset |
||||||
|
|
||||||
import ( |
import ( |
||||||
"bytes" |
|
||||||
"fmt" |
|
||||||
"io" |
"io" |
||||||
"strings" |
"strings" |
||||||
"unicode" |
|
||||||
"unicode/utf8" |
|
||||||
|
|
||||||
"golang.org/x/text/unicode/bidi" |
"code.gitea.io/gitea/modules/log" |
||||||
|
"code.gitea.io/gitea/modules/translation" |
||||||
) |
) |
||||||
|
|
||||||
// EscapeStatus represents the findings of the unicode escaper
|
// RuneNBSP is the codepoint for NBSP
|
||||||
type EscapeStatus struct { |
const RuneNBSP = 0xa0 |
||||||
Escaped bool |
|
||||||
HasError bool |
|
||||||
HasBadRunes bool |
|
||||||
HasControls bool |
|
||||||
HasSpaces bool |
|
||||||
HasMarks bool |
|
||||||
HasBIDI bool |
|
||||||
BadBIDI bool |
|
||||||
HasRTLScript bool |
|
||||||
HasLTRScript bool |
|
||||||
} |
|
||||||
|
|
||||||
// Or combines two EscapeStatus structs into one representing the conjunction of the two
|
|
||||||
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus { |
|
||||||
st := status |
|
||||||
st.Escaped = st.Escaped || other.Escaped |
|
||||||
st.HasError = st.HasError || other.HasError |
|
||||||
st.HasBadRunes = st.HasBadRunes || other.HasBadRunes |
|
||||||
st.HasControls = st.HasControls || other.HasControls |
|
||||||
st.HasSpaces = st.HasSpaces || other.HasSpaces |
|
||||||
st.HasMarks = st.HasMarks || other.HasMarks |
|
||||||
st.HasBIDI = st.HasBIDI || other.HasBIDI |
|
||||||
st.BadBIDI = st.BadBIDI || other.BadBIDI |
|
||||||
st.HasRTLScript = st.HasRTLScript || other.HasRTLScript |
|
||||||
st.HasLTRScript = st.HasLTRScript || other.HasLTRScript |
|
||||||
return st |
|
||||||
} |
|
||||||
|
|
||||||
// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string
|
// EscapeControlHTML escapes the unicode control sequences in a provided html document
|
||||||
func EscapeControlString(text string) (EscapeStatus, string) { |
func EscapeControlHTML(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) { |
||||||
sb := &strings.Builder{} |
sb := &strings.Builder{} |
||||||
escaped, _ := EscapeControlReader(strings.NewReader(text), sb) |
outputStream := &HTMLStreamerWriter{Writer: sb} |
||||||
return escaped, sb.String() |
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer) |
||||||
} |
|
||||||
|
|
||||||
// EscapeControlBytes escapes the unicode control sequences a provided []byte and returns the findings as an EscapeStatus and the escaped []byte
|
if err := StreamHTML(strings.NewReader(text), streamer); err != nil { |
||||||
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) { |
streamer.escaped.HasError = true |
||||||
buf := &bytes.Buffer{} |
log.Error("Error whilst escaping: %v", err) |
||||||
escaped, _ := EscapeControlReader(bytes.NewReader(text), buf) |
} |
||||||
return escaped, buf.Bytes() |
return streamer.escaped, sb.String() |
||||||
} |
} |
||||||
|
|
||||||
// EscapeControlReader escapes the unicode control sequences a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error
|
// EscapeControlReader escapes the unicode control sequences read from the provided reader for the given locale, writing the escaped output to the writer, and returns the findings as an EscapeStatus and an error
|
||||||
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) { |
func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) { |
||||||
buf := make([]byte, 4096) |
outputStream := &HTMLStreamerWriter{Writer: writer} |
||||||
readStart := 0 |
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer) |
||||||
runeCount := 0 |
|
||||||
var n int |
|
||||||
var writePos int |
|
||||||
|
|
||||||
lineHasBIDI := false |
|
||||||
lineHasRTLScript := false |
|
||||||
lineHasLTRScript := false |
|
||||||
|
|
||||||
readingloop: |
|
||||||
for err == nil { |
|
||||||
n, err = text.Read(buf[readStart:]) |
|
||||||
bs := buf[:n+readStart] |
|
||||||
n = len(bs) |
|
||||||
i := 0 |
|
||||||
|
|
||||||
for i < len(bs) { |
if err = StreamHTML(reader, streamer); err != nil { |
||||||
r, size := utf8.DecodeRune(bs[i:]) |
streamer.escaped.HasError = true |
||||||
runeCount++ |
log.Error("Error whilst escaping: %v", err) |
||||||
|
|
||||||
// Now handle the codepoints
|
|
||||||
switch { |
|
||||||
case r == utf8.RuneError: |
|
||||||
if writePos < i { |
|
||||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
writePos = i |
|
||||||
} |
|
||||||
// runes can be at most 4 bytes - so...
|
|
||||||
if len(bs)-i <= 3 { |
|
||||||
// if not request more data
|
|
||||||
copy(buf, bs[i:]) |
|
||||||
readStart = n - i |
|
||||||
writePos = 0 |
|
||||||
continue readingloop |
|
||||||
} |
|
||||||
// this is a real broken rune
|
|
||||||
escaped.HasBadRunes = true |
|
||||||
escaped.Escaped = true |
|
||||||
if err = writeBroken(output, bs[i:i+size]); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
writePos += size |
|
||||||
case r == '\n': |
|
||||||
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { |
|
||||||
escaped.BadBIDI = true |
|
||||||
} |
|
||||||
lineHasBIDI = false |
|
||||||
lineHasRTLScript = false |
|
||||||
lineHasLTRScript = false |
|
||||||
|
|
||||||
case runeCount == 1 && r == 0xFEFF: // UTF BOM
|
|
||||||
// the first BOM is safe
|
|
||||||
case r == '\r' || r == '\t' || r == ' ': |
|
||||||
// These are acceptable control characters and space characters
|
|
||||||
case unicode.IsSpace(r): |
|
||||||
escaped.HasSpaces = true |
|
||||||
escaped.Escaped = true |
|
||||||
if writePos < i { |
|
||||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
} |
|
||||||
if err = writeEscaped(output, r); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
writePos = i + size |
|
||||||
case unicode.Is(unicode.Bidi_Control, r): |
|
||||||
escaped.Escaped = true |
|
||||||
escaped.HasBIDI = true |
|
||||||
if writePos < i { |
|
||||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
} |
|
||||||
lineHasBIDI = true |
|
||||||
if err = writeEscaped(output, r); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
writePos = i + size |
|
||||||
case unicode.Is(unicode.C, r): |
|
||||||
escaped.Escaped = true |
|
||||||
escaped.HasControls = true |
|
||||||
if writePos < i { |
|
||||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
} |
|
||||||
if err = writeEscaped(output, r); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
writePos = i + size |
|
||||||
case unicode.Is(unicode.M, r): |
|
||||||
escaped.Escaped = true |
|
||||||
escaped.HasMarks = true |
|
||||||
if writePos < i { |
|
||||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
} |
|
||||||
if err = writeEscaped(output, r); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
writePos = i + size |
|
||||||
default: |
|
||||||
p, _ := bidi.Lookup(bs[i : i+size]) |
|
||||||
c := p.Class() |
|
||||||
if c == bidi.R || c == bidi.AL { |
|
||||||
lineHasRTLScript = true |
|
||||||
escaped.HasRTLScript = true |
|
||||||
} else if c == bidi.L { |
|
||||||
lineHasLTRScript = true |
|
||||||
escaped.HasLTRScript = true |
|
||||||
} |
|
||||||
} |
|
||||||
i += size |
|
||||||
} |
|
||||||
if n > 0 { |
|
||||||
// we read something...
|
|
||||||
// write everything unwritten
|
|
||||||
if writePos < i { |
|
||||||
if _, err = output.Write(bs[writePos:i]); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// reset the starting positions for the next read
|
|
||||||
readStart = 0 |
|
||||||
writePos = 0 |
|
||||||
} |
|
||||||
} |
|
||||||
if readStart > 0 { |
|
||||||
// this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round
|
|
||||||
escaped.Escaped = true |
|
||||||
escaped.HasBadRunes = true |
|
||||||
if err = writeBroken(output, buf[:readStart]); err != nil { |
|
||||||
escaped.HasError = true |
|
||||||
return |
|
||||||
} |
|
||||||
} |
|
||||||
if err == io.EOF { |
|
||||||
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { |
|
||||||
escaped.BadBIDI = true |
|
||||||
} |
|
||||||
err = nil |
|
||||||
return |
|
||||||
} |
} |
||||||
escaped.HasError = true |
return streamer.escaped, err |
||||||
return escaped, err |
|
||||||
} |
} |
||||||
|
|
||||||
func writeBroken(output io.Writer, bs []byte) (err error) { |
// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string
|
||||||
_, err = fmt.Fprintf(output, `<span class="broken-code-point"><%X></span>`, bs) |
func EscapeControlString(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) { |
||||||
return err |
sb := &strings.Builder{} |
||||||
} |
outputStream := &HTMLStreamerWriter{Writer: sb} |
||||||
|
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer) |
||||||
|
|
||||||
func writeEscaped(output io.Writer, r rune) (err error) { |
if err := streamer.Text(text); err != nil { |
||||||
_, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r) |
streamer.escaped.HasError = true |
||||||
return err |
log.Error("Error whilst escaping: %v", err) |
||||||
|
} |
||||||
|
return streamer.escaped, sb.String() |
||||||
} |
} |
||||||
|
@ -0,0 +1,28 @@ |
|||||||
|
// Copyright 2021 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package charset |
||||||
|
|
||||||
|
// EscapeStatus represents the findings of the unicode escaper
type EscapeStatus struct {
	// Escaped is set whenever at least one rune was escaped.
	Escaped bool
	// HasError is set when the escaping process itself failed.
	HasError bool
	// HasBadRunes is set when a broken (undecodable) rune was found.
	HasBadRunes bool
	// HasInvisible is set when an invisible rune was found.
	HasInvisible bool
	// HasAmbiguous is set when an ambiguous (confusable) rune was found.
	HasAmbiguous bool
}

// Or combines two EscapeStatus structs into one representing the disjunction
// of the two: each flag in the result is set if it is set in either input.
//
// The receiver is updated in place and returned. A nil receiver is treated as
// an all-false status and a fresh struct is returned instead; a nil other is
// treated as all-false and leaves the receiver's flags unchanged.
func (status *EscapeStatus) Or(other *EscapeStatus) *EscapeStatus {
	st := status
	if st == nil {
		st = &EscapeStatus{}
	}
	if other == nil {
		return st
	}
	st.Escaped = st.Escaped || other.Escaped
	st.HasError = st.HasError || other.HasError
	st.HasBadRunes = st.HasBadRunes || other.HasBadRunes
	st.HasAmbiguous = st.HasAmbiguous || other.HasAmbiguous
	st.HasInvisible = st.HasInvisible || other.HasInvisible
	return st
}
@ -0,0 +1,297 @@ |
|||||||
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package charset |
||||||
|
|
||||||
|
import ( |
||||||
|
"fmt" |
||||||
|
"regexp" |
||||||
|
"sort" |
||||||
|
"strings" |
||||||
|
"unicode" |
||||||
|
"unicode/utf8" |
||||||
|
|
||||||
|
"code.gitea.io/gitea/modules/translation" |
||||||
|
|
||||||
|
"golang.org/x/net/html" |
||||||
|
) |
||||||
|
|
||||||
|
// defaultWordRegexp is VSCode's defaultWordRegexp. It matches either a
// number-like token containing a decimal point or a maximal run of characters
// that are not listed punctuation, whitespace or \x00-\x1f. escapeStreamer.Text
// uses it to split text into words and the gaps between them.
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
||||||
|
|
||||||
|
func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer { |
||||||
|
return &escapeStreamer{ |
||||||
|
escaped: &EscapeStatus{}, |
||||||
|
PassthroughHTMLStreamer: *NewPassthroughStreamer(next), |
||||||
|
locale: locale, |
||||||
|
ambiguousTables: AmbiguousTablesForLocale(locale), |
||||||
|
allowed: allowed, |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// escapeStreamer is the HTMLStreamer created by NewEscapeStreamer. It wraps
// escaped runes in <span> elements and records what it found.
type escapeStreamer struct {
	// PassthroughHTMLStreamer receives all output (StartTag/Text/EndTag).
	PassthroughHTMLStreamer
	// escaped accumulates the findings; exposed through EscapeStatus().
	escaped *EscapeStatus
	// locale supplies the translated tooltip for ambiguous runes.
	locale translation.Locale
	// ambiguousTables is the result of AmbiguousTablesForLocale(locale).
	ambiguousTables []*AmbiguousTable
	// allowed holds the runes passed to NewEscapeStreamer; presumably runes
	// exempt from escaping — its use is not visible in this part of the file.
	allowed []rune
}
||||||
|
|
||||||
|
// EscapeStatus returns the findings accumulated so far. The returned pointer
// is the streamer's own status, not a copy.
func (e *escapeStreamer) EscapeStatus() *EscapeStatus {
	return e.escaped
}
||||||
|
|
||||||
|
// Text tells the next streamer there is a text
|
||||||
|
func (e *escapeStreamer) Text(data string) error { |
||||||
|
sb := &strings.Builder{} |
||||||
|
pos, until, next := 0, 0, 0 |
||||||
|
if len(data) > len(UTF8BOM) && data[:len(UTF8BOM)] == string(UTF8BOM) { |
||||||
|
_, _ = sb.WriteString(data[:len(UTF8BOM)]) |
||||||
|
pos = len(UTF8BOM) |
||||||
|
} |
||||||
|
for pos < len(data) { |
||||||
|
nextIdxs := defaultWordRegexp.FindStringIndex(data[pos:]) |
||||||
|
if nextIdxs == nil { |
||||||
|
until = len(data) |
||||||
|
next = until |
||||||
|
} else { |
||||||
|
until, next = nextIdxs[0]+pos, nextIdxs[1]+pos |
||||||
|
} |
||||||
|
|
||||||
|
// from pos until until we know that the runes are not \r\t\n or even ' '
|
||||||
|
runes := make([]rune, 0, next-until) |
||||||
|
positions := make([]int, 0, next-until+1) |
||||||
|
|
||||||
|
for pos < until { |
||||||
|
r, sz := utf8.DecodeRune([]byte(data)[pos:]) |
||||||
|
positions = positions[:0] |
||||||
|
positions = append(positions, pos, pos+sz) |
||||||
|
types, confusables, _ := e.runeTypes(r) |
||||||
|
if err := e.handleRunes(data, []rune{r}, positions, types, confusables, sb); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
pos += sz |
||||||
|
} |
||||||
|
|
||||||
|
for i := pos; i < next; { |
||||||
|
r, sz := utf8.DecodeRune([]byte(data)[i:]) |
||||||
|
runes = append(runes, r) |
||||||
|
positions = append(positions, i) |
||||||
|
i += sz |
||||||
|
} |
||||||
|
positions = append(positions, next) |
||||||
|
types, confusables, runeCounts := e.runeTypes(runes...) |
||||||
|
if runeCounts.needsEscape() { |
||||||
|
if err := e.handleRunes(data, runes, positions, types, confusables, sb); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
} else { |
||||||
|
_, _ = sb.Write([]byte(data)[pos:next]) |
||||||
|
} |
||||||
|
pos = next |
||||||
|
} |
||||||
|
if sb.Len() > 0 { |
||||||
|
if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
} |
||||||
|
return nil |
||||||
|
} |
||||||
|
|
||||||
|
func (e *escapeStreamer) handleRunes(data string, runes []rune, positions []int, types []runeType, confusables []rune, sb *strings.Builder) error { |
||||||
|
for i, r := range runes { |
||||||
|
switch types[i] { |
||||||
|
case brokenRuneType: |
||||||
|
if sb.Len() > 0 { |
||||||
|
if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
sb.Reset() |
||||||
|
} |
||||||
|
end := positions[i+1] |
||||||
|
start := positions[i] |
||||||
|
if err := e.brokenRune([]byte(data)[start:end]); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
case ambiguousRuneType: |
||||||
|
if sb.Len() > 0 { |
||||||
|
if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
sb.Reset() |
||||||
|
} |
||||||
|
if err := e.ambiguousRune(r, confusables[0]); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
confusables = confusables[1:] |
||||||
|
case invisibleRuneType: |
||||||
|
if sb.Len() > 0 { |
||||||
|
if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
sb.Reset() |
||||||
|
} |
||||||
|
if err := e.invisibleRune(r); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
default: |
||||||
|
_, _ = sb.WriteRune(r) |
||||||
|
} |
||||||
|
} |
||||||
|
return nil |
||||||
|
} |
||||||
|
|
||||||
|
func (e *escapeStreamer) brokenRune(bs []byte) error { |
||||||
|
e.escaped.Escaped = true |
||||||
|
e.escaped.HasBadRunes = true |
||||||
|
|
||||||
|
if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ |
||||||
|
Key: "class", |
||||||
|
Val: "broken-code-point", |
||||||
|
}); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
if err := e.PassthroughHTMLStreamer.Text(fmt.Sprintf("<%X>", bs)); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
|
||||||
|
return e.PassthroughHTMLStreamer.EndTag("span") |
||||||
|
} |
||||||
|
|
||||||
|
func (e *escapeStreamer) ambiguousRune(r, c rune) error { |
||||||
|
e.escaped.Escaped = true |
||||||
|
e.escaped.HasAmbiguous = true |
||||||
|
|
||||||
|
if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ |
||||||
|
Key: "class", |
||||||
|
Val: "ambiguous-code-point tooltip", |
||||||
|
}, html.Attribute{ |
||||||
|
Key: "data-content", |
||||||
|
Val: e.locale.Tr("repo.ambiguous_character", r, c), |
||||||
|
}); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ |
||||||
|
Key: "class", |
||||||
|
Val: "char", |
||||||
|
}); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
|
||||||
|
return e.PassthroughHTMLStreamer.EndTag("span") |
||||||
|
} |
||||||
|
|
||||||
|
func (e *escapeStreamer) invisibleRune(r rune) error { |
||||||
|
e.escaped.Escaped = true |
||||||
|
e.escaped.HasInvisible = true |
||||||
|
|
||||||
|
if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ |
||||||
|
Key: "class", |
||||||
|
Val: "escaped-code-point", |
||||||
|
}, html.Attribute{ |
||||||
|
Key: "data-escaped", |
||||||
|
Val: fmt.Sprintf("[U+%04X]", r), |
||||||
|
}); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ |
||||||
|
Key: "class", |
||||||
|
Val: "char", |
||||||
|
}); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
|
||||||
|
return e.PassthroughHTMLStreamer.EndTag("span") |
||||||
|
} |
||||||
|
|
||||||
|
// runeCountType tallies how many runes of each category were found in a piece
// of text. needsEscape uses these totals to decide whether escaping is needed.
type runeCountType struct {
	// numBasicRunes counts space, tab, newline and runes in the 0x20-0x7e range.
	numBasicRunes int
	// numNonConfusingNonBasicRunes counts runes outside 0x20-0x7e that are
	// neither invisible nor ambiguous (or that were explicitly allowed).
	numNonConfusingNonBasicRunes int
	// numAmbiguousRunes counts runes found in the ambiguous tables.
	numAmbiguousRunes int
	// numInvisibleRunes counts invisible and control runes.
	numInvisibleRunes int
	// numBrokenRunes counts runes equal to utf8.RuneError.
	numBrokenRunes int
}
||||||
|
|
||||||
|
func (counts runeCountType) needsEscape() bool { |
||||||
|
if counts.numBrokenRunes > 0 { |
||||||
|
return true |
||||||
|
} |
||||||
|
if counts.numBasicRunes == 0 && |
||||||
|
counts.numNonConfusingNonBasicRunes > 0 { |
||||||
|
return false |
||||||
|
} |
||||||
|
return counts.numAmbiguousRunes > 0 || counts.numInvisibleRunes > 0 |
||||||
|
} |
||||||
|
|
||||||
|
// runeType is the category assigned to a single rune by runeTypes.
type runeType int

const (
	// basicASCIIRuneType is the zero value; runeTypes leaves it in place for
	// space, tab, newline and runes in the 0x20-0x7e range.
	basicASCIIRuneType runeType = iota //nolint // <- This is technically deadcode but its self-documenting so it should stay
	// brokenRuneType marks a rune equal to utf8.RuneError.
	brokenRuneType
	// nonBasicASCIIRuneType marks a rune outside 0x20-0x7e that is neither
	// invisible nor ambiguous.
	nonBasicASCIIRuneType
	// ambiguousRuneType marks a rune found in the ambiguous tables.
	ambiguousRuneType
	// invisibleRuneType marks an invisible or control rune.
	invisibleRuneType
)
||||||
|
|
||||||
|
func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables []rune, runeCounts runeCountType) { |
||||||
|
types = make([]runeType, len(runes)) |
||||||
|
for i, r := range runes { |
||||||
|
var confusable rune |
||||||
|
switch { |
||||||
|
case r == utf8.RuneError: |
||||||
|
types[i] = brokenRuneType |
||||||
|
runeCounts.numBrokenRunes++ |
||||||
|
case r == ' ' || r == '\t' || r == '\n': |
||||||
|
runeCounts.numBasicRunes++ |
||||||
|
case e.isAllowed(r): |
||||||
|
if r > 0x7e || r < 0x20 { |
||||||
|
types[i] = nonBasicASCIIRuneType |
||||||
|
runeCounts.numNonConfusingNonBasicRunes++ |
||||||
|
} else { |
||||||
|
runeCounts.numBasicRunes++ |
||||||
|
} |
||||||
|
case unicode.Is(InvisibleRanges, r): |
||||||
|
types[i] = invisibleRuneType |
||||||
|
runeCounts.numInvisibleRunes++ |
||||||
|
case unicode.IsControl(r): |
||||||
|
types[i] = invisibleRuneType |
||||||
|
runeCounts.numInvisibleRunes++ |
||||||
|
case isAmbiguous(r, &confusable, e.ambiguousTables...): |
||||||
|
confusables = append(confusables, confusable) |
||||||
|
types[i] = ambiguousRuneType |
||||||
|
runeCounts.numAmbiguousRunes++ |
||||||
|
case r > 0x7e || r < 0x20: |
||||||
|
types[i] = nonBasicASCIIRuneType |
||||||
|
runeCounts.numNonConfusingNonBasicRunes++ |
||||||
|
default: |
||||||
|
runeCounts.numBasicRunes++ |
||||||
|
} |
||||||
|
} |
||||||
|
return types, confusables, runeCounts |
||||||
|
} |
||||||
|
|
||||||
|
func (e *escapeStreamer) isAllowed(r rune) bool { |
||||||
|
if len(e.allowed) == 0 { |
||||||
|
return false |
||||||
|
} |
||||||
|
if len(e.allowed) == 1 { |
||||||
|
return e.allowed[0] == r |
||||||
|
} |
||||||
|
|
||||||
|
return sort.Search(len(e.allowed), func(i int) bool { |
||||||
|
return e.allowed[i] >= r |
||||||
|
}) >= 0 |
||||||
|
} |
@ -0,0 +1,201 @@ |
|||||||
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package charset |
||||||
|
|
||||||
|
import ( |
||||||
|
"fmt" |
||||||
|
"io" |
||||||
|
|
||||||
|
"golang.org/x/net/html" |
||||||
|
) |
||||||
|
|
||||||
|
// HTMLStreamer represents a SAX-like interface for HTML
|
||||||
|
type HTMLStreamer interface { |
||||||
|
Error(err error) error |
||||||
|
Doctype(data string) error |
||||||
|
Comment(data string) error |
||||||
|
StartTag(data string, attrs ...html.Attribute) error |
||||||
|
SelfClosingTag(data string, attrs ...html.Attribute) error |
||||||
|
EndTag(data string) error |
||||||
|
Text(data string) error |
||||||
|
} |
||||||
|
|
||||||
|
// PassthroughHTMLStreamer is a passthrough streamer
|
||||||
|
type PassthroughHTMLStreamer struct { |
||||||
|
next HTMLStreamer |
||||||
|
} |
||||||
|
|
||||||
|
func NewPassthroughStreamer(next HTMLStreamer) *PassthroughHTMLStreamer { |
||||||
|
return &PassthroughHTMLStreamer{next: next} |
||||||
|
} |
||||||
|
|
||||||
|
var _ (HTMLStreamer) = &PassthroughHTMLStreamer{} |
||||||
|
|
||||||
|
// Error tells the next streamer in line that there is an error
|
||||||
|
func (p *PassthroughHTMLStreamer) Error(err error) error { |
||||||
|
return p.next.Error(err) |
||||||
|
} |
||||||
|
|
||||||
|
// Doctype tells the next streamer what the doctype is
|
||||||
|
func (p *PassthroughHTMLStreamer) Doctype(data string) error { |
||||||
|
return p.next.Doctype(data) |
||||||
|
} |
||||||
|
|
||||||
|
// Comment tells the next streamer there is a comment
|
||||||
|
func (p *PassthroughHTMLStreamer) Comment(data string) error { |
||||||
|
return p.next.Comment(data) |
||||||
|
} |
||||||
|
|
||||||
|
// StartTag tells the next streamer there is a starting tag
|
||||||
|
func (p *PassthroughHTMLStreamer) StartTag(data string, attrs ...html.Attribute) error { |
||||||
|
return p.next.StartTag(data, attrs...) |
||||||
|
} |
||||||
|
|
||||||
|
// SelfClosingTag tells the next streamer there is a self-closing tag
|
||||||
|
func (p *PassthroughHTMLStreamer) SelfClosingTag(data string, attrs ...html.Attribute) error { |
||||||
|
return p.next.SelfClosingTag(data, attrs...) |
||||||
|
} |
||||||
|
|
||||||
|
// EndTag tells the next streamer there is a end tag
|
||||||
|
func (p *PassthroughHTMLStreamer) EndTag(data string) error { |
||||||
|
return p.next.EndTag(data) |
||||||
|
} |
||||||
|
|
||||||
|
// Text tells the next streamer there is a text
|
||||||
|
func (p *PassthroughHTMLStreamer) Text(data string) error { |
||||||
|
return p.next.Text(data) |
||||||
|
} |
||||||
|
|
||||||
|
// HTMLStreamWriter acts as a writing sink
|
||||||
|
type HTMLStreamerWriter struct { |
||||||
|
io.Writer |
||||||
|
err error |
||||||
|
} |
||||||
|
|
||||||
|
// Write implements io.Writer
|
||||||
|
func (h *HTMLStreamerWriter) Write(data []byte) (int, error) { |
||||||
|
if h.err != nil { |
||||||
|
return 0, h.err |
||||||
|
} |
||||||
|
return h.Writer.Write(data) |
||||||
|
} |
||||||
|
|
||||||
|
// Write implements io.StringWriter
|
||||||
|
func (h *HTMLStreamerWriter) WriteString(data string) (int, error) { |
||||||
|
if h.err != nil { |
||||||
|
return 0, h.err |
||||||
|
} |
||||||
|
return h.Writer.Write([]byte(data)) |
||||||
|
} |
||||||
|
|
||||||
|
// Error tells the next streamer in line that there is an error
|
||||||
|
func (h *HTMLStreamerWriter) Error(err error) error { |
||||||
|
if h.err == nil { |
||||||
|
h.err = err |
||||||
|
} |
||||||
|
return h.err |
||||||
|
} |
||||||
|
|
||||||
|
// Doctype tells the next streamer what the doctype is
|
||||||
|
func (h *HTMLStreamerWriter) Doctype(data string) error { |
||||||
|
_, h.err = h.WriteString("<!DOCTYPE " + data + ">") |
||||||
|
return h.err |
||||||
|
} |
||||||
|
|
||||||
|
// Comment tells the next streamer there is a comment
|
||||||
|
func (h *HTMLStreamerWriter) Comment(data string) error { |
||||||
|
_, h.err = h.WriteString("<!--" + data + "-->") |
||||||
|
return h.err |
||||||
|
} |
||||||
|
|
||||||
|
// StartTag tells the next streamer there is a starting tag
|
||||||
|
func (h *HTMLStreamerWriter) StartTag(data string, attrs ...html.Attribute) error { |
||||||
|
return h.startTag(data, attrs, false) |
||||||
|
} |
||||||
|
|
||||||
|
// SelfClosingTag tells the next streamer there is a self-closing tag
|
||||||
|
func (h *HTMLStreamerWriter) SelfClosingTag(data string, attrs ...html.Attribute) error { |
||||||
|
return h.startTag(data, attrs, true) |
||||||
|
} |
||||||
|
|
||||||
|
func (h *HTMLStreamerWriter) startTag(data string, attrs []html.Attribute, selfclosing bool) error { |
||||||
|
if _, h.err = h.WriteString("<" + data); h.err != nil { |
||||||
|
return h.err |
||||||
|
} |
||||||
|
for _, attr := range attrs { |
||||||
|
if _, h.err = h.WriteString(" " + attr.Key + "=\"" + html.EscapeString(attr.Val) + "\""); h.err != nil { |
||||||
|
return h.err |
||||||
|
} |
||||||
|
} |
||||||
|
if selfclosing { |
||||||
|
if _, h.err = h.WriteString("/>"); h.err != nil { |
||||||
|
return h.err |
||||||
|
} |
||||||
|
} else { |
||||||
|
if _, h.err = h.WriteString(">"); h.err != nil { |
||||||
|
return h.err |
||||||
|
} |
||||||
|
} |
||||||
|
return h.err |
||||||
|
} |
||||||
|
|
||||||
|
// EndTag tells the next streamer there is a end tag
|
||||||
|
func (h *HTMLStreamerWriter) EndTag(data string) error { |
||||||
|
_, h.err = h.WriteString("</" + data + ">") |
||||||
|
return h.err |
||||||
|
} |
||||||
|
|
||||||
|
// Text tells the next streamer there is a text
|
||||||
|
func (h *HTMLStreamerWriter) Text(data string) error { |
||||||
|
_, h.err = h.WriteString(html.EscapeString(data)) |
||||||
|
return h.err |
||||||
|
} |
||||||
|
|
||||||
|
// StreamHTML streams an html to a provided streamer
|
||||||
|
func StreamHTML(source io.Reader, streamer HTMLStreamer) error { |
||||||
|
tokenizer := html.NewTokenizer(source) |
||||||
|
for { |
||||||
|
tt := tokenizer.Next() |
||||||
|
switch tt { |
||||||
|
case html.ErrorToken: |
||||||
|
if tokenizer.Err() != io.EOF { |
||||||
|
return tokenizer.Err() |
||||||
|
} |
||||||
|
return nil |
||||||
|
case html.DoctypeToken: |
||||||
|
token := tokenizer.Token() |
||||||
|
if err := streamer.Doctype(token.Data); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
case html.CommentToken: |
||||||
|
token := tokenizer.Token() |
||||||
|
if err := streamer.Comment(token.Data); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
case html.StartTagToken: |
||||||
|
token := tokenizer.Token() |
||||||
|
if err := streamer.StartTag(token.Data, token.Attr...); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
case html.SelfClosingTagToken: |
||||||
|
token := tokenizer.Token() |
||||||
|
if err := streamer.StartTag(token.Data, token.Attr...); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
case html.EndTagToken: |
||||||
|
token := tokenizer.Token() |
||||||
|
if err := streamer.EndTag(token.Data); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
case html.TextToken: |
||||||
|
token := tokenizer.Token() |
||||||
|
if err := streamer.Text(token.Data); err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
default: |
||||||
|
return fmt.Errorf("unknown type of token: %d", tt) |
||||||
|
} |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,111 @@ |
|||||||
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package main |
||||||
|
|
||||||
|
import ( |
||||||
|
"bytes" |
||||||
|
"flag" |
||||||
|
"fmt" |
||||||
|
"go/format" |
||||||
|
"os" |
||||||
|
"text/template" |
||||||
|
|
||||||
|
"golang.org/x/text/unicode/rangetable" |
||||||
|
) |
||||||
|
|
||||||
|
// InvisibleRunes lists the runes that VS Code treats as invisible.
// See https://github.com/hediet/vscode-unicode-data
|
||||||
|
var InvisibleRunes = []rune{ |
||||||
|
9, 10, 11, 12, 13, 32, 127, 160, 173, 847, 1564, 4447, 4448, 6068, 6069, 6155, 6156, 6157, 6158, 7355, 7356, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202, 8203, 8204, 8205, 8206, 8207, 8234, 8235, 8236, 8237, 8238, 8239, 8287, 8288, 8289, 8290, 8291, 8292, 8293, 8294, 8295, 8296, 8297, 8298, 8299, 8300, 8301, 8302, 8303, 10240, 12288, 12644, 65024, 65025, 65026, 65027, 65028, 65029, 65030, 65031, 65032, 65033, 65034, 65035, 65036, 65037, 65038, 65039, 65279, 65440, 65520, 65521, 65522, 65523, 65524, 65525, 65526, 65527, 65528, 65532, 78844, 119155, 119156, 119157, 119158, 119159, 119160, 119161, 119162, 917504, 917505, 917506, 917507, 917508, 917509, 917510, 917511, 917512, 917513, 917514, 917515, 917516, 917517, 917518, 917519, 917520, 917521, 917522, 917523, 917524, 917525, 917526, 917527, 917528, 917529, 917530, 917531, 917532, 917533, 917534, 917535, 917536, 917537, 917538, 917539, 917540, 917541, 917542, 917543, 917544, 917545, 917546, 917547, 917548, 917549, 917550, 917551, 917552, 917553, 917554, 917555, 917556, 917557, 917558, 917559, 917560, 917561, 917562, 917563, 917564, 917565, 917566, 917567, 917568, 917569, 917570, 917571, 917572, 917573, 917574, 917575, 917576, 917577, 917578, 917579, 917580, 917581, 917582, 917583, 917584, 917585, 917586, 917587, 917588, 917589, 917590, 917591, 917592, 917593, 917594, 917595, 917596, 917597, 917598, 917599, 917600, 917601, 917602, 917603, 917604, 917605, 917606, 917607, 917608, 917609, 917610, 917611, 917612, 917613, 917614, 917615, 917616, 917617, 917618, 917619, 917620, 917621, 917622, 917623, 917624, 917625, 917626, 917627, 917628, 917629, 917630, 917631, 917760, 917761, 917762, 917763, 917764, 917765, 917766, 917767, 917768, 917769, 917770, 917771, 917772, 917773, 917774, 917775, 917776, 917777, 917778, 917779, 917780, 917781, 917782, 917783, 917784, 917785, 917786, 917787, 917788, 917789, 917790, 917791, 917792, 917793, 917794, 917795, 917796, 917797, 917798, 917799, 917800, 917801, 917802, 
917803, 917804, 917805, 917806, 917807, 917808, 917809, 917810, 917811, 917812, 917813, 917814, 917815, 917816, 917817, 917818, 917819, 917820, 917821, 917822, 917823, 917824, 917825, 917826, 917827, 917828, 917829, 917830, 917831, 917832, 917833, 917834, 917835, 917836, 917837, 917838, 917839, 917840, 917841, 917842, 917843, 917844, 917845, 917846, 917847, 917848, 917849, 917850, 917851, 917852, 917853, 917854, 917855, 917856, 917857, 917858, 917859, 917860, 917861, 917862, 917863, 917864, 917865, 917866, 917867, 917868, 917869, 917870, 917871, 917872, 917873, 917874, 917875, 917876, 917877, 917878, 917879, 917880, 917881, 917882, 917883, 917884, 917885, 917886, 917887, 917888, 917889, 917890, 917891, 917892, 917893, 917894, 917895, 917896, 917897, 917898, 917899, 917900, 917901, 917902, 917903, 917904, 917905, 917906, 917907, 917908, 917909, 917910, 917911, 917912, 917913, 917914, 917915, 917916, 917917, 917918, 917919, 917920, 917921, 917922, 917923, 917924, 917925, 917926, 917927, 917928, 917929, 917930, 917931, 917932, 917933, 917934, 917935, 917936, 917937, 917938, 917939, 917940, 917941, 917942, 917943, 917944, 917945, 917946, 917947, 917948, 917949, 917950, 917951, 917952, 917953, 917954, 917955, 917956, 917957, 917958, 917959, 917960, 917961, 917962, 917963, 917964, 917965, 917966, 917967, 917968, 917969, 917970, 917971, 917972, 917973, 917974, 917975, 917976, 917977, 917978, 917979, 917980, 917981, 917982, 917983, 917984, 917985, 917986, 917987, 917988, 917989, 917990, 917991, 917992, 917993, 917994, 917995, 917996, 917997, 917998, 917999, |
||||||
|
} |
||||||
|
|
||||||
|
var verbose bool |
||||||
|
|
||||||
|
func main() { |
||||||
|
flag.Usage = func() { |
||||||
|
fmt.Fprintf(os.Stderr, `%s: Generate InvisibleRunesRange |
||||||
|
|
||||||
|
Usage: %[1]s [-v] [-o output.go] |
||||||
|
`, os.Args[0]) |
||||||
|
flag.PrintDefaults() |
||||||
|
} |
||||||
|
|
||||||
|
output := "" |
||||||
|
flag.BoolVar(&verbose, "v", false, "verbose output") |
||||||
|
flag.StringVar(&output, "o", "invisible_gen.go", "file to output to") |
||||||
|
flag.Parse() |
||||||
|
|
||||||
|
// First we filter the runes to remove
|
||||||
|
// <space><tab><newline>
|
||||||
|
filtered := make([]rune, 0, len(InvisibleRunes)) |
||||||
|
for _, r := range InvisibleRunes { |
||||||
|
if r == ' ' || r == '\t' || r == '\n' { |
||||||
|
continue |
||||||
|
} |
||||||
|
filtered = append(filtered, r) |
||||||
|
} |
||||||
|
|
||||||
|
table := rangetable.New(filtered...) |
||||||
|
if err := runTemplate(generatorTemplate, output, table); err != nil { |
||||||
|
fatalf("Unable to run template: %v", err) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func runTemplate(t *template.Template, filename string, data interface{}) error { |
||||||
|
buf := bytes.NewBuffer(nil) |
||||||
|
if err := t.Execute(buf, data); err != nil { |
||||||
|
return fmt.Errorf("unable to execute template: %w", err) |
||||||
|
} |
||||||
|
bs, err := format.Source(buf.Bytes()) |
||||||
|
if err != nil { |
||||||
|
verbosef("Bad source:\n%s", buf.String()) |
||||||
|
return fmt.Errorf("unable to format source: %w", err) |
||||||
|
} |
||||||
|
file, err := os.Create(filename) |
||||||
|
if err != nil { |
||||||
|
return fmt.Errorf("failed to create file %s because %w", filename, err) |
||||||
|
} |
||||||
|
defer file.Close() |
||||||
|
_, err = file.Write(bs) |
||||||
|
if err != nil { |
||||||
|
return fmt.Errorf("unable to write generated source: %w", err) |
||||||
|
} |
||||||
|
return nil |
||||||
|
} |
||||||
|
|
||||||
|
var generatorTemplate = template.Must(template.New("invisibleTemplate").Parse(`// This file is generated by modules/charset/invisible/generate.go DO NOT EDIT
|
||||||
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package charset |
||||||
|
|
||||||
|
import "unicode" |
||||||
|
|
||||||
|
var InvisibleRanges = &unicode.RangeTable{ |
||||||
|
R16: []unicode.Range16{ |
||||||
|
{{range .R16 }} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}}, |
||||||
|
{{end}} }, |
||||||
|
R32: []unicode.Range32{ |
||||||
|
{{range .R32}} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}}, |
||||||
|
{{end}} }, |
||||||
|
LatinOffset: {{.LatinOffset}}, |
||||||
|
} |
||||||
|
`)) |
||||||
|
|
||||||
|
// logf writes the formatted message followed by a newline to standard error.
func logf(format string, args ...interface{}) {
	line := format + "\n"
	fmt.Fprintf(os.Stderr, line, args...)
}
||||||
|
|
||||||
|
func verbosef(format string, args ...interface{}) { |
||||||
|
if verbose { |
||||||
|
logf(format, args...) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func fatalf(format string, args ...interface{}) { |
||||||
|
logf("fatal: "+format+"\n", args...) |
||||||
|
os.Exit(1) |
||||||
|
} |
@ -0,0 +1,37 @@ |
|||||||
|
// This file is generated by modules/charset/invisible/generate.go DO NOT EDIT
|
||||||
|
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package charset |
||||||
|
|
||||||
|
import "unicode" |
||||||
|
|
||||||
|
var InvisibleRanges = &unicode.RangeTable{ |
||||||
|
R16: []unicode.Range16{ |
||||||
|
{Lo: 11, Hi: 13, Stride: 1}, |
||||||
|
{Lo: 127, Hi: 160, Stride: 33}, |
||||||
|
{Lo: 173, Hi: 847, Stride: 674}, |
||||||
|
{Lo: 1564, Hi: 4447, Stride: 2883}, |
||||||
|
{Lo: 4448, Hi: 6068, Stride: 1620}, |
||||||
|
{Lo: 6069, Hi: 6155, Stride: 86}, |
||||||
|
{Lo: 6156, Hi: 6158, Stride: 1}, |
||||||
|
{Lo: 7355, Hi: 7356, Stride: 1}, |
||||||
|
{Lo: 8192, Hi: 8207, Stride: 1}, |
||||||
|
{Lo: 8234, Hi: 8239, Stride: 1}, |
||||||
|
{Lo: 8287, Hi: 8303, Stride: 1}, |
||||||
|
{Lo: 10240, Hi: 12288, Stride: 2048}, |
||||||
|
{Lo: 12644, Hi: 65024, Stride: 52380}, |
||||||
|
{Lo: 65025, Hi: 65039, Stride: 1}, |
||||||
|
{Lo: 65279, Hi: 65440, Stride: 161}, |
||||||
|
{Lo: 65520, Hi: 65528, Stride: 1}, |
||||||
|
{Lo: 65532, Hi: 65532, Stride: 1}, |
||||||
|
}, |
||||||
|
R32: []unicode.Range32{ |
||||||
|
{Lo: 78844, Hi: 119155, Stride: 40311}, |
||||||
|
{Lo: 119156, Hi: 119162, Stride: 1}, |
||||||
|
{Lo: 917504, Hi: 917631, Stride: 1}, |
||||||
|
{Lo: 917760, Hi: 917999, Stride: 1}, |
||||||
|
}, |
||||||
|
LatinOffset: 2, |
||||||
|
} |
@ -0,0 +1,2 @@ |
|||||||
|
{{if .diff.EscapeStatus.HasInvisible}}{{.locale.Tr "repo.invisible_runes_line"}} {{end}}{{/* |
||||||
|
*/}}{{if .diff.EscapeStatus.HasAmbiguous}}{{.locale.Tr "repo.ambiguous_runes_line"}}{{end}} |
@ -0,0 +1,6 @@ |
|||||||
|
<code {{if .diff.EscapeStatus.Escaped}}{{/* |
||||||
|
*/}}class="code-inner has-escaped" {{/* |
||||||
|
*/}}title="{{template "repo/diff/escape_title" .}}"{{/* |
||||||
|
*/}}{{else}}{{/* |
||||||
|
*/}}class="code-inner"{{/* |
||||||
|
*/}}{{end}}>{{.diff.Content}}</code> |
@ -1,19 +1,22 @@ |
|||||||
{{if .EscapeStatus}} |
{{if .EscapeStatus}} |
||||||
{{if .EscapeStatus.BadBIDI}} |
{{if .EscapeStatus.HasInvisible}} |
||||||
<div class="ui error message unicode-escape-prompt tl"> |
<div class="ui error message unicode-escape-prompt tl"> |
||||||
<span class="close icon hide-panel button" data-panel-closest=".message">{{svg "octicon-x" 16 "close inside"}}</span> |
<span class="close icon hide-panel button" data-panel-closest=".message">{{svg "octicon-x" 16 "close inside"}}</span> |
||||||
<div class="header"> |
<div class="header"> |
||||||
{{$.root.locale.Tr "repo.bidi_bad_header"}} |
{{$.root.locale.Tr "repo.invisible_runes_header"}} |
||||||
</div> |
</div> |
||||||
<p>{{$.root.locale.Tr "repo.bidi_bad_description" | Str2html}}</p> |
<p>{{$.root.locale.Tr "repo.invisible_runes_description" | Str2html}}</p> |
||||||
|
{{if .EscapeStatus.HasAmbiguous}} |
||||||
|
<p>{{$.root.locale.Tr "repo.ambiguous_runes_description" | Str2html}}</p> |
||||||
|
{{end}} |
||||||
</div> |
</div> |
||||||
{{else if .EscapeStatus.HasBIDI}} |
{{else if .EscapeStatus.HasAmbiguous}} |
||||||
<div class="ui warning message unicode-escape-prompt tl"> |
<div class="ui warning message unicode-escape-prompt tl"> |
||||||
<span class="close icon hide-panel button" data-panel-closest=".message">{{svg "octicon-x" 16 "close inside"}}</span> |
<span class="close icon hide-panel button" data-panel-closest=".message">{{svg "octicon-x" 16 "close inside"}}</span> |
||||||
<div class="header"> |
<div class="header"> |
||||||
{{$.root.locale.Tr "repo.unicode_header"}} |
{{$.root.locale.Tr "repo.ambiguous_runes_header"}} |
||||||
</div> |
</div> |
||||||
<p>{{$.root.locale.Tr "repo.unicode_description" | Str2html}}</p> |
<p>{{$.root.locale.Tr "repo.ambiguous_runes_description" | Str2html}}</p> |
||||||
</div> |
</div> |
||||||
{{end}} |
{{end}} |
||||||
{{end}} |
{{end}} |
||||||
|
Loading…
Reference in new issue