Update to last common x/text (#3994)
parent
000b2d33a9
commit
edc78b9633
@ -1,524 +0,0 @@ |
|||||||
// Copyright 2013 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
import ( |
|
||||||
"bufio" |
|
||||||
"fmt" |
|
||||||
"log" |
|
||||||
"net/http" |
|
||||||
"sort" |
|
||||||
"strings" |
|
||||||
"unicode/utf8" |
|
||||||
|
|
||||||
"golang.org/x/text/encoding" |
|
||||||
"golang.org/x/text/internal/gen" |
|
||||||
) |
|
||||||
|
|
||||||
// ascii holds the 128 ASCII characters in code-point order, so that the byte
// at index i has value i. It forms the lower half of every ASCII-compatible
// mapping built by this generator.
const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + // 0x00-0x0f
	"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + // 0x10-0x1f
	" !\"#$%&'()*+,-./0123456789:;<=>?" + // 0x20-0x3f
	"@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_" + // 0x40-0x5f
	"`abcdefghijklmnopqrstuvwxyz{|}~\x7f" // 0x60-0x7f
|
||||||
|
|
||||||
var encodings = []struct { |
|
||||||
name string |
|
||||||
mib string |
|
||||||
comment string |
|
||||||
varName string |
|
||||||
replacement byte |
|
||||||
mapping string |
|
||||||
}{ |
|
||||||
{ |
|
||||||
"IBM Code Page 437", |
|
||||||
"PC8CodePage437", |
|
||||||
"", |
|
||||||
"CodePage437", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"IBM Code Page 850", |
|
||||||
"PC850Multilingual", |
|
||||||
"", |
|
||||||
"CodePage850", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"IBM Code Page 852", |
|
||||||
"PCp852", |
|
||||||
"", |
|
||||||
"CodePage852", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"IBM Code Page 855", |
|
||||||
"IBM855", |
|
||||||
"", |
|
||||||
"CodePage855", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows Code Page 858", // PC latin1 with Euro
|
|
||||||
"IBM00858", |
|
||||||
"", |
|
||||||
"CodePage858", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"IBM Code Page 860", |
|
||||||
"IBM860", |
|
||||||
"", |
|
||||||
"CodePage860", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"IBM Code Page 862", |
|
||||||
"PC862LatinHebrew", |
|
||||||
"", |
|
||||||
"CodePage862", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"IBM Code Page 863", |
|
||||||
"IBM863", |
|
||||||
"", |
|
||||||
"CodePage863", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"IBM Code Page 865", |
|
||||||
"IBM865", |
|
||||||
"", |
|
||||||
"CodePage865", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"IBM Code Page 866", |
|
||||||
"IBM866", |
|
||||||
"", |
|
||||||
"CodePage866", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-ibm866.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-1", |
|
||||||
"ISOLatin1", |
|
||||||
"", |
|
||||||
"ISO8859_1", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-2", |
|
||||||
"ISOLatin2", |
|
||||||
"", |
|
||||||
"ISO8859_2", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-2.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-3", |
|
||||||
"ISOLatin3", |
|
||||||
"", |
|
||||||
"ISO8859_3", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-3.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-4", |
|
||||||
"ISOLatin4", |
|
||||||
"", |
|
||||||
"ISO8859_4", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-4.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-5", |
|
||||||
"ISOLatinCyrillic", |
|
||||||
"", |
|
||||||
"ISO8859_5", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-5.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-6", |
|
||||||
"ISOLatinArabic", |
|
||||||
"", |
|
||||||
"ISO8859_6,ISO8859_6E,ISO8859_6I", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-6.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-7", |
|
||||||
"ISOLatinGreek", |
|
||||||
"", |
|
||||||
"ISO8859_7", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-7.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-8", |
|
||||||
"ISOLatinHebrew", |
|
||||||
"", |
|
||||||
"ISO8859_8,ISO8859_8E,ISO8859_8I", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-8.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-10", |
|
||||||
"ISOLatin6", |
|
||||||
"", |
|
||||||
"ISO8859_10", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-10.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-13", |
|
||||||
"ISO885913", |
|
||||||
"", |
|
||||||
"ISO8859_13", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-13.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-14", |
|
||||||
"ISO885914", |
|
||||||
"", |
|
||||||
"ISO8859_14", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-14.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-15", |
|
||||||
"ISO885915", |
|
||||||
"", |
|
||||||
"ISO8859_15", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-15.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"ISO 8859-16", |
|
||||||
"ISO885916", |
|
||||||
"", |
|
||||||
"ISO8859_16", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-iso-8859-16.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"KOI8-R", |
|
||||||
"KOI8R", |
|
||||||
"", |
|
||||||
"KOI8R", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-koi8-r.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"KOI8-U", |
|
||||||
"KOI8U", |
|
||||||
"", |
|
||||||
"KOI8U", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-koi8-u.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Macintosh", |
|
||||||
"Macintosh", |
|
||||||
"", |
|
||||||
"Macintosh", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-macintosh.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Macintosh Cyrillic", |
|
||||||
"MacintoshCyrillic", |
|
||||||
"", |
|
||||||
"MacintoshCyrillic", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows 874", |
|
||||||
"Windows874", |
|
||||||
"", |
|
||||||
"Windows874", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-windows-874.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows 1250", |
|
||||||
"Windows1250", |
|
||||||
"", |
|
||||||
"Windows1250", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-windows-1250.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows 1251", |
|
||||||
"Windows1251", |
|
||||||
"", |
|
||||||
"Windows1251", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-windows-1251.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows 1252", |
|
||||||
"Windows1252", |
|
||||||
"", |
|
||||||
"Windows1252", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-windows-1252.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows 1253", |
|
||||||
"Windows1253", |
|
||||||
"", |
|
||||||
"Windows1253", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-windows-1253.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows 1254", |
|
||||||
"Windows1254", |
|
||||||
"", |
|
||||||
"Windows1254", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-windows-1254.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows 1255", |
|
||||||
"Windows1255", |
|
||||||
"", |
|
||||||
"Windows1255", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-windows-1255.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows 1256", |
|
||||||
"Windows1256", |
|
||||||
"", |
|
||||||
"Windows1256", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-windows-1256.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows 1257", |
|
||||||
"Windows1257", |
|
||||||
"", |
|
||||||
"Windows1257", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-windows-1257.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"Windows 1258", |
|
||||||
"Windows1258", |
|
||||||
"", |
|
||||||
"Windows1258", |
|
||||||
encoding.ASCIISub, |
|
||||||
"http://encoding.spec.whatwg.org/index-windows-1258.txt", |
|
||||||
}, |
|
||||||
{ |
|
||||||
"X-User-Defined", |
|
||||||
"XUserDefined", |
|
||||||
"It is defined at http://encoding.spec.whatwg.org/#x-user-defined", |
|
||||||
"XUserDefined", |
|
||||||
encoding.ASCIISub, |
|
||||||
ascii + |
|
||||||
"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" + |
|
||||||
"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" + |
|
||||||
"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" + |
|
||||||
"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" + |
|
||||||
"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" + |
|
||||||
"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" + |
|
||||||
"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" + |
|
||||||
"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" + |
|
||||||
"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" + |
|
||||||
"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" + |
|
||||||
"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" + |
|
||||||
"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" + |
|
||||||
"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" + |
|
||||||
"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" + |
|
||||||
"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" + |
|
||||||
"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff", |
|
||||||
}, |
|
||||||
} |
|
||||||
|
|
||||||
func getWHATWG(url string) string { |
|
||||||
res, err := http.Get(url) |
|
||||||
if err != nil { |
|
||||||
log.Fatalf("%q: Get: %v", url, err) |
|
||||||
} |
|
||||||
defer res.Body.Close() |
|
||||||
|
|
||||||
mapping := make([]rune, 128) |
|
||||||
for i := range mapping { |
|
||||||
mapping[i] = '\ufffd' |
|
||||||
} |
|
||||||
|
|
||||||
scanner := bufio.NewScanner(res.Body) |
|
||||||
for scanner.Scan() { |
|
||||||
s := strings.TrimSpace(scanner.Text()) |
|
||||||
if s == "" || s[0] == '#' { |
|
||||||
continue |
|
||||||
} |
|
||||||
x, y := 0, 0 |
|
||||||
if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil { |
|
||||||
log.Fatalf("could not parse %q", s) |
|
||||||
} |
|
||||||
if x < 0 || 128 <= x { |
|
||||||
log.Fatalf("code %d is out of range", x) |
|
||||||
} |
|
||||||
if 0x80 <= y && y < 0xa0 { |
|
||||||
// We diverge from the WHATWG spec by mapping control characters
|
|
||||||
// in the range [0x80, 0xa0) to U+FFFD.
|
|
||||||
continue |
|
||||||
} |
|
||||||
mapping[x] = rune(y) |
|
||||||
} |
|
||||||
return ascii + string(mapping) |
|
||||||
} |
|
||||||
|
|
||||||
// getUCM downloads the ICU .ucm file at url and returns a 256-rune mapping
// string in which the rune at position c is the code point assigned to byte c.
//
// Only lines that match the pattern `<U%x> \x%x |0` contribute a mapping; all
// other lines (headers, comments, entries with a different trailing field) are
// skipped. Bytes without an entry map to U+FFFD. The program is terminated via
// log.Fatalf on download, status or read failure, or when fewer than 200
// mappings are found, which is taken as a sign of an unexpected page format.
func getUCM(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Fatalf("%q: Get: %v", url, err)
	}
	defer res.Body.Close()
	// http.Get only reports transport errors; reject error pages up front so
	// the failure names the real cause.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("%q: Get: unexpected status %s", url, res.Status)
	}

	mapping := make([]rune, 256)
	for i := range mapping {
		mapping[i] = '\ufffd'
	}

	charsFound := 0
	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		var c byte
		var r rune
		if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
			// Not a mapping line of the expected shape.
			continue
		}
		mapping[c] = r
		charsFound++
	}
	// Scan returns false on both EOF and read errors; distinguish the two so
	// that a truncated download is not mistaken for a complete table.
	if err := scanner.Err(); err != nil {
		log.Fatalf("%q: scanner error: %v", url, err)
	}

	if charsFound < 200 {
		log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
	}

	return string(mapping)
}
|
||||||
|
|
||||||
func main() { |
|
||||||
mibs := map[string]bool{} |
|
||||||
all := []string{} |
|
||||||
|
|
||||||
w := gen.NewCodeWriter() |
|
||||||
defer w.WriteGoFile("tables.go", "charmap") |
|
||||||
|
|
||||||
printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) } |
|
||||||
|
|
||||||
printf("import (\n") |
|
||||||
printf("\t\"golang.org/x/text/encoding\"\n") |
|
||||||
printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n") |
|
||||||
printf(")\n\n") |
|
||||||
for _, e := range encodings { |
|
||||||
varNames := strings.Split(e.varName, ",") |
|
||||||
all = append(all, varNames...) |
|
||||||
varName := varNames[0] |
|
||||||
switch { |
|
||||||
case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"): |
|
||||||
e.mapping = getWHATWG(e.mapping) |
|
||||||
case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"): |
|
||||||
e.mapping = getUCM(e.mapping) |
|
||||||
} |
|
||||||
|
|
||||||
asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00 |
|
||||||
if asciiSuperset { |
|
||||||
low = 0x80 |
|
||||||
} |
|
||||||
lvn := 1 |
|
||||||
if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") { |
|
||||||
lvn = 3 |
|
||||||
} |
|
||||||
lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:] |
|
||||||
printf("// %s is the %s encoding.\n", varName, e.name) |
|
||||||
if e.comment != "" { |
|
||||||
printf("//\n// %s\n", e.comment) |
|
||||||
} |
|
||||||
printf("var %s encoding.Encoding = &%s\n\nvar %s = charmap{\nname: %q,\n", |
|
||||||
varName, lowerVarName, lowerVarName, e.name) |
|
||||||
if mibs[e.mib] { |
|
||||||
log.Fatalf("MIB type %q declared multiple times.", e.mib) |
|
||||||
} |
|
||||||
printf("mib: identifier.%s,\n", e.mib) |
|
||||||
printf("asciiSuperset: %t,\n", asciiSuperset) |
|
||||||
printf("low: 0x%02x,\n", low) |
|
||||||
printf("replacement: 0x%02x,\n", e.replacement) |
|
||||||
|
|
||||||
printf("decode: [256]utf8Enc{\n") |
|
||||||
i, backMapping := 0, map[rune]byte{} |
|
||||||
for _, c := range e.mapping { |
|
||||||
if _, ok := backMapping[c]; !ok && c != utf8.RuneError { |
|
||||||
backMapping[c] = byte(i) |
|
||||||
} |
|
||||||
var buf [8]byte |
|
||||||
n := utf8.EncodeRune(buf[:], c) |
|
||||||
if n > 3 { |
|
||||||
panic(fmt.Sprintf("rune %q (%U) is too long", c, c)) |
|
||||||
} |
|
||||||
printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2]) |
|
||||||
if i%2 == 1 { |
|
||||||
printf("\n") |
|
||||||
} |
|
||||||
i++ |
|
||||||
} |
|
||||||
printf("},\n") |
|
||||||
|
|
||||||
printf("encode: [256]uint32{\n") |
|
||||||
encode := make([]uint32, 0, 256) |
|
||||||
for c, i := range backMapping { |
|
||||||
encode = append(encode, uint32(i)<<24|uint32(c)) |
|
||||||
} |
|
||||||
sort.Sort(byRune(encode)) |
|
||||||
for len(encode) < cap(encode) { |
|
||||||
encode = append(encode, encode[len(encode)-1]) |
|
||||||
} |
|
||||||
for i, enc := range encode { |
|
||||||
printf("0x%08x,", enc) |
|
||||||
if i%8 == 7 { |
|
||||||
printf("\n") |
|
||||||
} |
|
||||||
} |
|
||||||
printf("},\n}\n") |
|
||||||
|
|
||||||
// Add an estimate of the size of a single charmap{} struct value, which
|
|
||||||
// includes two 256 elem arrays of 4 bytes and some extra fields, which
|
|
||||||
// align to 3 uint64s on 64-bit architectures.
|
|
||||||
w.Size += 2*4*256 + 3*8 |
|
||||||
} |
|
||||||
// TODO: add proper line breaking.
|
|
||||||
printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n")) |
|
||||||
} |
|
||||||
|
|
||||||
// byRune implements sort.Interface over packed encode entries, ordering them
// by the value held in their low 24 bits and ignoring the top 8 bits.
type byRune []uint32

// Len reports the number of entries.
func (b byRune) Len() int {
	return len(b)
}

// Less reports whether the low 24 bits of entry i are smaller than those of
// entry j.
func (b byRune) Less(i, j int) bool {
	const lowMask = 0xffffff
	left, right := b[i]&lowMask, b[j]&lowMask
	return left < right
}

// Swap exchanges entries i and j.
func (b byRune) Swap(i, j int) {
	tmp := b[i]
	b[i] = b[j]
	b[j] = tmp
}
|
File diff suppressed because it is too large
Load Diff
@ -1,167 +0,0 @@ |
|||||||
// Copyright 2015 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
import ( |
|
||||||
"bytes" |
|
||||||
"encoding/json" |
|
||||||
"fmt" |
|
||||||
"log" |
|
||||||
"strings" |
|
||||||
|
|
||||||
"golang.org/x/text/internal/gen" |
|
||||||
) |
|
||||||
|
|
||||||
// group is one element of the top-level array decoded from encodings.json.
// Only the list of encodings is read from each element.
type group struct {
	// Encodings lists the encodings of the group.
	Encodings []struct {
		// Labels are the strings that become keys of the generated nameMap.
		Labels []string
		// Name is looked up in consts and written to the generated
		// canonical table.
		Name string
	}
}
|
||||||
|
|
||||||
func main() { |
|
||||||
gen.Init() |
|
||||||
|
|
||||||
r := gen.Open("http://www.w3.org/TR", "w3", "encoding/indexes/encodings.json") |
|
||||||
var groups []group |
|
||||||
if err := json.NewDecoder(r).Decode(&groups); err != nil { |
|
||||||
log.Fatalf("Error reading encodings.json: %v", err) |
|
||||||
} |
|
||||||
|
|
||||||
w := &bytes.Buffer{} |
|
||||||
fmt.Fprintln(w, "type htmlEncoding byte") |
|
||||||
fmt.Fprintln(w, "const (") |
|
||||||
for i, g := range groups { |
|
||||||
for _, e := range g.Encodings { |
|
||||||
name := consts[e.Name] |
|
||||||
if name == "" { |
|
||||||
log.Fatalf("No const defined for %s.", e.Name) |
|
||||||
} |
|
||||||
if i == 0 { |
|
||||||
fmt.Fprintf(w, "%s htmlEncoding = iota\n", name) |
|
||||||
} else { |
|
||||||
fmt.Fprintf(w, "%s\n", name) |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Fprintln(w, "numEncodings") |
|
||||||
fmt.Fprint(w, ")\n\n") |
|
||||||
|
|
||||||
fmt.Fprintln(w, "var canonical = [numEncodings]string{") |
|
||||||
for _, g := range groups { |
|
||||||
for _, e := range g.Encodings { |
|
||||||
fmt.Fprintf(w, "%q,\n", e.Name) |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Fprint(w, "}\n\n") |
|
||||||
|
|
||||||
fmt.Fprintln(w, "var nameMap = map[string]htmlEncoding{") |
|
||||||
for _, g := range groups { |
|
||||||
for _, e := range g.Encodings { |
|
||||||
for _, l := range e.Labels { |
|
||||||
fmt.Fprintf(w, "%q: %s,\n", l, consts[e.Name]) |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Fprint(w, "}\n\n") |
|
||||||
|
|
||||||
var tags []string |
|
||||||
fmt.Fprintln(w, "var localeMap = []htmlEncoding{") |
|
||||||
for _, loc := range locales { |
|
||||||
tags = append(tags, loc.tag) |
|
||||||
fmt.Fprintf(w, "%s, // %s \n", consts[loc.name], loc.tag) |
|
||||||
} |
|
||||||
fmt.Fprint(w, "}\n\n") |
|
||||||
|
|
||||||
fmt.Fprintf(w, "const locales = %q\n", strings.Join(tags, " ")) |
|
||||||
|
|
||||||
gen.WriteGoFile("tables.go", "htmlindex", w.Bytes()) |
|
||||||
} |
|
||||||
|
|
||||||
// consts maps each canonical encoding name to the identifier of the internal
// constant that is generated for it.
var consts = map[string]string{
	// Unicode, byte-oriented.
	"utf-8": "utf8",

	// Legacy single-byte encodings.
	"ibm866":         "ibm866",
	"iso-8859-2":     "iso8859_2",
	"iso-8859-3":     "iso8859_3",
	"iso-8859-4":     "iso8859_4",
	"iso-8859-5":     "iso8859_5",
	"iso-8859-6":     "iso8859_6",
	"iso-8859-7":     "iso8859_7",
	"iso-8859-8":     "iso8859_8",
	"iso-8859-8-i":   "iso8859_8I",
	"iso-8859-10":    "iso8859_10",
	"iso-8859-13":    "iso8859_13",
	"iso-8859-14":    "iso8859_14",
	"iso-8859-15":    "iso8859_15",
	"iso-8859-16":    "iso8859_16",
	"koi8-r":         "koi8r",
	"koi8-u":         "koi8u",
	"macintosh":      "macintosh",
	"windows-874":    "windows874",
	"windows-1250":   "windows1250",
	"windows-1251":   "windows1251",
	"windows-1252":   "windows1252",
	"windows-1253":   "windows1253",
	"windows-1254":   "windows1254",
	"windows-1255":   "windows1255",
	"windows-1256":   "windows1256",
	"windows-1257":   "windows1257",
	"windows-1258":   "windows1258",
	"x-mac-cyrillic": "macintoshCyrillic",

	// Legacy multi-byte encodings.
	"gbk":     "gbk",
	"gb18030": "gb18030",
	// "hz-gb-2312": "hzgb2312", // Was removed from WhatWG
	"big5":        "big5",
	"euc-jp":      "eucjp",
	"iso-2022-jp": "iso2022jp",
	"shift_jis":   "shiftJIS",
	"euc-kr":      "euckr",

	// Miscellaneous.
	"replacement":    "replacement",
	"utf-16be":       "utf16be",
	"utf-16le":       "utf16le",
	"x-user-defined": "xUserDefined",
}
|
||||||
|
|
||||||
// locales pairs a language tag with the canonical name of the encoding used
// as the default for that locale. The data is taken from
// https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm.
var locales = []struct{ tag, name string }{
	{tag: "und", name: "windows-1252"}, // The default value.
	{tag: "ar", name: "windows-1256"},
	{tag: "ba", name: "windows-1251"},
	{tag: "be", name: "windows-1251"},
	{tag: "bg", name: "windows-1251"},
	{tag: "cs", name: "windows-1250"},
	{tag: "el", name: "iso-8859-7"},
	{tag: "et", name: "windows-1257"},
	{tag: "fa", name: "windows-1256"},
	{tag: "he", name: "windows-1255"},
	{tag: "hr", name: "windows-1250"},
	{tag: "hu", name: "iso-8859-2"},
	{tag: "ja", name: "shift_jis"},
	{tag: "kk", name: "windows-1251"},
	{tag: "ko", name: "euc-kr"},
	{tag: "ku", name: "windows-1254"},
	{tag: "ky", name: "windows-1251"},
	{tag: "lt", name: "windows-1257"},
	{tag: "lv", name: "windows-1257"},
	{tag: "mk", name: "windows-1251"},
	{tag: "pl", name: "iso-8859-2"},
	{tag: "ru", name: "windows-1251"},
	{tag: "sah", name: "windows-1251"},
	{tag: "sk", name: "windows-1250"},
	{tag: "sl", name: "iso-8859-2"},
	{tag: "sr", name: "windows-1251"},
	{tag: "tg", name: "windows-1251"},
	{tag: "th", name: "windows-874"},
	{tag: "tr", name: "windows-1254"},
	{tag: "tt", name: "windows-1251"},
	{tag: "uk", name: "windows-1251"},
	{tag: "vi", name: "windows-1258"},
	{tag: "zh-hans", name: "gb18030"},
	{tag: "zh-hant", name: "big5"},
}
|
@ -1,137 +0,0 @@ |
|||||||
// Copyright 2015 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
import ( |
|
||||||
"bytes" |
|
||||||
"encoding/xml" |
|
||||||
"fmt" |
|
||||||
"io" |
|
||||||
"log" |
|
||||||
"strings" |
|
||||||
|
|
||||||
"golang.org/x/text/internal/gen" |
|
||||||
) |
|
||||||
|
|
||||||
// registry is the decoding target for the IANA character-sets XML document.
// The struct tags name the XML elements and attributes each field is filled
// from.
type registry struct {
	XMLName xml.Name `xml:"registry"`
	Updated string   `xml:"updated"`
	// Registry holds the nested <registry> elements; main only reads the
	// first one.
	Registry []struct {
		ID string `xml:"id,attr"`
		// Record holds one entry per <record> element.
		Record []struct {
			Name string `xml:"name"`
			// Xref holds the <xref> children of the record itself; main
			// turns those of type "rfc" and "uri" into reference lines.
			Xref []struct {
				Type string `xml:"type,attr"`
				Data string `xml:"data,attr"`
			} `xml:"xref"`
			// Desc keeps the raw inner XML of the description so that main
			// can re-tokenize it, including any embedded elements.
			Desc struct {
				Data string `xml:",innerxml"`
				// Any []struct {
				// 	Data string `xml:",chardata"`
				// } `xml:",any"`
				// Data string `xml:",chardata"`
			} `xml:"description,"`
			// MIB is the text of the <value> element; main emits it as the
			// value of the generated constant.
			MIB   string   `xml:"value"`
			Alias []string `xml:"alias"`
			// MIME is the text of the <preferred_alias> element.
			MIME string `xml:"preferred_alias"`
		} `xml:"record"`
	} `xml:"registry"`
}
|
||||||
|
|
||||||
func main() { |
|
||||||
r := gen.OpenIANAFile("assignments/character-sets/character-sets.xml") |
|
||||||
reg := ®istry{} |
|
||||||
if err := xml.NewDecoder(r).Decode(®); err != nil && err != io.EOF { |
|
||||||
log.Fatalf("Error decoding charset registry: %v", err) |
|
||||||
} |
|
||||||
if len(reg.Registry) == 0 || reg.Registry[0].ID != "character-sets-1" { |
|
||||||
log.Fatalf("Unexpected ID %s", reg.Registry[0].ID) |
|
||||||
} |
|
||||||
|
|
||||||
w := &bytes.Buffer{} |
|
||||||
fmt.Fprintf(w, "const (\n") |
|
||||||
for _, rec := range reg.Registry[0].Record { |
|
||||||
constName := "" |
|
||||||
for _, a := range rec.Alias { |
|
||||||
if strings.HasPrefix(a, "cs") && strings.IndexByte(a, '-') == -1 { |
|
||||||
// Some of the constant definitions have comments in them. Strip those.
|
|
||||||
constName = strings.Title(strings.SplitN(a[2:], "\n", 2)[0]) |
|
||||||
} |
|
||||||
} |
|
||||||
if constName == "" { |
|
||||||
switch rec.MIB { |
|
||||||
case "2085": |
|
||||||
constName = "HZGB2312" // Not listed as alias for some reason.
|
|
||||||
default: |
|
||||||
log.Fatalf("No cs alias defined for %s.", rec.MIB) |
|
||||||
} |
|
||||||
} |
|
||||||
if rec.MIME != "" { |
|
||||||
rec.MIME = fmt.Sprintf(" (MIME: %s)", rec.MIME) |
|
||||||
} |
|
||||||
fmt.Fprintf(w, "// %s is the MIB identifier with IANA name %s%s.\n//\n", constName, rec.Name, rec.MIME) |
|
||||||
if len(rec.Desc.Data) > 0 { |
|
||||||
fmt.Fprint(w, "// ") |
|
||||||
d := xml.NewDecoder(strings.NewReader(rec.Desc.Data)) |
|
||||||
inElem := true |
|
||||||
attr := "" |
|
||||||
for { |
|
||||||
t, err := d.Token() |
|
||||||
if err != nil { |
|
||||||
if err != io.EOF { |
|
||||||
log.Fatal(err) |
|
||||||
} |
|
||||||
break |
|
||||||
} |
|
||||||
switch x := t.(type) { |
|
||||||
case xml.CharData: |
|
||||||
attr = "" // Don't need attribute info.
|
|
||||||
a := bytes.Split([]byte(x), []byte("\n")) |
|
||||||
for i, b := range a { |
|
||||||
if b = bytes.TrimSpace(b); len(b) != 0 { |
|
||||||
if !inElem && i > 0 { |
|
||||||
fmt.Fprint(w, "\n// ") |
|
||||||
} |
|
||||||
inElem = false |
|
||||||
fmt.Fprintf(w, "%s ", string(b)) |
|
||||||
} |
|
||||||
} |
|
||||||
case xml.StartElement: |
|
||||||
if x.Name.Local == "xref" { |
|
||||||
inElem = true |
|
||||||
use := false |
|
||||||
for _, a := range x.Attr { |
|
||||||
if a.Name.Local == "type" { |
|
||||||
use = use || a.Value != "person" |
|
||||||
} |
|
||||||
if a.Name.Local == "data" && use { |
|
||||||
attr = a.Value + " " |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
case xml.EndElement: |
|
||||||
inElem = false |
|
||||||
fmt.Fprint(w, attr) |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Fprint(w, "\n") |
|
||||||
} |
|
||||||
for _, x := range rec.Xref { |
|
||||||
switch x.Type { |
|
||||||
case "rfc": |
|
||||||
fmt.Fprintf(w, "// Reference: %s\n", strings.ToUpper(x.Data)) |
|
||||||
case "uri": |
|
||||||
fmt.Fprintf(w, "// Reference: %s\n", x.Data) |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Fprintf(w, "%s MIB = %s\n", constName, rec.MIB) |
|
||||||
fmt.Fprintln(w) |
|
||||||
} |
|
||||||
fmt.Fprintln(w, ")") |
|
||||||
|
|
||||||
gen.WriteGoFile("mib.go", "identifier", w.Bytes()) |
|
||||||
} |
|
@ -1,161 +0,0 @@ |
|||||||
// Copyright 2013 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
// This program generates tables.go:
|
|
||||||
// go run maketables.go | gofmt > tables.go
|
|
||||||
|
|
||||||
// TODO: Emoji extensions?
|
|
||||||
// http://www.unicode.org/faq/emoji_dingbats.html
|
|
||||||
// http://www.unicode.org/Public/UNIDATA/EmojiSources.txt
|
|
||||||
|
|
||||||
import ( |
|
||||||
"bufio" |
|
||||||
"fmt" |
|
||||||
"log" |
|
||||||
"net/http" |
|
||||||
"sort" |
|
||||||
"strings" |
|
||||||
) |
|
||||||
|
|
||||||
// entry is one slot of the reverse (Unicode to JIS) lookup table built by
// main.
type entry struct {
	// jisCode is the index of the code point within its source table.
	jisCode int
	// table is the index into the list of source tables, or -1 when the
	// slot is unused.
	table int
}
|
||||||
|
|
||||||
func main() { |
|
||||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") |
|
||||||
fmt.Printf("// Package japanese provides Japanese encodings such as EUC-JP and Shift JIS.\n") |
|
||||||
fmt.Printf(`package japanese // import "golang.org/x/text/encoding/japanese"` + "\n\n") |
|
||||||
|
|
||||||
reverse := [65536]entry{} |
|
||||||
for i := range reverse { |
|
||||||
reverse[i].table = -1 |
|
||||||
} |
|
||||||
|
|
||||||
tables := []struct { |
|
||||||
url string |
|
||||||
name string |
|
||||||
}{ |
|
||||||
{"http://encoding.spec.whatwg.org/index-jis0208.txt", "0208"}, |
|
||||||
{"http://encoding.spec.whatwg.org/index-jis0212.txt", "0212"}, |
|
||||||
} |
|
||||||
for i, table := range tables { |
|
||||||
res, err := http.Get(table.url) |
|
||||||
if err != nil { |
|
||||||
log.Fatalf("%q: Get: %v", table.url, err) |
|
||||||
} |
|
||||||
defer res.Body.Close() |
|
||||||
|
|
||||||
mapping := [65536]uint16{} |
|
||||||
|
|
||||||
scanner := bufio.NewScanner(res.Body) |
|
||||||
for scanner.Scan() { |
|
||||||
s := strings.TrimSpace(scanner.Text()) |
|
||||||
if s == "" || s[0] == '#' { |
|
||||||
continue |
|
||||||
} |
|
||||||
x, y := 0, uint16(0) |
|
||||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { |
|
||||||
log.Fatalf("%q: could not parse %q", table.url, s) |
|
||||||
} |
|
||||||
if x < 0 || 120*94 <= x { |
|
||||||
log.Fatalf("%q: JIS code %d is out of range", table.url, x) |
|
||||||
} |
|
||||||
mapping[x] = y |
|
||||||
if reverse[y].table == -1 { |
|
||||||
reverse[y] = entry{jisCode: x, table: i} |
|
||||||
} |
|
||||||
} |
|
||||||
if err := scanner.Err(); err != nil { |
|
||||||
log.Fatalf("%q: scanner error: %v", table.url, err) |
|
||||||
} |
|
||||||
|
|
||||||
fmt.Printf("// jis%sDecode is the decoding table from JIS %s code to Unicode.\n// It is defined at %s\n", |
|
||||||
table.name, table.name, table.url) |
|
||||||
fmt.Printf("var jis%sDecode = [...]uint16{\n", table.name) |
|
||||||
for i, m := range mapping { |
|
||||||
if m != 0 { |
|
||||||
fmt.Printf("\t%d: 0x%04X,\n", i, m) |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Printf("}\n\n") |
|
||||||
} |
|
||||||
|
|
||||||
// Any run of at least separation continuous zero entries in the reverse map will
|
|
||||||
// be a separate encode table.
|
|
||||||
const separation = 1024 |
|
||||||
|
|
||||||
intervals := []interval(nil) |
|
||||||
low, high := -1, -1 |
|
||||||
for i, v := range reverse { |
|
||||||
if v.table == -1 { |
|
||||||
continue |
|
||||||
} |
|
||||||
if low < 0 { |
|
||||||
low = i |
|
||||||
} else if i-high >= separation { |
|
||||||
if high >= 0 { |
|
||||||
intervals = append(intervals, interval{low, high}) |
|
||||||
} |
|
||||||
low = i |
|
||||||
} |
|
||||||
high = i + 1 |
|
||||||
} |
|
||||||
if high >= 0 { |
|
||||||
intervals = append(intervals, interval{low, high}) |
|
||||||
} |
|
||||||
sort.Sort(byDecreasingLength(intervals)) |
|
||||||
|
|
||||||
fmt.Printf("const (\n") |
|
||||||
fmt.Printf("\tjis0208 = 1\n") |
|
||||||
fmt.Printf("\tjis0212 = 2\n") |
|
||||||
fmt.Printf("\tcodeMask = 0x7f\n") |
|
||||||
fmt.Printf("\tcodeShift = 7\n") |
|
||||||
fmt.Printf("\ttableShift = 14\n") |
|
||||||
fmt.Printf(")\n\n") |
|
||||||
|
|
||||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) |
|
||||||
fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n") |
|
||||||
fmt.Printf("// sorted by decreasing length.\n") |
|
||||||
for i, v := range intervals { |
|
||||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high) |
|
||||||
} |
|
||||||
fmt.Printf("//\n") |
|
||||||
fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n") |
|
||||||
fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n") |
|
||||||
fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n") |
|
||||||
fmt.Printf("// JIS code (94*j1 + j2) within that table.\n") |
|
||||||
fmt.Printf("\n") |
|
||||||
|
|
||||||
for i, v := range intervals { |
|
||||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) |
|
||||||
fmt.Printf("var encode%d = [...]uint16{\n", i) |
|
||||||
for j := v.low; j < v.high; j++ { |
|
||||||
x := reverse[j] |
|
||||||
if x.table == -1 { |
|
||||||
continue |
|
||||||
} |
|
||||||
fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n", |
|
||||||
j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94) |
|
||||||
} |
|
||||||
fmt.Printf("}\n\n") |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// interval describes the half-open range [low, high).
type interval struct {
	low  int
	high int
}

// len reports how many values the interval covers.
func (i interval) len() int {
	return i.high - i.low
}

// byDecreasingLength provides the Len, Less and Swap methods needed to sort
// intervals from the longest to the shortest.
type byDecreasingLength []interval

func (b byDecreasingLength) Len() int { return len(b) }

func (b byDecreasingLength) Less(i, j int) bool {
	return b[j].len() < b[i].len()
}

func (b byDecreasingLength) Swap(i, j int) {
	b[j], b[i] = b[i], b[j]
}
|
@ -1,143 +0,0 @@ |
|||||||
// Copyright 2013 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
// This program generates tables.go:
|
|
||||||
// go run maketables.go | gofmt > tables.go
|
|
||||||
|
|
||||||
import ( |
|
||||||
"bufio" |
|
||||||
"fmt" |
|
||||||
"log" |
|
||||||
"net/http" |
|
||||||
"sort" |
|
||||||
"strings" |
|
||||||
) |
|
||||||
|
|
||||||
func main() { |
|
||||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") |
|
||||||
fmt.Printf("// Package korean provides Korean encodings such as EUC-KR.\n") |
|
||||||
fmt.Printf(`package korean // import "golang.org/x/text/encoding/korean"` + "\n\n") |
|
||||||
|
|
||||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-euc-kr.txt") |
|
||||||
if err != nil { |
|
||||||
log.Fatalf("Get: %v", err) |
|
||||||
} |
|
||||||
defer res.Body.Close() |
|
||||||
|
|
||||||
mapping := [65536]uint16{} |
|
||||||
reverse := [65536]uint16{} |
|
||||||
|
|
||||||
scanner := bufio.NewScanner(res.Body) |
|
||||||
for scanner.Scan() { |
|
||||||
s := strings.TrimSpace(scanner.Text()) |
|
||||||
if s == "" || s[0] == '#' { |
|
||||||
continue |
|
||||||
} |
|
||||||
x, y := uint16(0), uint16(0) |
|
||||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { |
|
||||||
log.Fatalf("could not parse %q", s) |
|
||||||
} |
|
||||||
if x < 0 || 178*(0xc7-0x81)+(0xfe-0xc7)*94+(0xff-0xa1) <= x { |
|
||||||
log.Fatalf("EUC-KR code %d is out of range", x) |
|
||||||
} |
|
||||||
mapping[x] = y |
|
||||||
if reverse[y] == 0 { |
|
||||||
c0, c1 := uint16(0), uint16(0) |
|
||||||
if x < 178*(0xc7-0x81) { |
|
||||||
c0 = uint16(x/178) + 0x81 |
|
||||||
c1 = uint16(x % 178) |
|
||||||
switch { |
|
||||||
case c1 < 1*26: |
|
||||||
c1 += 0x41 |
|
||||||
case c1 < 2*26: |
|
||||||
c1 += 0x47 |
|
||||||
default: |
|
||||||
c1 += 0x4d |
|
||||||
} |
|
||||||
} else { |
|
||||||
x -= 178 * (0xc7 - 0x81) |
|
||||||
c0 = uint16(x/94) + 0xc7 |
|
||||||
c1 = uint16(x%94) + 0xa1 |
|
||||||
} |
|
||||||
reverse[y] = c0<<8 | c1 |
|
||||||
} |
|
||||||
} |
|
||||||
if err := scanner.Err(); err != nil { |
|
||||||
log.Fatalf("scanner error: %v", err) |
|
||||||
} |
|
||||||
|
|
||||||
fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n") |
|
||||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n") |
|
||||||
fmt.Printf("var decode = [...]uint16{\n") |
|
||||||
for i, v := range mapping { |
|
||||||
if v != 0 { |
|
||||||
fmt.Printf("\t%d: 0x%04X,\n", i, v) |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Printf("}\n\n") |
|
||||||
|
|
||||||
// Any run of at least separation continuous zero entries in the reverse map will
|
|
||||||
// be a separate encode table.
|
|
||||||
const separation = 1024 |
|
||||||
|
|
||||||
intervals := []interval(nil) |
|
||||||
low, high := -1, -1 |
|
||||||
for i, v := range reverse { |
|
||||||
if v == 0 { |
|
||||||
continue |
|
||||||
} |
|
||||||
if low < 0 { |
|
||||||
low = i |
|
||||||
} else if i-high >= separation { |
|
||||||
if high >= 0 { |
|
||||||
intervals = append(intervals, interval{low, high}) |
|
||||||
} |
|
||||||
low = i |
|
||||||
} |
|
||||||
high = i + 1 |
|
||||||
} |
|
||||||
if high >= 0 { |
|
||||||
intervals = append(intervals, interval{low, high}) |
|
||||||
} |
|
||||||
sort.Sort(byDecreasingLength(intervals)) |
|
||||||
|
|
||||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) |
|
||||||
fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n") |
|
||||||
fmt.Printf("// sorted by decreasing length.\n") |
|
||||||
for i, v := range intervals { |
|
||||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high) |
|
||||||
} |
|
||||||
fmt.Printf("\n") |
|
||||||
|
|
||||||
for i, v := range intervals { |
|
||||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) |
|
||||||
fmt.Printf("var encode%d = [...]uint16{\n", i) |
|
||||||
for j := v.low; j < v.high; j++ { |
|
||||||
x := reverse[j] |
|
||||||
if x == 0 { |
|
||||||
continue |
|
||||||
} |
|
||||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x) |
|
||||||
} |
|
||||||
fmt.Printf("}\n\n") |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// interval is the half-open range of values [low, high).
type interval struct {
	low  int
	high int
}

// len returns the width of the interval, high minus low.
func (i interval) len() int {
	width := i.high - i.low
	return width
}

// byDecreasingLength orders intervals so that longer ones come first; it
// carries the Len, Less and Swap methods used by sort.Sort.
type byDecreasingLength []interval

func (b byDecreasingLength) Len() int { return len(b) }

func (b byDecreasingLength) Swap(i, j int) { b[j], b[i] = b[i], b[j] }

func (b byDecreasingLength) Less(i, j int) bool {
	li, lj := b[i].len(), b[j].len()
	return li > lj
}
|
@ -1,161 +0,0 @@ |
|||||||
// Copyright 2013 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
// This program generates tables.go:
|
|
||||||
// go run maketables.go | gofmt > tables.go
|
|
||||||
|
|
||||||
import ( |
|
||||||
"bufio" |
|
||||||
"fmt" |
|
||||||
"log" |
|
||||||
"net/http" |
|
||||||
"sort" |
|
||||||
"strings" |
|
||||||
) |
|
||||||
|
|
||||||
func main() { |
|
||||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") |
|
||||||
fmt.Printf("// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.\n") |
|
||||||
fmt.Printf(`package simplifiedchinese // import "golang.org/x/text/encoding/simplifiedchinese"` + "\n\n") |
|
||||||
|
|
||||||
printGB18030() |
|
||||||
printGBK() |
|
||||||
} |
|
||||||
|
|
||||||
// printGB18030 downloads the WHATWG GB18030 ranges index and prints it to
// standard output as the Go table "gb18030", one {pointer, code point} pair
// per entry. Entries whose pointer or code point does not fit in 16 bits are
// omitted.
func printGB18030() {
	res, err := http.Get("http://encoding.spec.whatwg.org/index-gb18030.txt")
	if err != nil {
		log.Fatalf("Get: %v", err)
	}
	defer res.Body.Close()
	// Refuse to generate a table from an error page.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("Get: unexpected status %s", res.Status)
	}

	fmt.Printf("// gb18030 is the table from http://encoding.spec.whatwg.org/index-gb18030.txt\n")
	fmt.Printf("var gb18030 = [...][2]uint16{\n")
	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		x, y := uint32(0), uint32(0)
		if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
			log.Fatalf("could not parse %q", s)
		}
		if x < 0x10000 && y < 0x10000 {
			fmt.Printf("\t{0x%04x, 0x%04x},\n", x, y)
		}
	}
	// A read error would otherwise end the loop silently and leave a
	// truncated table that still looks well-formed.
	if err := scanner.Err(); err != nil {
		log.Fatalf("scanner error: %v", err)
	}
	fmt.Printf("}\n\n")
}
|
||||||
|
|
||||||
func printGBK() { |
|
||||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-gbk.txt") |
|
||||||
if err != nil { |
|
||||||
log.Fatalf("Get: %v", err) |
|
||||||
} |
|
||||||
defer res.Body.Close() |
|
||||||
|
|
||||||
mapping := [65536]uint16{} |
|
||||||
reverse := [65536]uint16{} |
|
||||||
|
|
||||||
scanner := bufio.NewScanner(res.Body) |
|
||||||
for scanner.Scan() { |
|
||||||
s := strings.TrimSpace(scanner.Text()) |
|
||||||
if s == "" || s[0] == '#' { |
|
||||||
continue |
|
||||||
} |
|
||||||
x, y := uint16(0), uint16(0) |
|
||||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { |
|
||||||
log.Fatalf("could not parse %q", s) |
|
||||||
} |
|
||||||
if x < 0 || 126*190 <= x { |
|
||||||
log.Fatalf("GBK code %d is out of range", x) |
|
||||||
} |
|
||||||
mapping[x] = y |
|
||||||
if reverse[y] == 0 { |
|
||||||
c0, c1 := x/190, x%190 |
|
||||||
if c1 >= 0x3f { |
|
||||||
c1++ |
|
||||||
} |
|
||||||
reverse[y] = (0x81+c0)<<8 | (0x40 + c1) |
|
||||||
} |
|
||||||
} |
|
||||||
if err := scanner.Err(); err != nil { |
|
||||||
log.Fatalf("scanner error: %v", err) |
|
||||||
} |
|
||||||
|
|
||||||
fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n") |
|
||||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n") |
|
||||||
fmt.Printf("var decode = [...]uint16{\n") |
|
||||||
for i, v := range mapping { |
|
||||||
if v != 0 { |
|
||||||
fmt.Printf("\t%d: 0x%04X,\n", i, v) |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Printf("}\n\n") |
|
||||||
|
|
||||||
// Any run of at least separation continuous zero entries in the reverse map will
|
|
||||||
// be a separate encode table.
|
|
||||||
const separation = 1024 |
|
||||||
|
|
||||||
intervals := []interval(nil) |
|
||||||
low, high := -1, -1 |
|
||||||
for i, v := range reverse { |
|
||||||
if v == 0 { |
|
||||||
continue |
|
||||||
} |
|
||||||
if low < 0 { |
|
||||||
low = i |
|
||||||
} else if i-high >= separation { |
|
||||||
if high >= 0 { |
|
||||||
intervals = append(intervals, interval{low, high}) |
|
||||||
} |
|
||||||
low = i |
|
||||||
} |
|
||||||
high = i + 1 |
|
||||||
} |
|
||||||
if high >= 0 { |
|
||||||
intervals = append(intervals, interval{low, high}) |
|
||||||
} |
|
||||||
sort.Sort(byDecreasingLength(intervals)) |
|
||||||
|
|
||||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) |
|
||||||
fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n") |
|
||||||
fmt.Printf("// sorted by decreasing length.\n") |
|
||||||
for i, v := range intervals { |
|
||||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high) |
|
||||||
} |
|
||||||
fmt.Printf("\n") |
|
||||||
|
|
||||||
for i, v := range intervals { |
|
||||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) |
|
||||||
fmt.Printf("var encode%d = [...]uint16{\n", i) |
|
||||||
for j := v.low; j < v.high; j++ { |
|
||||||
x := reverse[j] |
|
||||||
if x == 0 { |
|
||||||
continue |
|
||||||
} |
|
||||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x) |
|
||||||
} |
|
||||||
fmt.Printf("}\n\n") |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// interval holds the bounds of the half-open range [low, high).
type interval struct {
	low  int
	high int
}

// len gives the number of values between low (inclusive) and high (exclusive).
func (i interval) len() int {
	return i.high - i.low
}

// byDecreasingLength is a slice of intervals with the Len, Less and Swap
// methods that sort it longest-first.
type byDecreasingLength []interval

func (b byDecreasingLength) Len() int {
	return len(b)
}

func (b byDecreasingLength) Less(i, j int) bool {
	return b[j].len() < b[i].len()
}

func (b byDecreasingLength) Swap(i, j int) {
	b[i], b[j] = b[j], b[i]
}
|
@ -1,140 +0,0 @@ |
|||||||
// Copyright 2013 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
// This program generates tables.go:
|
|
||||||
// go run maketables.go | gofmt > tables.go
|
|
||||||
|
|
||||||
import ( |
|
||||||
"bufio" |
|
||||||
"fmt" |
|
||||||
"log" |
|
||||||
"net/http" |
|
||||||
"sort" |
|
||||||
"strings" |
|
||||||
) |
|
||||||
|
|
||||||
func main() { |
|
||||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") |
|
||||||
fmt.Printf("// Package traditionalchinese provides Traditional Chinese encodings such as Big5.\n") |
|
||||||
fmt.Printf(`package traditionalchinese // import "golang.org/x/text/encoding/traditionalchinese"` + "\n\n") |
|
||||||
|
|
||||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-big5.txt") |
|
||||||
if err != nil { |
|
||||||
log.Fatalf("Get: %v", err) |
|
||||||
} |
|
||||||
defer res.Body.Close() |
|
||||||
|
|
||||||
mapping := [65536]uint32{} |
|
||||||
reverse := [65536 * 4]uint16{} |
|
||||||
|
|
||||||
scanner := bufio.NewScanner(res.Body) |
|
||||||
for scanner.Scan() { |
|
||||||
s := strings.TrimSpace(scanner.Text()) |
|
||||||
if s == "" || s[0] == '#' { |
|
||||||
continue |
|
||||||
} |
|
||||||
x, y := uint16(0), uint32(0) |
|
||||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { |
|
||||||
log.Fatalf("could not parse %q", s) |
|
||||||
} |
|
||||||
if x < 0 || 126*157 <= x { |
|
||||||
log.Fatalf("Big5 code %d is out of range", x) |
|
||||||
} |
|
||||||
mapping[x] = y |
|
||||||
|
|
||||||
// The WHATWG spec http://encoding.spec.whatwg.org/#indexes says that
|
|
||||||
// "The index pointer for code point in index is the first pointer
|
|
||||||
// corresponding to code point in index", which would normally mean
|
|
||||||
// that the code below should be guarded by "if reverse[y] == 0", but
|
|
||||||
// last instead of first seems to match the behavior of
|
|
||||||
// "iconv -f UTF-8 -t BIG5". For example, U+8005 者 occurs twice in
|
|
||||||
// http://encoding.spec.whatwg.org/index-big5.txt, as index 2148
|
|
||||||
// (encoded as "\x8e\xcd") and index 6543 (encoded as "\xaa\xcc")
|
|
||||||
// and "echo 者 | iconv -f UTF-8 -t BIG5 | xxd" gives "\xaa\xcc".
|
|
||||||
c0, c1 := x/157, x%157 |
|
||||||
if c1 < 0x3f { |
|
||||||
c1 += 0x40 |
|
||||||
} else { |
|
||||||
c1 += 0x62 |
|
||||||
} |
|
||||||
reverse[y] = (0x81+c0)<<8 | c1 |
|
||||||
} |
|
||||||
if err := scanner.Err(); err != nil { |
|
||||||
log.Fatalf("scanner error: %v", err) |
|
||||||
} |
|
||||||
|
|
||||||
fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n") |
|
||||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n") |
|
||||||
fmt.Printf("var decode = [...]uint32{\n") |
|
||||||
for i, v := range mapping { |
|
||||||
if v != 0 { |
|
||||||
fmt.Printf("\t%d: 0x%08X,\n", i, v) |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Printf("}\n\n") |
|
||||||
|
|
||||||
// Any run of at least separation continuous zero entries in the reverse map will
|
|
||||||
// be a separate encode table.
|
|
||||||
const separation = 1024 |
|
||||||
|
|
||||||
intervals := []interval(nil) |
|
||||||
low, high := -1, -1 |
|
||||||
for i, v := range reverse { |
|
||||||
if v == 0 { |
|
||||||
continue |
|
||||||
} |
|
||||||
if low < 0 { |
|
||||||
low = i |
|
||||||
} else if i-high >= separation { |
|
||||||
if high >= 0 { |
|
||||||
intervals = append(intervals, interval{low, high}) |
|
||||||
} |
|
||||||
low = i |
|
||||||
} |
|
||||||
high = i + 1 |
|
||||||
} |
|
||||||
if high >= 0 { |
|
||||||
intervals = append(intervals, interval{low, high}) |
|
||||||
} |
|
||||||
sort.Sort(byDecreasingLength(intervals)) |
|
||||||
|
|
||||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) |
|
||||||
fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n") |
|
||||||
fmt.Printf("// sorted by decreasing length.\n") |
|
||||||
for i, v := range intervals { |
|
||||||
fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high) |
|
||||||
} |
|
||||||
fmt.Printf("\n") |
|
||||||
|
|
||||||
for i, v := range intervals { |
|
||||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) |
|
||||||
fmt.Printf("var encode%d = [...]uint16{\n", i) |
|
||||||
for j := v.low; j < v.high; j++ { |
|
||||||
x := reverse[j] |
|
||||||
if x == 0 { |
|
||||||
continue |
|
||||||
} |
|
||||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x) |
|
||||||
} |
|
||||||
fmt.Printf("}\n\n") |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// interval represents the half-open range [low, high).
type interval struct {
	low  int
	high int
}

// len is the size of the interval: high minus low.
func (i interval) len() int {
	return i.high - i.low
}

// byDecreasingLength sorts a slice of intervals longest-first through its
// Len, Less and Swap methods.
type byDecreasingLength []interval

func (b byDecreasingLength) Len() int { return len(b) }

func (b byDecreasingLength) Less(i, j int) bool {
	first, second := b[i], b[j]
	return first.len() > second.len()
}

func (b byDecreasingLength) Swap(i, j int) { b[j], b[i] = b[i], b[j] }
|
@ -1,20 +0,0 @@ |
|||||||
// Copyright 2014 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
// This file contains code common to the maketables.go and the package code.
|
|
||||||
|
|
||||||
// langAliasType classifies an alias stored in langAliasMap.
type langAliasType int8

// The known alias kinds, numbered consecutively from zero.
const (
	langDeprecated langAliasType = iota
	langMacro
	langLegacy
)

// langAliasTypeUnknown is the value used when the alias kind is not known.
const langAliasTypeUnknown langAliasType = -1
|
@ -1,162 +0,0 @@ |
|||||||
// Copyright 2015 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
// This file generates derivative tables based on the language package itself.
|
|
||||||
|
|
||||||
import ( |
|
||||||
"bytes" |
|
||||||
"flag" |
|
||||||
"fmt" |
|
||||||
"io/ioutil" |
|
||||||
"log" |
|
||||||
"reflect" |
|
||||||
"sort" |
|
||||||
"strings" |
|
||||||
|
|
||||||
"golang.org/x/text/internal/gen" |
|
||||||
"golang.org/x/text/language" |
|
||||||
"golang.org/x/text/unicode/cldr" |
|
||||||
) |
|
||||||
|
|
||||||
// Command-line flags of the generator.

// test is the -test flag (default false).
var test = flag.Bool(
	"test",
	false,
	"test existing tables; can be used to compare web data with package data.",
)

// draft is the -draft flag (default "contributed").
var draft = flag.String(
	"draft",
	"contributed",
	`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`,
)
|
||||||
|
|
||||||
func main() { |
|
||||||
gen.Init() |
|
||||||
|
|
||||||
// Read the CLDR zip file.
|
|
||||||
r := gen.OpenCLDRCoreZip() |
|
||||||
defer r.Close() |
|
||||||
|
|
||||||
d := &cldr.Decoder{} |
|
||||||
data, err := d.DecodeZip(r) |
|
||||||
if err != nil { |
|
||||||
log.Fatalf("DecodeZip: %v", err) |
|
||||||
} |
|
||||||
|
|
||||||
w := gen.NewCodeWriter() |
|
||||||
defer func() { |
|
||||||
buf := &bytes.Buffer{} |
|
||||||
|
|
||||||
if _, err = w.WriteGo(buf, "language"); err != nil { |
|
||||||
log.Fatalf("Error formatting file index.go: %v", err) |
|
||||||
} |
|
||||||
|
|
||||||
// Since we're generating a table for our own package we need to rewrite
|
|
||||||
// doing the equivalent of go fmt -r 'language.b -> b'. Using
|
|
||||||
// bytes.Replace will do.
|
|
||||||
out := bytes.Replace(buf.Bytes(), []byte("language."), nil, -1) |
|
||||||
if err := ioutil.WriteFile("index.go", out, 0600); err != nil { |
|
||||||
log.Fatalf("Could not create file index.go: %v", err) |
|
||||||
} |
|
||||||
}() |
|
||||||
|
|
||||||
m := map[language.Tag]bool{} |
|
||||||
for _, lang := range data.Locales() { |
|
||||||
// We include all locales unconditionally to be consistent with en_US.
|
|
||||||
// We want en_US, even though it has no data associated with it.
|
|
||||||
|
|
||||||
// TODO: put any of the languages for which no data exists at the end
|
|
||||||
// of the index. This allows all components based on ICU to use that
|
|
||||||
// as the cutoff point.
|
|
||||||
// if x := data.RawLDML(lang); false ||
|
|
||||||
// x.LocaleDisplayNames != nil ||
|
|
||||||
// x.Characters != nil ||
|
|
||||||
// x.Delimiters != nil ||
|
|
||||||
// x.Measurement != nil ||
|
|
||||||
// x.Dates != nil ||
|
|
||||||
// x.Numbers != nil ||
|
|
||||||
// x.Units != nil ||
|
|
||||||
// x.ListPatterns != nil ||
|
|
||||||
// x.Collations != nil ||
|
|
||||||
// x.Segmentations != nil ||
|
|
||||||
// x.Rbnf != nil ||
|
|
||||||
// x.Annotations != nil ||
|
|
||||||
// x.Metadata != nil {
|
|
||||||
|
|
||||||
// TODO: support POSIX natively, albeit non-standard.
|
|
||||||
tag := language.Make(strings.Replace(lang, "_POSIX", "-u-va-posix", 1)) |
|
||||||
m[tag] = true |
|
||||||
// }
|
|
||||||
} |
|
||||||
// Include locales for plural rules, which uses a different structure.
|
|
||||||
for _, plurals := range data.Supplemental().Plurals { |
|
||||||
for _, rules := range plurals.PluralRules { |
|
||||||
for _, lang := range strings.Split(rules.Locales, " ") { |
|
||||||
m[language.Make(lang)] = true |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
var core, special []language.Tag |
|
||||||
|
|
||||||
for t := range m { |
|
||||||
if x := t.Extensions(); len(x) != 0 && fmt.Sprint(x) != "[u-va-posix]" { |
|
||||||
log.Fatalf("Unexpected extension %v in %v", x, t) |
|
||||||
} |
|
||||||
if len(t.Variants()) == 0 && len(t.Extensions()) == 0 { |
|
||||||
core = append(core, t) |
|
||||||
} else { |
|
||||||
special = append(special, t) |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
w.WriteComment(` |
|
||||||
NumCompactTags is the number of common tags. The maximum tag is |
|
||||||
NumCompactTags-1.`) |
|
||||||
w.WriteConst("NumCompactTags", len(core)+len(special)) |
|
||||||
|
|
||||||
sort.Sort(byAlpha(special)) |
|
||||||
w.WriteVar("specialTags", special) |
|
||||||
|
|
||||||
// TODO: order by frequency?
|
|
||||||
sort.Sort(byAlpha(core)) |
|
||||||
|
|
||||||
// Size computations are just an estimate.
|
|
||||||
w.Size += int(reflect.TypeOf(map[uint32]uint16{}).Size()) |
|
||||||
w.Size += len(core) * 6 // size of uint32 and uint16
|
|
||||||
|
|
||||||
fmt.Fprintln(w) |
|
||||||
fmt.Fprintln(w, "var coreTags = map[uint32]uint16{") |
|
||||||
fmt.Fprintln(w, "0x0: 0, // und") |
|
||||||
i := len(special) + 1 // Und and special tags already written.
|
|
||||||
for _, t := range core { |
|
||||||
if t == language.Und { |
|
||||||
continue |
|
||||||
} |
|
||||||
fmt.Fprint(w.Hash, t, i) |
|
||||||
b, s, r := t.Raw() |
|
||||||
fmt.Fprintf(w, "0x%s%s%s: %d, // %s\n", |
|
||||||
getIndex(b, 3), // 3 is enough as it is guaranteed to be a compact number
|
|
||||||
getIndex(s, 2), |
|
||||||
getIndex(r, 3), |
|
||||||
i, t) |
|
||||||
i++ |
|
||||||
} |
|
||||||
fmt.Fprintln(w, "}") |
|
||||||
} |
|
||||||
|
|
||||||
// getIndex formats x with %#v (expected to look like Type{typeID: 0x00}),
// takes the text that follows the first "0x" up to but excluding the final
// character, and left-pads it with "0" so that the result is n characters
// long. The extracted text must not be longer than n.
func getIndex(x interface{}, n int) string {
	repr := fmt.Sprintf("%#v", x)
	start := strings.Index(repr, "0x") + 2
	digits := repr[start : len(repr)-1]
	padding := strings.Repeat("0", n-len(digits))
	return padding + digits
}
|
||||||
|
|
||||||
type byAlpha []language.Tag |
|
||||||
|
|
||||||
func (a byAlpha) Len() int { return len(a) } |
|
||||||
func (a byAlpha) Swap(i, j int) { a[i], a[j] = a[j], a[i] } |
|
||||||
func (a byAlpha) Less(i, j int) bool { return a[i].String() < a[j].String() } |
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,976 +0,0 @@ |
|||||||
// Copyright 2011 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
// Normalization table generator.
|
|
||||||
// Data read from the web.
|
|
||||||
// See forminfo.go for a description of the trie values associated with each rune.
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
import ( |
|
||||||
"bytes" |
|
||||||
"flag" |
|
||||||
"fmt" |
|
||||||
"io" |
|
||||||
"log" |
|
||||||
"sort" |
|
||||||
"strconv" |
|
||||||
"strings" |
|
||||||
|
|
||||||
"golang.org/x/text/internal/gen" |
|
||||||
"golang.org/x/text/internal/triegen" |
|
||||||
"golang.org/x/text/internal/ucd" |
|
||||||
) |
|
||||||
|
|
||||||
func main() { |
|
||||||
gen.Init() |
|
||||||
loadUnicodeData() |
|
||||||
compactCCC() |
|
||||||
loadCompositionExclusions() |
|
||||||
completeCharFields(FCanonical) |
|
||||||
completeCharFields(FCompatibility) |
|
||||||
computeNonStarterCounts() |
|
||||||
verifyComputed() |
|
||||||
printChars() |
|
||||||
testDerived() |
|
||||||
printTestdata() |
|
||||||
makeTables() |
|
||||||
} |
|
||||||
|
|
||||||
// Command-line flags of the normalization table generator.
var (
	// tablelist is the -tables flag (default "all").
	tablelist = flag.String(
		"tables",
		"all",
		"comma-separated list of which tables to generate; can be 'decomp', 'recomp', 'info' and 'all'",
	)

	// test is the -test flag (default false).
	test = flag.Bool(
		"test",
		false,
		"test existing tables against DerivedNormalizationProps and generate test data for regression testing",
	)

	// verbose is the -verbose flag (default false).
	verbose = flag.Bool(
		"verbose",
		false,
		"write data to stdout as it is parsed",
	)
)
|
||||||
|
|
||||||
// MaxChar is the highest code point handled by the generator; anything above
// this shouldn't exist.
const MaxChar = 0x10FFFF
|
|
||||||
|
|
||||||
// QCResult is a Quick Check property value. Quick Check properties allow us
// to quickly determine whether a rune may occur in a normal form: for a given
// normal form a rune may be guaranteed to occur verbatim (QC=Yes), may or may
// not combine with another rune (QC=Maybe), or may not occur (QC=No).
type QCResult int

const (
	QCUnknown QCResult = iota
	QCYes
	QCNo
	QCMaybe
)

// String returns "Yes", "No" or "Maybe" for the corresponding value and
// "***UNKNOWN***" for everything else, including QCUnknown.
func (r QCResult) String() string {
	if r == QCYes {
		return "Yes"
	}
	if r == QCNo {
		return "No"
	}
	if r == QCMaybe {
		return "Maybe"
	}
	return "***UNKNOWN***"
}
|
||||||
|
|
||||||
// Form types. FNumberOfFormTypes is the number of form types and is used as
// an array length.
const (
	FCanonical         = iota // NFC or NFD
	FCompatibility            // NFKC or NFKD
	FNumberOfFormTypes        // count of the form types above
)

// Modes. MNumberOfModes is the number of modes and is used as an array
// length.
const (
	MComposed      = iota // NFC or NFKC
	MDecomposed           // NFD or NFKD
	MNumberOfModes        // count of the modes above
)
|
||||||
|
|
||||||
// This contains only the properties we're interested in.
|
|
||||||
type Char struct { |
|
||||||
name string |
|
||||||
codePoint rune // if zero, this index is not a valid code point.
|
|
||||||
ccc uint8 // canonical combining class
|
|
||||||
origCCC uint8 |
|
||||||
excludeInComp bool // from CompositionExclusions.txt
|
|
||||||
compatDecomp bool // it has a compatibility expansion
|
|
||||||
|
|
||||||
nTrailingNonStarters uint8 |
|
||||||
nLeadingNonStarters uint8 // must be equal to trailing if non-zero
|
|
||||||
|
|
||||||
forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility
|
|
||||||
|
|
||||||
state State |
|
||||||
} |
|
||||||
|
|
||||||
var chars = make([]Char, MaxChar+1) |
|
||||||
var cccMap = make(map[uint8]uint8) |
|
||||||
|
|
||||||
func (c Char) String() string { |
|
||||||
buf := new(bytes.Buffer) |
|
||||||
|
|
||||||
fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name) |
|
||||||
fmt.Fprintf(buf, " ccc: %v\n", c.ccc) |
|
||||||
fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp) |
|
||||||
fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp) |
|
||||||
fmt.Fprintf(buf, " state: %v\n", c.state) |
|
||||||
fmt.Fprintf(buf, " NFC:\n") |
|
||||||
fmt.Fprint(buf, c.forms[FCanonical]) |
|
||||||
fmt.Fprintf(buf, " NFKC:\n") |
|
||||||
fmt.Fprint(buf, c.forms[FCompatibility]) |
|
||||||
|
|
||||||
return buf.String() |
|
||||||
} |
|
||||||
|
|
||||||
// State records how a character's entry appeared in UnicodeData.txt, where
// some ranges are marked only by their first and last code point, like this:
//	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
//	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
// parseCharacter keeps a state variable indicating the weirdness.
type State int

const (
	SNormal  State = iota // known to be zero for the type
	SFirst                // presumably the "First" line of a range; confirm against the parser
	SLast                 // presumably the "Last" line of a range; confirm against the parser
	SMissing              // entries in this state are reported as not valid by Char.isValid
)
|
||||||
|
|
||||||
// lastChar starts out as U+0000.
var lastChar rune = '\u0000'
|
||||||
|
|
||||||
func (c Char) isValid() bool { |
|
||||||
return c.codePoint != 0 && c.state != SMissing |
|
||||||
} |
|
||||||
|
|
||||||
type FormInfo struct { |
|
||||||
quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
|
|
||||||
verified [MNumberOfModes]bool // index: MComposed or MDecomposed
|
|
||||||
|
|
||||||
combinesForward bool // May combine with rune on the right
|
|
||||||
combinesBackward bool // May combine with rune on the left
|
|
||||||
isOneWay bool // Never appears in result
|
|
||||||
inDecomp bool // Some decompositions result in this char.
|
|
||||||
decomp Decomposition |
|
||||||
expandedDecomp Decomposition |
|
||||||
} |
|
||||||
|
|
||||||
func (f FormInfo) String() string { |
|
||||||
buf := bytes.NewBuffer(make([]byte, 0)) |
|
||||||
|
|
||||||
fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed]) |
|
||||||
fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed]) |
|
||||||
fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward) |
|
||||||
fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward) |
|
||||||
fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay) |
|
||||||
fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp) |
|
||||||
fmt.Fprintf(buf, " decomposition: %X\n", f.decomp) |
|
||||||
fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp) |
|
||||||
|
|
||||||
return buf.String() |
|
||||||
} |
|
||||||
|
|
||||||
// Decomposition is the sequence of runes a character decomposes into.
type Decomposition []rune
|
||||||
|
|
||||||
// parseDecomposition splits s on single spaces and parses every field as a
// hexadecimal code point. If skipfirst is set, the first field is dropped
// before parsing. On a parse error it returns the runes parsed so far
// together with the error; note that an empty s yields one empty field and
// therefore an error.
func parseDecomposition(s string, skipfirst bool) (a []rune, err error) {
	fields := strings.Split(s, " ")
	if skipfirst && len(fields) > 0 {
		fields = fields[1:]
	}
	for i := 0; i < len(fields); i++ {
		value, perr := strconv.ParseUint(fields[i], 16, 64)
		if perr != nil {
			return a, perr
		}
		a = append(a, rune(value))
	}
	return a, nil
}
|
||||||
|
|
||||||
func loadUnicodeData() { |
|
||||||
f := gen.OpenUCDFile("UnicodeData.txt") |
|
||||||
defer f.Close() |
|
||||||
p := ucd.New(f) |
|
||||||
for p.Next() { |
|
||||||
r := p.Rune(ucd.CodePoint) |
|
||||||
char := &chars[r] |
|
||||||
|
|
||||||
char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass)) |
|
||||||
decmap := p.String(ucd.DecompMapping) |
|
||||||
|
|
||||||
exp, err := parseDecomposition(decmap, false) |
|
||||||
isCompat := false |
|
||||||
if err != nil { |
|
||||||
if len(decmap) > 0 { |
|
||||||
exp, err = parseDecomposition(decmap, true) |
|
||||||
if err != nil { |
|
||||||
log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err) |
|
||||||
} |
|
||||||
isCompat = true |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
char.name = p.String(ucd.Name) |
|
||||||
char.codePoint = r |
|
||||||
char.forms[FCompatibility].decomp = exp |
|
||||||
if !isCompat { |
|
||||||
char.forms[FCanonical].decomp = exp |
|
||||||
} else { |
|
||||||
char.compatDecomp = true |
|
||||||
} |
|
||||||
if len(decmap) > 0 { |
|
||||||
char.forms[FCompatibility].decomp = exp |
|
||||||
} |
|
||||||
} |
|
||||||
if err := p.Err(); err != nil { |
|
||||||
log.Fatal(err) |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// compactCCC converts the sparse set of CCC values to a continguous one,
|
|
||||||
// reducing the number of bits needed from 8 to 6.
|
|
||||||
func compactCCC() { |
|
||||||
m := make(map[uint8]uint8) |
|
||||||
for i := range chars { |
|
||||||
c := &chars[i] |
|
||||||
m[c.ccc] = 0 |
|
||||||
} |
|
||||||
cccs := []int{} |
|
||||||
for v, _ := range m { |
|
||||||
cccs = append(cccs, int(v)) |
|
||||||
} |
|
||||||
sort.Ints(cccs) |
|
||||||
for i, c := range cccs { |
|
||||||
cccMap[uint8(i)] = uint8(c) |
|
||||||
m[uint8(c)] = uint8(i) |
|
||||||
} |
|
||||||
for i := range chars { |
|
||||||
c := &chars[i] |
|
||||||
c.origCCC = c.ccc |
|
||||||
c.ccc = m[c.ccc] |
|
||||||
} |
|
||||||
if len(m) >= 1<<6 { |
|
||||||
log.Fatalf("too many difference CCC values: %d >= 64", len(m)) |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// CompositionExclusions.txt has form:
|
|
||||||
// 0958 # ...
|
|
||||||
// See http://unicode.org/reports/tr44/ for full explanation
|
|
||||||
func loadCompositionExclusions() { |
|
||||||
f := gen.OpenUCDFile("CompositionExclusions.txt") |
|
||||||
defer f.Close() |
|
||||||
p := ucd.New(f) |
|
||||||
for p.Next() { |
|
||||||
c := &chars[p.Rune(0)] |
|
||||||
if c.excludeInComp { |
|
||||||
log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint) |
|
||||||
} |
|
||||||
c.excludeInComp = true |
|
||||||
} |
|
||||||
if e := p.Err(); e != nil { |
|
||||||
log.Fatal(e) |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// hasCompatDecomp returns true if any of the recursive
|
|
||||||
// decompositions contains a compatibility expansion.
|
|
||||||
// In this case, the character may not occur in NFK*.
|
|
||||||
func hasCompatDecomp(r rune) bool { |
|
||||||
c := &chars[r] |
|
||||||
if c.compatDecomp { |
|
||||||
return true |
|
||||||
} |
|
||||||
for _, d := range c.forms[FCompatibility].decomp { |
|
||||||
if hasCompatDecomp(d) { |
|
||||||
return true |
|
||||||
} |
|
||||||
} |
|
||||||
return false |
|
||||||
} |
|
||||||
|
|
||||||
// Hangul related constants.
const (
	// HangulBase and HangulEnd delimit the half-open range tested by isHangul.
	HangulBase = 0xAC00
	HangulEnd  = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)

	// Jamo ranges. completeCharFields compares against each End value with
	// "<", i.e. as an exclusive upper bound.
	JamoLBase = 0x1100
	JamoLEnd  = 0x1113
	JamoVBase = 0x1161
	JamoVEnd  = 0x1176
	JamoTBase = 0x11A8
	JamoTEnd  = 0x11C3

	// JamoLVTCount equals HangulEnd - HangulBase; JamoTCount is the modulus
	// isHangulWithoutJamoT uses to detect syllables without a trailing jamo.
	JamoLVTCount = 19 * 21 * 28
	JamoTCount   = 28
)
|
||||||
|
|
||||||
func isHangul(r rune) bool { |
|
||||||
return HangulBase <= r && r < HangulEnd |
|
||||||
} |
|
||||||
|
|
||||||
func isHangulWithoutJamoT(r rune) bool { |
|
||||||
if !isHangul(r) { |
|
||||||
return false |
|
||||||
} |
|
||||||
r -= HangulBase |
|
||||||
return r < JamoLVTCount && r%JamoTCount == 0 |
|
||||||
} |
|
||||||
|
|
||||||
// ccc returns the ccc field stored for r in chars. Once compactCCC has run,
// that field holds the compacted index rather than the original value.
func ccc(r rune) uint8 {
	return chars[r].ccc
}
|
||||||
|
|
||||||
// Insert a rune in a buffer, ordered by Canonical Combining Class.
|
|
||||||
func insertOrdered(b Decomposition, r rune) Decomposition { |
|
||||||
n := len(b) |
|
||||||
b = append(b, 0) |
|
||||||
cc := ccc(r) |
|
||||||
if cc > 0 { |
|
||||||
// Use bubble sort.
|
|
||||||
for ; n > 0; n-- { |
|
||||||
if ccc(b[n-1]) <= cc { |
|
||||||
break |
|
||||||
} |
|
||||||
b[n] = b[n-1] |
|
||||||
} |
|
||||||
} |
|
||||||
b[n] = r |
|
||||||
return b |
|
||||||
} |
|
||||||
|
|
||||||
// Recursively decompose.
|
|
||||||
func decomposeRecursive(form int, r rune, d Decomposition) Decomposition { |
|
||||||
dcomp := chars[r].forms[form].decomp |
|
||||||
if len(dcomp) == 0 { |
|
||||||
return insertOrdered(d, r) |
|
||||||
} |
|
||||||
for _, c := range dcomp { |
|
||||||
d = decomposeRecursive(form, c, d) |
|
||||||
} |
|
||||||
return d |
|
||||||
} |
|
||||||
|
|
||||||
// completeCharFields derives the remaining FormInfo fields of every entry in
// chars for the given form (an index into Char.forms). It runs four passes:
// the fully expanded decomposition, the one-way and inDecomp marks, the
// forward/backward combining flags, and the quick check values. Later passes
// read fields written by earlier ones, so the order matters.
func completeCharFields(form int) {
	// Phase 0: pre-expand decomposition.
	for i := range chars {
		f := &chars[i].forms[form]
		if len(f.decomp) == 0 {
			continue
		}
		exp := make(Decomposition, 0)
		for _, c := range f.decomp {
			exp = decomposeRecursive(form, c, exp)
		}
		f.expandedDecomp = exp
	}

	// Phase 1: composition exclusion, mark decomposition.
	for i := range chars {
		c := &chars[i]
		f := &c.forms[form]

		// Marks script-specific exclusions and version restricted.
		f.isOneWay = c.excludeInComp

		// Singletons
		f.isOneWay = f.isOneWay || len(f.decomp) == 1

		// Non-starter decompositions
		if len(f.decomp) > 1 {
			chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0
			f.isOneWay = f.isOneWay || chk
		}

		// Runes that decompose into more than two runes.
		f.isOneWay = f.isOneWay || len(f.decomp) > 2

		if form == FCompatibility {
			f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
		}

		// Flag every rune that appears in this rune's decomposition.
		for _, r := range f.decomp {
			chars[r].forms[form].inDecomp = true
		}
	}

	// Phase 2: forward and backward combining.
	for i := range chars {
		c := &chars[i]
		f := &c.forms[form]

		// Only a two-rune decomposition that is not one-way contributes
		// combining flags to its parts.
		if !f.isOneWay && len(f.decomp) == 2 {
			f0 := &chars[f.decomp[0]].forms[form]
			f1 := &chars[f.decomp[1]].forms[form]
			if !f0.isOneWay {
				f0.combinesForward = true
			}
			if !f1.isOneWay {
				f1.combinesBackward = true
			}
		}
		if isHangulWithoutJamoT(rune(i)) {
			f.combinesForward = true
		}
	}

	// Phase 3: quick check values.
	for i := range chars {
		c := &chars[i]
		f := &c.forms[form]

		switch {
		case len(f.decomp) > 0:
			f.quickCheck[MDecomposed] = QCNo
		case isHangul(rune(i)):
			f.quickCheck[MDecomposed] = QCNo
		default:
			f.quickCheck[MDecomposed] = QCYes
		}
		switch {
		case f.isOneWay:
			f.quickCheck[MComposed] = QCNo
		// The mask selects code points 0x1100 through 0x11FF.
		case (i & 0xffff00) == JamoLBase:
			f.quickCheck[MComposed] = QCYes
			if JamoLBase <= i && i < JamoLEnd {
				f.combinesForward = true
			}
			if JamoVBase <= i && i < JamoVEnd {
				f.quickCheck[MComposed] = QCMaybe
				f.combinesBackward = true
				f.combinesForward = true
			}
			if JamoTBase <= i && i < JamoTEnd {
				f.quickCheck[MComposed] = QCMaybe
				f.combinesBackward = true
			}
		case !f.combinesBackward:
			f.quickCheck[MComposed] = QCYes
		default:
			f.quickCheck[MComposed] = QCMaybe
		}
	}
}
|
||||||
|
|
||||||
func computeNonStarterCounts() { |
|
||||||
// Phase 4: leading and trailing non-starter count
|
|
||||||
for i := range chars { |
|
||||||
c := &chars[i] |
|
||||||
|
|
||||||
runes := []rune{rune(i)} |
|
||||||
// We always use FCompatibility so that the CGJ insertion points do not
|
|
||||||
// change for repeated normalizations with different forms.
|
|
||||||
if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 { |
|
||||||
runes = exp |
|
||||||
} |
|
||||||
// We consider runes that combine backwards to be non-starters for the
|
|
||||||
// purpose of Stream-Safe Text Processing.
|
|
||||||
for _, r := range runes { |
|
||||||
if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { |
|
||||||
break |
|
||||||
} |
|
||||||
c.nLeadingNonStarters++ |
|
||||||
} |
|
||||||
for i := len(runes) - 1; i >= 0; i-- { |
|
||||||
if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { |
|
||||||
break |
|
||||||
} |
|
||||||
c.nTrailingNonStarters++ |
|
||||||
} |
|
||||||
if c.nTrailingNonStarters > 3 { |
|
||||||
log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes) |
|
||||||
} |
|
||||||
|
|
||||||
if isHangul(rune(i)) { |
|
||||||
c.nTrailingNonStarters = 2 |
|
||||||
if isHangulWithoutJamoT(rune(i)) { |
|
||||||
c.nTrailingNonStarters = 1 |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t { |
|
||||||
log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t) |
|
||||||
} |
|
||||||
if t := c.nTrailingNonStarters; t > 3 { |
|
||||||
log.Fatalf("%U: number of trailing non-starters is %d > 3", t) |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// printBytes writes b to w as the Go source of a byte array variable called
// name. Values are printed eight per line, and a comment giving the byte
// range precedes every group of 64.
func printBytes(w io.Writer, b []byte, name string) {
	fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b))
	fmt.Fprintf(w, "var %s = [...]byte {", name)
	for idx := 0; idx < len(b); idx++ {
		if idx%64 == 0 {
			fmt.Fprintf(w, "\n// Bytes %x - %x\n", idx, idx+63)
		} else if idx%8 == 0 {
			fmt.Fprintf(w, "\n")
		}
		fmt.Fprintf(w, "0x%.2X, ", b[idx])
	}
	fmt.Fprint(w, "\n}\n\n")
}
|
||||||
|
|
||||||
// See forminfo.go for format.
|
|
||||||
func makeEntry(f *FormInfo, c *Char) uint16 { |
|
||||||
e := uint16(0) |
|
||||||
if r := c.codePoint; HangulBase <= r && r < HangulEnd { |
|
||||||
e |= 0x40 |
|
||||||
} |
|
||||||
if f.combinesForward { |
|
||||||
e |= 0x20 |
|
||||||
} |
|
||||||
if f.quickCheck[MDecomposed] == QCNo { |
|
||||||
e |= 0x4 |
|
||||||
} |
|
||||||
switch f.quickCheck[MComposed] { |
|
||||||
case QCYes: |
|
||||||
case QCNo: |
|
||||||
e |= 0x10 |
|
||||||
case QCMaybe: |
|
||||||
e |= 0x18 |
|
||||||
default: |
|
||||||
log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed]) |
|
||||||
} |
|
||||||
e |= uint16(c.nTrailingNonStarters) |
|
||||||
return e |
|
||||||
} |
|
||||||
|
|
||||||
// decompSet keeps track of unique decompositions, grouped by whether
// the decomposition is followed by a trailing and/or leading CCC.
// It holds one string set per group, indexed by the constants below
// (normalDecomp through firstStarterWithNLead).
type decompSet [7]map[string]bool

// Group indices into a decompSet. lastDecomp is one past the final group.
const (
	normalDecomp = iota
	firstMulti
	firstCCC
	endMulti
	firstLeadingCCC
	firstCCCZeroExcept
	firstStarterWithNLead
	lastDecomp
)

// cname[i] is the name of the constant that printCharInfoTables emits after
// writing group i, with the buffer length at that point as its value.
var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"}
|
||||||
|
|
||||||
func makeDecompSet() decompSet { |
|
||||||
m := decompSet{} |
|
||||||
for i := range m { |
|
||||||
m[i] = make(map[string]bool) |
|
||||||
} |
|
||||||
return m |
|
||||||
} |
|
||||||
// insert records the encoded decomposition s in the set for group key.
func (m *decompSet) insert(key int, s string) {
	m[key][s] = true
}
|
||||||
|
|
||||||
// printCharInfoTables writes to w the "decomps" byte table, the constants
// marking where each decomposition group starts in it, and one trie per form
// ("nfc" and "nfkc"). It returns the length of the decomps table plus the
// sizes reported by trie.Gen.
func printCharInfoTables(w io.Writer) int {
	// mkstr encodes the expanded decomposition of r for form f and returns
	// the group it belongs to together with the encoded string. The string
	// starts with a header byte (UTF-8 length in the low 6 bits, 0x40 when
	// the composed quick check is not Yes, 0x80 for combinesForward),
	// followed by the UTF-8 bytes and, depending on the group, a byte
	// holding tccc<<2|nTrail and a byte holding lccc.
	mkstr := func(r rune, f *FormInfo) (int, string) {
		d := f.expandedDecomp
		s := string([]rune(d))
		if max := 1 << 6; len(s) >= max {
			const msg = "%U: too many bytes in decomposition: %d >= %d"
			log.Fatalf(msg, r, len(s), max)
		}
		head := uint8(len(s))
		if f.quickCheck[MComposed] != QCYes {
			head |= 0x40
		}
		if f.combinesForward {
			head |= 0x80
		}
		s = string([]byte{head}) + s

		lccc := ccc(d[0])
		tccc := ccc(d[len(d)-1])
		cc := ccc(r)
		if cc != 0 && lccc == 0 && tccc == 0 {
			log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
		}
		if tccc < lccc && lccc != 0 {
			const msg = "%U: lccc (%d) must be <= tcc (%d)"
			log.Fatalf(msg, r, lccc, tccc)
		}
		index := normalDecomp
		nTrail := chars[r].nTrailingNonStarters
		nLead := chars[r].nLeadingNonStarters
		if tccc > 0 || lccc > 0 || nTrail > 0 {
			// Pack the trailing ccc and trailing non-starter count
			// into a single byte.
			tccc <<= 2
			tccc |= nTrail
			s += string([]byte{tccc})
			index = endMulti
			// A ccc-0 rune after the first one selects the firstCCC group.
			for _, r := range d[1:] {
				if ccc(r) == 0 {
					index = firstCCC
				}
			}
			if lccc > 0 || nLead > 0 {
				s += string([]byte{lccc})
				if index == firstCCC {
					log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
				}
				index = firstLeadingCCC
			}
			if cc != lccc {
				if cc != 0 {
					log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
				}
				index = firstCCCZeroExcept
			}
		} else if len(d) > 1 {
			index = firstMulti
		}
		return index, s
	}

	decompSet := makeDecompSet()
	const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
	decompSet.insert(firstStarterWithNLead, nLeadStr)

	// Store the uniqued decompositions in a byte buffer,
	// preceded by their byte length.
	for _, c := range chars {
		for _, f := range c.forms {
			if len(f.expandedDecomp) == 0 {
				continue
			}
			if f.combinesBackward {
				log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
			}
			index, s := mkstr(c.codePoint, &f)
			decompSet.insert(index, s)
		}
	}

	decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
	size := 0
	positionMap := make(map[string]uint16)
	// Write one byte first so that no decomposition is stored at offset 0.
	decompositions.WriteString("\000")
	fmt.Fprintln(w, "const (")
	for i, m := range decompSet {
		// Sort each group so the generated table is deterministic.
		sa := []string{}
		for s := range m {
			sa = append(sa, s)
		}
		sort.Strings(sa)
		for _, s := range sa {
			p := decompositions.Len()
			decompositions.WriteString(s)
			// NOTE(review): offsets are narrowed to uint16 without a range
			// check against the emitted maxDecomp (0x8000) — confirm the
			// buffer cannot grow that large.
			positionMap[s] = uint16(p)
		}
		if cname[i] != "" {
			fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len())
		}
	}
	fmt.Fprintln(w, "maxDecomp = 0x8000")
	fmt.Fprintln(w, ")")
	b := decompositions.Bytes()
	printBytes(w, b, "decomps")
	size += len(b)

	varnames := []string{"nfc", "nfkc"}
	for i := 0; i < FNumberOfFormTypes; i++ {
		trie := triegen.NewTrie(varnames[i])

		for r, c := range chars {
			f := c.forms[i]
			d := f.expandedDecomp
			if len(d) != 0 {
				// Runes that decompose map to the offset of their encoded
				// decomposition in the decomps table.
				_, key := mkstr(c.codePoint, &f)
				trie.Insert(rune(r), uint64(positionMap[key]))
				if c.ccc != ccc(d[0]) {
					// We assume the lead ccc of a decomposition !=0 in this case.
					if ccc(d[0]) == 0 {
						log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
					}
				}
			} else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward {
				// Handle cases where it can't be detected that the nLead should be equal
				// to nTrail.
				trie.Insert(c.codePoint, uint64(positionMap[nLeadStr]))
			} else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 {
				// All other runes store their packed entry and ccc
				// directly, tagged with the 0x8000 bit.
				trie.Insert(c.codePoint, uint64(0x8000|v))
			}
		}
		sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]}))
		if err != nil {
			log.Fatal(err)
		}
		size += sz
	}
	return size
}
|
||||||
|
|
||||||
// contains reports whether s is an element of sa.
func contains(sa []string, s string) bool {
	for i := 0; i < len(sa); i++ {
		if sa[i] == s {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
func makeTables() { |
|
||||||
w := &bytes.Buffer{} |
|
||||||
|
|
||||||
size := 0 |
|
||||||
if *tablelist == "" { |
|
||||||
return |
|
||||||
} |
|
||||||
list := strings.Split(*tablelist, ",") |
|
||||||
if *tablelist == "all" { |
|
||||||
list = []string{"recomp", "info"} |
|
||||||
} |
|
||||||
|
|
||||||
// Compute maximum decomposition size.
|
|
||||||
max := 0 |
|
||||||
for _, c := range chars { |
|
||||||
if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max { |
|
||||||
max = n |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
fmt.Fprintln(w, "const (") |
|
||||||
fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.") |
|
||||||
fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion()) |
|
||||||
fmt.Fprintln(w) |
|
||||||
fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform") |
|
||||||
fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at") |
|
||||||
fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that") |
|
||||||
fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.") |
|
||||||
fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max) |
|
||||||
fmt.Fprintln(w, ")\n") |
|
||||||
|
|
||||||
// Print the CCC remap table.
|
|
||||||
size += len(cccMap) |
|
||||||
fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap)) |
|
||||||
for i := 0; i < len(cccMap); i++ { |
|
||||||
if i%8 == 0 { |
|
||||||
fmt.Fprintln(w) |
|
||||||
} |
|
||||||
fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)]) |
|
||||||
} |
|
||||||
fmt.Fprintln(w, "\n}\n") |
|
||||||
|
|
||||||
if contains(list, "info") { |
|
||||||
size += printCharInfoTables(w) |
|
||||||
} |
|
||||||
|
|
||||||
if contains(list, "recomp") { |
|
||||||
// Note that we use 32 bit keys, instead of 64 bit.
|
|
||||||
// This clips the bits of three entries, but we know
|
|
||||||
// this won't cause a collision. The compiler will catch
|
|
||||||
// any changes made to UnicodeData.txt that introduces
|
|
||||||
// a collision.
|
|
||||||
// Note that the recomposition map for NFC and NFKC
|
|
||||||
// are identical.
|
|
||||||
|
|
||||||
// Recomposition map
|
|
||||||
nrentries := 0 |
|
||||||
for _, c := range chars { |
|
||||||
f := c.forms[FCanonical] |
|
||||||
if !f.isOneWay && len(f.decomp) > 0 { |
|
||||||
nrentries++ |
|
||||||
} |
|
||||||
} |
|
||||||
sz := nrentries * 8 |
|
||||||
size += sz |
|
||||||
fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz) |
|
||||||
fmt.Fprintln(w, "var recompMap = map[uint32]rune{") |
|
||||||
for i, c := range chars { |
|
||||||
f := c.forms[FCanonical] |
|
||||||
d := f.decomp |
|
||||||
if !f.isOneWay && len(d) > 0 { |
|
||||||
key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1])) |
|
||||||
fmt.Fprintf(w, "0x%.8X: 0x%.4X,\n", key, i) |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Fprintf(w, "}\n\n") |
|
||||||
} |
|
||||||
|
|
||||||
fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size) |
|
||||||
gen.WriteGoFile("tables.go", "norm", w.Bytes()) |
|
||||||
} |
|
||||||
|
|
||||||
func printChars() { |
|
||||||
if *verbose { |
|
||||||
for _, c := range chars { |
|
||||||
if !c.isValid() || c.state == SMissing { |
|
||||||
continue |
|
||||||
} |
|
||||||
fmt.Println(c) |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// verifyComputed does various consistency tests.
|
|
||||||
func verifyComputed() { |
|
||||||
for i, c := range chars { |
|
||||||
for _, f := range c.forms { |
|
||||||
isNo := (f.quickCheck[MDecomposed] == QCNo) |
|
||||||
if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) { |
|
||||||
log.Fatalf("%U: NF*D QC must be No if rune decomposes", i) |
|
||||||
} |
|
||||||
|
|
||||||
isMaybe := f.quickCheck[MComposed] == QCMaybe |
|
||||||
if f.combinesBackward != isMaybe { |
|
||||||
log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i) |
|
||||||
} |
|
||||||
if len(f.decomp) > 0 && f.combinesForward && isMaybe { |
|
||||||
log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i) |
|
||||||
} |
|
||||||
|
|
||||||
if len(f.expandedDecomp) != 0 { |
|
||||||
continue |
|
||||||
} |
|
||||||
if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b { |
|
||||||
// We accept these runes to be treated differently (it only affects
|
|
||||||
// segment breaking in iteration, most likely on improper use), but
|
|
||||||
// reconsider if more characters are added.
|
|
||||||
// U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;;
|
|
||||||
// U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;;
|
|
||||||
// U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;;
|
|
||||||
// U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;;
|
|
||||||
// U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;;
|
|
||||||
// U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;;
|
|
||||||
if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) { |
|
||||||
log.Fatalf("%U: nLead was %v; want %v", i, a, b) |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
nfc := c.forms[FCanonical] |
|
||||||
nfkc := c.forms[FCompatibility] |
|
||||||
if nfc.combinesBackward != nfkc.combinesBackward { |
|
||||||
log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint) |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Use values in DerivedNormalizationProps.txt to compare against the
|
|
||||||
// values we computed.
|
|
||||||
// DerivedNormalizationProps.txt has form:
|
|
||||||
// 00C0..00C5 ; NFD_QC; N # ...
|
|
||||||
// 0374 ; NFD_QC; N # ...
|
|
||||||
// See http://unicode.org/reports/tr44/ for full explanation
|
|
||||||
func testDerived() { |
|
||||||
f := gen.OpenUCDFile("DerivedNormalizationProps.txt") |
|
||||||
defer f.Close() |
|
||||||
p := ucd.New(f) |
|
||||||
for p.Next() { |
|
||||||
r := p.Rune(0) |
|
||||||
c := &chars[r] |
|
||||||
|
|
||||||
var ftype, mode int |
|
||||||
qt := p.String(1) |
|
||||||
switch qt { |
|
||||||
case "NFC_QC": |
|
||||||
ftype, mode = FCanonical, MComposed |
|
||||||
case "NFD_QC": |
|
||||||
ftype, mode = FCanonical, MDecomposed |
|
||||||
case "NFKC_QC": |
|
||||||
ftype, mode = FCompatibility, MComposed |
|
||||||
case "NFKD_QC": |
|
||||||
ftype, mode = FCompatibility, MDecomposed |
|
||||||
default: |
|
||||||
continue |
|
||||||
} |
|
||||||
var qr QCResult |
|
||||||
switch p.String(2) { |
|
||||||
case "Y": |
|
||||||
qr = QCYes |
|
||||||
case "N": |
|
||||||
qr = QCNo |
|
||||||
case "M": |
|
||||||
qr = QCMaybe |
|
||||||
default: |
|
||||||
log.Fatalf(`Unexpected quick check value "%s"`, p.String(2)) |
|
||||||
} |
|
||||||
if got := c.forms[ftype].quickCheck[mode]; got != qr { |
|
||||||
log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr) |
|
||||||
} |
|
||||||
c.forms[ftype].verified[mode] = true |
|
||||||
} |
|
||||||
if err := p.Err(); err != nil { |
|
||||||
log.Fatal(err) |
|
||||||
} |
|
||||||
// Any unspecified value must be QCYes. Verify this.
|
|
||||||
for i, c := range chars { |
|
||||||
for j, fd := range c.forms { |
|
||||||
for k, qr := range fd.quickCheck { |
|
||||||
if !fd.verified[k] && qr != QCYes { |
|
||||||
m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n" |
|
||||||
log.Printf(m, i, j, k, qr, c.name) |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// testHeader is the Go source that printTestdata writes at the top of the
// generated data_test.go. It ends with the opening of the testData slice
// literal, which printTestdata fills in and closes.
var testHeader = `const (
	Yes = iota
	No
	Maybe
)

type formData struct {
	qc              uint8
	combinesForward bool
	decomposition   string
}

type runeData struct {
	r      rune
	ccc    uint8
	nLead  uint8
	nTrail uint8
	f      [2]formData // 0: canonical; 1: compatibility
}

func f(qc uint8, cf bool, dec string) [2]formData {
	return [2]formData{{qc, cf, dec}, {qc, cf, dec}}
}

func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData {
	return [2]formData{{qc, cf, d}, {qck, cfk, dk}}
}

var testData = []runeData{
`
|
||||||
|
|
||||||
func printTestdata() { |
|
||||||
type lastInfo struct { |
|
||||||
ccc uint8 |
|
||||||
nLead uint8 |
|
||||||
nTrail uint8 |
|
||||||
f string |
|
||||||
} |
|
||||||
|
|
||||||
last := lastInfo{} |
|
||||||
w := &bytes.Buffer{} |
|
||||||
fmt.Fprintf(w, testHeader) |
|
||||||
for r, c := range chars { |
|
||||||
f := c.forms[FCanonical] |
|
||||||
qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) |
|
||||||
f = c.forms[FCompatibility] |
|
||||||
qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) |
|
||||||
s := "" |
|
||||||
if d == dk && qc == qck && cf == cfk { |
|
||||||
s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d) |
|
||||||
} else { |
|
||||||
s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk) |
|
||||||
} |
|
||||||
current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s} |
|
||||||
if last != current { |
|
||||||
fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s) |
|
||||||
last = current |
|
||||||
} |
|
||||||
} |
|
||||||
fmt.Fprintln(w, "}") |
|
||||||
gen.WriteGoFile("data_test.go", "norm", w.Bytes()) |
|
||||||
} |
|
@ -1,117 +0,0 @@ |
|||||||
// Copyright 2011 The Go Authors. All rights reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// +build ignore
|
|
||||||
|
|
||||||
// Trie table generator.
|
|
||||||
// Used by make*tables tools to generate a go file with trie data structures
|
|
||||||
// for mapping UTF-8 to a 16-bit value. All but the last byte in a UTF-8 byte
|
|
||||||
// sequence are used to lookup offsets in the index table to be used for the
|
|
||||||
// next byte. The last byte is used to index into a table with 16-bit values.
|
|
||||||
|
|
||||||
package main |
|
||||||
|
|
||||||
import ( |
|
||||||
"fmt" |
|
||||||
"io" |
|
||||||
) |
|
||||||
|
|
||||||
// maxSparseEntries is the largest sparse entry count for which
// normCompacter.Size reports a block as storable.
const maxSparseEntries = 16
|
||||||
|
|
||||||
// normCompacter collects trie blocks handed to Store and prints them as
// sparse offset and value tables.
type normCompacter struct {
	sparseBlocks [][]uint64 // blocks in the order they were stored
	sparseOffset []uint16   // value of sparseCount when each block was stored
	sparseCount  int        // running total of entries needed by the stored blocks
	name         string     // prefix for the generated identifiers
}
|
||||||
|
|
||||||
// mostFrequentStride returns the most common non-negative difference between
// consecutive values of a, ignoring any difference taken from a zero value.
// Ties go to the smaller stride; 0 is returned when no difference qualifies.
func mostFrequentStride(a []uint64) int {
	freq := map[int]int{}
	prev := 0
	for _, x := range a {
		cur := int(x)
		if delta := cur - prev; prev != 0 && delta >= 0 {
			freq[delta]++
		}
		prev = cur
	}
	best, bestCount := 0, 0
	for delta, n := range freq {
		switch {
		case n > bestCount, n == bestCount && delta < best:
			best, bestCount = delta, n
		}
	}
	return best
}
|
||||||
|
|
||||||
func countSparseEntries(a []uint64) int { |
|
||||||
stride := mostFrequentStride(a) |
|
||||||
var v, count int |
|
||||||
for _, tv := range a { |
|
||||||
if int(tv)-v != stride { |
|
||||||
if tv != 0 { |
|
||||||
count++ |
|
||||||
} |
|
||||||
} |
|
||||||
v = int(tv) |
|
||||||
} |
|
||||||
return count |
|
||||||
} |
|
||||||
|
|
||||||
func (c *normCompacter) Size(v []uint64) (sz int, ok bool) { |
|
||||||
if n := countSparseEntries(v); n <= maxSparseEntries { |
|
||||||
return (n+1)*4 + 2, true |
|
||||||
} |
|
||||||
return 0, false |
|
||||||
} |
|
||||||
|
|
||||||
func (c *normCompacter) Store(v []uint64) uint32 { |
|
||||||
h := uint32(len(c.sparseOffset)) |
|
||||||
c.sparseBlocks = append(c.sparseBlocks, v) |
|
||||||
c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount)) |
|
||||||
c.sparseCount += countSparseEntries(v) + 1 |
|
||||||
return h |
|
||||||
} |
|
||||||
|
|
||||||
// Handler returns the compacter's name followed by "Sparse.lookup".
func (c *normCompacter) Handler() string {
	return c.name + "Sparse.lookup"
}
|
||||||
|
|
||||||
func (c *normCompacter) Print(w io.Writer) (retErr error) { |
|
||||||
p := func(f string, x ...interface{}) { |
|
||||||
if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil { |
|
||||||
retErr = err |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
ls := len(c.sparseBlocks) |
|
||||||
p("// %sSparseOffset: %d entries, %d bytes\n", c.name, ls, ls*2) |
|
||||||
p("var %sSparseOffset = %#v\n\n", c.name, c.sparseOffset) |
|
||||||
|
|
||||||
ns := c.sparseCount |
|
||||||
p("// %sSparseValues: %d entries, %d bytes\n", c.name, ns, ns*4) |
|
||||||
p("var %sSparseValues = [%d]valueRange {", c.name, ns) |
|
||||||
for i, b := range c.sparseBlocks { |
|
||||||
p("\n// Block %#x, offset %#x", i, c.sparseOffset[i]) |
|
||||||
var v int |
|
||||||
stride := mostFrequentStride(b) |
|
||||||
n := countSparseEntries(b) |
|
||||||
p("\n{value:%#04x,lo:%#02x},", stride, uint8(n)) |
|
||||||
for i, nv := range b { |
|
||||||
if int(nv)-v != stride { |
|
||||||
if v != 0 { |
|
||||||
p(",hi:%#02x},", 0x80+i-1) |
|
||||||
} |
|
||||||
if nv != 0 { |
|
||||||
p("\n{value:%#04x,lo:%#02x", nv, 0x80+i) |
|
||||||
} |
|
||||||
} |
|
||||||
v = int(nv) |
|
||||||
} |
|
||||||
if v != 0 { |
|
||||||
p(",hi:%#02x},", 0x80+len(b)-1) |
|
||||||
} |
|
||||||
} |
|
||||||
p("\n}\n\n") |
|
||||||
return |
|
||||||
} |
|
Loading…
Reference in new issue