You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							219 lines
						
					
					
						
							5.4 KiB
						
					
					
				
			
		
		
	
	
							219 lines
						
					
					
						
							5.4 KiB
						
					
					
				| //  Copyright (c) 2015 Couchbase, Inc.
 | |
| //  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
 | |
| //  except in compliance with the License. You may obtain a copy of the License at
 | |
| //    http://www.apache.org/licenses/LICENSE-2.0
 | |
| //  Unless required by applicable law or agreed to in writing, software distributed under the
 | |
| //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 | |
| //  either express or implied. See the License for the specific language governing permissions
 | |
| //  and limitations under the License.
 | |
| 
 | |
| // +build ignore
 | |
| 
 | |
| package main
 | |
| 
 | |
| import (
 | |
| 	"bufio"
 | |
| 	"bytes"
 | |
| 	"flag"
 | |
| 	"fmt"
 | |
| 	"io"
 | |
| 	"log"
 | |
| 	"net/http"
 | |
| 	"os"
 | |
| 	"os/exec"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 	"unicode"
 | |
| )
 | |
| 
 | |
// Command-line flags. They are registered at package initialization and
// hold their final values once main has called flag.Parse.
var (
	// url is the base directory of the Unicode data files; the file name
	// is appended to it directly, so it must end in a slash.
	url = flag.String("url", "http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/", "URL of Unicode database directory")

	// verbose is registered for the command line.
	// NOTE(review): no code visible in this file reads it — confirm whether it is still needed.
	verbose = flag.Bool("verbose", false, "write data to stdout as it is parsed")

	// localFiles makes openReader read from the current directory instead
	// of downloading from url.
	localFiles = flag.Bool("local", false, "data files have been copied to the current directory; for debugging only")

	// outputFile names the destination for the generated tables; when it
	// is empty the tables go to standard output.
	outputFile = flag.String("output", "", "output file for generated tables; default stdout")
)

// output is the buffered destination for all generated text. It is assigned
// by setupOutput and drained by flushOutput.
var output *bufio.Writer
 | |
| 
 | |
| func main() {
 | |
| 	flag.Parse()
 | |
| 	setupOutput()
 | |
| 
 | |
| 	graphemeTests := make([]test, 0)
 | |
| 	graphemeComments := make([]string, 0)
 | |
| 	graphemeTests, graphemeComments = loadUnicodeData("GraphemeBreakTest.txt", graphemeTests, graphemeComments)
 | |
| 	wordTests := make([]test, 0)
 | |
| 	wordComments := make([]string, 0)
 | |
| 	wordTests, wordComments = loadUnicodeData("WordBreakTest.txt", wordTests, wordComments)
 | |
| 	sentenceTests := make([]test, 0)
 | |
| 	sentenceComments := make([]string, 0)
 | |
| 	sentenceTests, sentenceComments = loadUnicodeData("SentenceBreakTest.txt", sentenceTests, sentenceComments)
 | |
| 
 | |
| 	fmt.Fprintf(output, fileHeader, *url)
 | |
| 	generateTestTables("Grapheme", graphemeTests, graphemeComments)
 | |
| 	generateTestTables("Word", wordTests, wordComments)
 | |
| 	generateTestTables("Sentence", sentenceTests, sentenceComments)
 | |
| 
 | |
| 	flushOutput()
 | |
| }
 | |
| 
 | |
| // WordBreakProperty.txt has the form:
 | |
| // 05F0..05F2    ; Hebrew_Letter # Lo   [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
 | |
| // FB1D          ; Hebrew_Letter # Lo       HEBREW LETTER YOD WITH HIRIQ
 | |
| func openReader(file string) (input io.ReadCloser) {
 | |
| 	if *localFiles {
 | |
| 		f, err := os.Open(file)
 | |
| 		if err != nil {
 | |
| 			log.Fatal(err)
 | |
| 		}
 | |
| 		input = f
 | |
| 	} else {
 | |
| 		path := *url + file
 | |
| 		resp, err := http.Get(path)
 | |
| 		if err != nil {
 | |
| 			log.Fatal(err)
 | |
| 		}
 | |
| 		if resp.StatusCode != 200 {
 | |
| 			log.Fatal("bad GET status for "+file, resp.Status)
 | |
| 		}
 | |
| 		input = resp.Body
 | |
| 	}
 | |
| 	return
 | |
| }
 | |
| 
 | |
| func loadUnicodeData(filename string, tests []test, comments []string) ([]test, []string) {
 | |
| 	f := openReader(filename)
 | |
| 	defer f.Close()
 | |
| 	bufioReader := bufio.NewReader(f)
 | |
| 	line, err := bufioReader.ReadString('\n')
 | |
| 	for err == nil {
 | |
| 		tests, comments = parseLine(line, tests, comments)
 | |
| 		line, err = bufioReader.ReadString('\n')
 | |
| 	}
 | |
| 	// if the err was EOF still need to process last value
 | |
| 	if err == io.EOF {
 | |
| 		tests, comments = parseLine(line, tests, comments)
 | |
| 	}
 | |
| 	return tests, comments
 | |
| }
 | |
| 
 | |
// Markers recognized by parseLine in the break test data files.
const (
	comment = "#" // starts a comment that runs to the end of the line
	brk     = "÷" // separates two segments
	nbrk    = "×" // joins code points inside one segment
)

// test is one parsed test case: its segments in order, each holding the
// UTF-8 encoding of the segment's code points.
type test [][]byte

// parseLine parses one line of a break test file and, if it holds a test
// case, appends the case to tests and its trailing comment to comments.
//
// A data line is a sequence of hexadecimal code points separated by brk and
// nbrk markers, optionally followed by a comment, for example:
//
//	÷ 0061 × 0308 ÷ 0062 ÷	#  ÷ [0.2] LATIN SMALL LETTER A ...
//
// Blank lines and comment-only lines are skipped. A line containing a token
// that is not a valid hexadecimal code point is logged and skipped. In both
// cases tests and comments are returned unchanged, so the two slices always
// stay the same length. A data line without a comment records an empty
// comment string.
func parseLine(line string, tests []test, comments []string) ([]test, []string) {
	line = strings.TrimSpace(line)
	if len(line) == 0 || strings.HasPrefix(line, comment) {
		return tests, comments
	}

	// Split off the trailing comment first: it may itself contain brk and
	// nbrk markers that must not be treated as data.
	note := ""
	if i := strings.Index(line, comment); i >= 0 {
		note = strings.TrimSpace(line[i+len(comment):])
		line = line[:i]
	}

	var segments test
	for _, piece := range strings.Split(line, brk) {
		piece = strings.TrimSpace(piece)
		if len(piece) == 0 {
			continue
		}
		var segment []byte
		for _, codePoint := range strings.Split(piece, nbrk) {
			codePoint = strings.TrimSpace(codePoint)
			// Bit size 32 keeps the value within rune range, so the
			// conversion below cannot truncate.
			r, err := strconv.ParseInt(codePoint, 16, 32)
			if err != nil {
				log.Printf("err: %v for '%s'", err, codePoint)
				return tests, comments
			}
			segment = append(segment, string(rune(r))...)
		}
		segments = append(segments, segment)
	}

	return append(tests, segments), append(comments, note)
}
 | |
| 
 | |
| func generateTestTables(prefix string, tests []test, comments []string) {
 | |
| 	fmt.Fprintf(output, testHeader, prefix)
 | |
| 	for i, t := range tests {
 | |
| 		fmt.Fprintf(output, "\t\t{\n")
 | |
| 		fmt.Fprintf(output, "\t\t\tinput: %#v,\n", bytes.Join(t, []byte{}))
 | |
| 		fmt.Fprintf(output, "\t\t\toutput: %s,\n", generateTest(t))
 | |
| 		fmt.Fprintf(output, "\t\t\tcomment: `%s`,\n", comments[i])
 | |
| 		fmt.Fprintf(output, "\t\t},\n")
 | |
| 	}
 | |
| 	fmt.Fprintf(output, "}\n")
 | |
| }
 | |
| 
 | |
| func generateTest(t test) string {
 | |
| 	rv := "[][]byte{"
 | |
| 	for _, te := range t {
 | |
| 		rv += fmt.Sprintf("%#v,", te)
 | |
| 	}
 | |
| 	rv += "}"
 | |
| 	return rv
 | |
| }
 | |
| 
 | |
// fileHeader is the preamble of the generated file. It is a format string
// with a single %s verb, which main fills with the value of the -url flag.
const fileHeader = `// Generated by running
//      maketesttables --url=%s
// DO NOT EDIT

package segment
`
 | |
| 
 | |
// testHeader opens the declaration of one generated test table. It is a
// format string whose single %s verb receives the prefix passed to
// generateTestTables, which also writes the entries and the closing brace.
const testHeader = `var unicode%sTests = []struct {
		input  []byte
		output [][]byte
		comment string
	}{
`
 | |
| 
 | |
| func setupOutput() {
 | |
| 	output = bufio.NewWriter(startGofmt())
 | |
| }
 | |
| 
 | |
| // startGofmt connects output to a gofmt process if -output is set.
 | |
| func startGofmt() io.Writer {
 | |
| 	if *outputFile == "" {
 | |
| 		return os.Stdout
 | |
| 	}
 | |
| 	stdout, err := os.Create(*outputFile)
 | |
| 	if err != nil {
 | |
| 		log.Fatal(err)
 | |
| 	}
 | |
| 	// Pipe output to gofmt.
 | |
| 	gofmt := exec.Command("gofmt")
 | |
| 	fd, err := gofmt.StdinPipe()
 | |
| 	if err != nil {
 | |
| 		log.Fatal(err)
 | |
| 	}
 | |
| 	gofmt.Stdout = stdout
 | |
| 	gofmt.Stderr = os.Stderr
 | |
| 	err = gofmt.Start()
 | |
| 	if err != nil {
 | |
| 		log.Fatal(err)
 | |
| 	}
 | |
| 	return fd
 | |
| }
 | |
| 
 | |
| func flushOutput() {
 | |
| 	err := output.Flush()
 | |
| 	if err != nil {
 | |
| 		log.Fatal(err)
 | |
| 	}
 | |
| }
 | |
| 
 |