aboutsummaryrefslogtreecommitdiffhomepage
path: root/vendor/github.com/apparentlymart/go-textseg/textseg/make_test_tables.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/apparentlymart/go-textseg/textseg/make_test_tables.go')
-rw-r--r--vendor/github.com/apparentlymart/go-textseg/textseg/make_test_tables.go212
1 files changed, 212 insertions, 0 deletions
diff --git a/vendor/github.com/apparentlymart/go-textseg/textseg/make_test_tables.go b/vendor/github.com/apparentlymart/go-textseg/textseg/make_test_tables.go
new file mode 100644
index 0000000..ac42002
--- /dev/null
+++ b/vendor/github.com/apparentlymart/go-textseg/textseg/make_test_tables.go
@@ -0,0 +1,212 @@
1// Copyright (c) 2014 Couchbase, Inc.
2// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3// except in compliance with the License. You may obtain a copy of the License at
4// http://www.apache.org/licenses/LICENSE-2.0
5// Unless required by applicable law or agreed to in writing, software distributed under the
6// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7// either express or implied. See the License for the specific language governing permissions
8// and limitations under the License.
9
10// +build ignore
11
12package main
13
14import (
15 "bufio"
16 "bytes"
17 "flag"
18 "fmt"
19 "io"
20 "log"
21 "net/http"
22 "os"
23 "os/exec"
24 "strconv"
25 "strings"
26 "unicode"
27)
28
// Command-line flags selecting where the Unicode test data comes from and
// where the generated Go source goes.
var (
	// url is the base location the data files are fetched from; openReader
	// appends the file name to it. The default embeds unicode.Version.
	url = flag.String(
		"url",
		"http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/",
		"URL of Unicode database directory",
	)

	// verbose is registered for command-line compatibility; nothing in this
	// file reads it.
	verbose = flag.Bool(
		"verbose",
		false,
		"write data to stdout as it is parsed",
	)

	// localFiles makes openReader read the data files from disk instead of
	// fetching them over HTTP.
	localFiles = flag.Bool(
		"local",
		false,
		"data files have been copied to the current directory; for debugging only",
	)

	// outputFile names the file that receives the generated tables; when it
	// is empty the tables are written to stdout.
	outputFile = flag.String(
		"output",
		"",
		"output file for generated tables; default stdout",
	)
)

// output is the buffered destination for all generated source. It is
// assigned by setupOutput and flushed by flushOutput.
var output *bufio.Writer
44
45func main() {
46 flag.Parse()
47 setupOutput()
48
49 graphemeTests := make([]test, 0)
50 graphemeTests = loadUnicodeData("GraphemeBreakTest.txt", graphemeTests)
51 wordTests := make([]test, 0)
52 wordTests = loadUnicodeData("WordBreakTest.txt", wordTests)
53 sentenceTests := make([]test, 0)
54 sentenceTests = loadUnicodeData("SentenceBreakTest.txt", sentenceTests)
55
56 fmt.Fprintf(output, fileHeader, *url)
57 generateTestTables("Grapheme", graphemeTests)
58 generateTestTables("Word", wordTests)
59 generateTestTables("Sentence", sentenceTests)
60
61 flushOutput()
62}
63
64// WordBreakProperty.txt has the form:
65// 05F0..05F2 ; Hebrew_Letter # Lo [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
66// FB1D ; Hebrew_Letter # Lo HEBREW LETTER YOD WITH HIRIQ
67func openReader(file string) (input io.ReadCloser) {
68 if *localFiles {
69 f, err := os.Open(file)
70 if err != nil {
71 log.Fatal(err)
72 }
73 input = f
74 } else {
75 path := *url + file
76 resp, err := http.Get(path)
77 if err != nil {
78 log.Fatal(err)
79 }
80 if resp.StatusCode != 200 {
81 log.Fatal("bad GET status for "+file, resp.Status)
82 }
83 input = resp.Body
84 }
85 return
86}
87
88func loadUnicodeData(filename string, tests []test) []test {
89 f := openReader(filename)
90 defer f.Close()
91 bufioReader := bufio.NewReader(f)
92 line, err := bufioReader.ReadString('\n')
93 for err == nil {
94 tests = parseLine(line, tests)
95 line, err = bufioReader.ReadString('\n')
96 }
97 // if the err was EOF still need to process last value
98 if err == io.EOF {
99 tests = parseLine(line, tests)
100 }
101 return tests
102}
103
// Markers recognised by parseLine in the break test files.
const (
	comment = "#" // everything from this marker to end of line is ignored
	brk     = "÷" // separates one expected segment from the next
	nbrk    = "×" // separates code points that belong to the same segment
)

// test is a single test case: the expected segments, in input order, each
// stored as UTF-8 bytes.
type test [][]byte

// parseLine parses one line of a break test file and, when the line holds a
// test case, appends it to tests and returns the extended slice.
//
// A data line looks like "÷ 0020 × 0308 ÷ 0020 ÷ # ...": hexadecimal code
// points separated by brk and nbrk markers, optionally followed by a
// comment. Blank lines and comment-only lines (indented or not) are skipped.
// If a code point is not a valid hexadecimal number of at most 32 bits, the
// offending token is logged and tests is returned unchanged.
func parseLine(line string, tests []test) []test {
	// Strip the comment before anything else: it may itself contain brk and
	// nbrk markers, and it may follow leading whitespace.
	if i := strings.Index(line, comment); i >= 0 {
		line = line[:i]
	}
	line = strings.TrimSpace(line)
	if line == "" {
		return tests
	}

	t := test{}
	for _, piece := range strings.Split(line, brk) {
		piece = strings.TrimSpace(piece)
		if piece == "" {
			// Lines start and end with a brk marker, which yields empty
			// pieces at both ends.
			continue
		}
		var word strings.Builder
		for _, codePoint := range strings.Split(piece, nbrk) {
			codePoint = strings.TrimSpace(codePoint)
			r, err := strconv.ParseUint(codePoint, 16, 32)
			if err != nil {
				log.Printf("err: %v for %q", err, codePoint)
				return tests
			}
			// WriteRune encodes values that are not valid runes (such as
			// surrogates) as U+FFFD.
			word.WriteRune(rune(r))
		}
		t = append(t, []byte(word.String()))
	}
	return append(tests, t)
}
145
146func generateTestTables(prefix string, tests []test) {
147 fmt.Fprintf(output, testHeader, prefix)
148 for _, t := range tests {
149 fmt.Fprintf(output, "\t\t{\n")
150 fmt.Fprintf(output, "\t\t\tinput: %#v,\n", bytes.Join(t, []byte{}))
151 fmt.Fprintf(output, "\t\t\toutput: %s,\n", generateTest(t))
152 fmt.Fprintf(output, "\t\t},\n")
153 }
154 fmt.Fprintf(output, "}\n")
155}
156
157func generateTest(t test) string {
158 rv := "[][]byte{"
159 for _, te := range t {
160 rv += fmt.Sprintf("%#v,", te)
161 }
162 rv += "}"
163 return rv
164}
165
166const fileHeader = `// Generated by running
167// maketesttables --url=%s
168// DO NOT EDIT
169
170package textseg
171`
172
173const testHeader = `var unicode%sTests = []struct {
174 input []byte
175 output [][]byte
176 }{
177`
178
179func setupOutput() {
180 output = bufio.NewWriter(startGofmt())
181}
182
183// startGofmt connects output to a gofmt process if -output is set.
184func startGofmt() io.Writer {
185 if *outputFile == "" {
186 return os.Stdout
187 }
188 stdout, err := os.Create(*outputFile)
189 if err != nil {
190 log.Fatal(err)
191 }
192 // Pipe output to gofmt.
193 gofmt := exec.Command("gofmt")
194 fd, err := gofmt.StdinPipe()
195 if err != nil {
196 log.Fatal(err)
197 }
198 gofmt.Stdout = stdout
199 gofmt.Stderr = os.Stderr
200 err = gofmt.Start()
201 if err != nil {
202 log.Fatal(err)
203 }
204 return fd
205}
206
207func flushOutput() {
208 err := output.Flush()
209 if err != nil {
210 log.Fatal(err)
211 }
212}