aboutsummaryrefslogtreecommitdiffhomepage
path: root/vendor/github.com/apparentlymart/go-textseg/textseg/make_tables.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/apparentlymart/go-textseg/textseg/make_tables.go')
-rw-r--r--vendor/github.com/apparentlymart/go-textseg/textseg/make_tables.go307
1 files changed, 307 insertions, 0 deletions
diff --git a/vendor/github.com/apparentlymart/go-textseg/textseg/make_tables.go b/vendor/github.com/apparentlymart/go-textseg/textseg/make_tables.go
new file mode 100644
index 0000000..aad3d05
--- /dev/null
+++ b/vendor/github.com/apparentlymart/go-textseg/textseg/make_tables.go
@@ -0,0 +1,307 @@
1// Copyright (c) 2014 Couchbase, Inc.
2// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
3// except in compliance with the License. You may obtain a copy of the License at
4// http://www.apache.org/licenses/LICENSE-2.0
5// Unless required by applicable law or agreed to in writing, software distributed under the
6// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
7// either express or implied. See the License for the specific language governing permissions
8// and limitations under the License.
9
10// Modified by Martin Atkins to serve the needs of package textseg.
11
12// +build ignore
13
14package main
15
16import (
17 "bufio"
18 "flag"
19 "fmt"
20 "io"
21 "log"
22 "net/http"
23 "os"
24 "os/exec"
25 "sort"
26 "strconv"
27 "strings"
28 "unicode"
29)
30
// Command-line flags and shared output state for the table generator.
var (
	// url is the base URL the Unicode data files are fetched from; the file
	// name is appended directly to it.
	url = flag.String("url", "http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/", "URL of Unicode database directory")

	// verbose is registered as -verbose.
	// NOTE(review): nothing in the visible source reads this flag.
	verbose = flag.Bool("verbose", false, "write data to stdout as it is parsed")

	// localFiles makes the generator read the data files from disk instead
	// of downloading them.
	localFiles = flag.Bool("local", false, "data files have been copied to the current directory; for debugging only")

	// outputFile names the file to write; empty means standard output.
	outputFile = flag.String("output", "", "output file for generated tables; default stdout")

	// output is the buffered writer that all generated code is written to.
	// It is initialised by setupOutput.
	output *bufio.Writer
)
45
46func main() {
47 flag.Parse()
48 setupOutput()
49
50 graphemePropertyRanges := make(map[string]*unicode.RangeTable)
51 loadUnicodeData("GraphemeBreakProperty.txt", graphemePropertyRanges)
52 wordPropertyRanges := make(map[string]*unicode.RangeTable)
53 loadUnicodeData("WordBreakProperty.txt", wordPropertyRanges)
54 sentencePropertyRanges := make(map[string]*unicode.RangeTable)
55 loadUnicodeData("SentenceBreakProperty.txt", sentencePropertyRanges)
56
57 fmt.Fprintf(output, fileHeader, *url)
58 generateTables("Grapheme", graphemePropertyRanges)
59 generateTables("Word", wordPropertyRanges)
60 generateTables("Sentence", sentencePropertyRanges)
61
62 flushOutput()
63}
64
65// WordBreakProperty.txt has the form:
66// 05F0..05F2 ; Hebrew_Letter # Lo [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
67// FB1D ; Hebrew_Letter # Lo HEBREW LETTER YOD WITH HIRIQ
68func openReader(file string) (input io.ReadCloser) {
69 if *localFiles {
70 f, err := os.Open(file)
71 if err != nil {
72 log.Fatal(err)
73 }
74 input = f
75 } else {
76 path := *url + file
77 resp, err := http.Get(path)
78 if err != nil {
79 log.Fatal(err)
80 }
81 if resp.StatusCode != 200 {
82 log.Fatal("bad GET status for "+file, resp.Status)
83 }
84 input = resp.Body
85 }
86 return
87}
88
89func loadUnicodeData(filename string, propertyRanges map[string]*unicode.RangeTable) {
90 f := openReader(filename)
91 defer f.Close()
92 bufioReader := bufio.NewReader(f)
93 line, err := bufioReader.ReadString('\n')
94 for err == nil {
95 parseLine(line, propertyRanges)
96 line, err = bufioReader.ReadString('\n')
97 }
98 // if the err was EOF still need to process last value
99 if err == io.EOF {
100 parseLine(line, propertyRanges)
101 }
102}
103
// Delimiters of the Unicode data file syntax, as used by parseLine.
const (
	comment = "#"  // starts a comment that runs to the end of the line
	sep     = ";"  // separates the code point field from the property name
	rnge    = ".." // separates the first and last code point of a range
)
107
108func parseLine(line string, propertyRanges map[string]*unicode.RangeTable) {
109 if strings.HasPrefix(line, comment) {
110 return
111 }
112 line = strings.TrimSpace(line)
113 if len(line) == 0 {
114 return
115 }
116 commentStart := strings.Index(line, comment)
117 if commentStart > 0 {
118 line = line[0:commentStart]
119 }
120 pieces := strings.Split(line, sep)
121 if len(pieces) != 2 {
122 log.Printf("unexpected %d pieces in %s", len(pieces), line)
123 return
124 }
125
126 propertyName := strings.TrimSpace(pieces[1])
127
128 rangeTable, ok := propertyRanges[propertyName]
129 if !ok {
130 rangeTable = &unicode.RangeTable{
131 LatinOffset: 0,
132 }
133 propertyRanges[propertyName] = rangeTable
134 }
135
136 codepointRange := strings.TrimSpace(pieces[0])
137 rngeIndex := strings.Index(codepointRange, rnge)
138
139 if rngeIndex < 0 {
140 // single codepoint, not range
141 codepointInt, err := strconv.ParseUint(codepointRange, 16, 64)
142 if err != nil {
143 log.Printf("error parsing int: %v", err)
144 return
145 }
146 if codepointInt < 0x10000 {
147 r16 := unicode.Range16{
148 Lo: uint16(codepointInt),
149 Hi: uint16(codepointInt),
150 Stride: 1,
151 }
152 addR16ToTable(rangeTable, r16)
153 } else {
154 r32 := unicode.Range32{
155 Lo: uint32(codepointInt),
156 Hi: uint32(codepointInt),
157 Stride: 1,
158 }
159 addR32ToTable(rangeTable, r32)
160 }
161 } else {
162 rngeStart := codepointRange[0:rngeIndex]
163 rngeEnd := codepointRange[rngeIndex+2:]
164 rngeStartInt, err := strconv.ParseUint(rngeStart, 16, 64)
165 if err != nil {
166 log.Printf("error parsing int: %v", err)
167 return
168 }
169 rngeEndInt, err := strconv.ParseUint(rngeEnd, 16, 64)
170 if err != nil {
171 log.Printf("error parsing int: %v", err)
172 return
173 }
174 if rngeStartInt < 0x10000 && rngeEndInt < 0x10000 {
175 r16 := unicode.Range16{
176 Lo: uint16(rngeStartInt),
177 Hi: uint16(rngeEndInt),
178 Stride: 1,
179 }
180 addR16ToTable(rangeTable, r16)
181 } else if rngeStartInt >= 0x10000 && rngeEndInt >= 0x10000 {
182 r32 := unicode.Range32{
183 Lo: uint32(rngeStartInt),
184 Hi: uint32(rngeEndInt),
185 Stride: 1,
186 }
187 addR32ToTable(rangeTable, r32)
188 } else {
189 log.Printf("unexpected range")
190 }
191 }
192}
193
// addR16ToTable appends r16 to the table's 16-bit ranges and bumps
// LatinOffset when the whole range lies at or below unicode.MaxLatin1.
func addR16ToTable(r *unicode.RangeTable, r16 unicode.Range16) {
	// append allocates the slice on first use, so no explicit make is needed.
	r.R16 = append(r.R16, r16)
	if r16.Hi > unicode.MaxLatin1 {
		return
	}
	r.LatinOffset++
}
203
// addR32ToTable appends r32 to the table's 32-bit ranges.
func addR32ToTable(r *unicode.RangeTable, r32 unicode.Range32) {
	// append allocates the slice on first use, so no explicit make is needed.
	r.R32 = append(r.R32, r32)
}
210
211func generateTables(prefix string, propertyRanges map[string]*unicode.RangeTable) {
212 prNames := make([]string, 0, len(propertyRanges))
213 for k := range propertyRanges {
214 prNames = append(prNames, k)
215 }
216 sort.Strings(prNames)
217 for _, key := range prNames {
218 rt := propertyRanges[key]
219 fmt.Fprintf(output, "var _%s%s = %s\n", prefix, key, generateRangeTable(rt))
220 }
221 fmt.Fprintf(output, "type _%sRuneRange unicode.RangeTable\n", prefix)
222
223 fmt.Fprintf(output, "func _%sRuneType(r rune) *_%sRuneRange {\n", prefix, prefix)
224 fmt.Fprintf(output, "\tswitch {\n")
225 for _, key := range prNames {
226 fmt.Fprintf(output, "\tcase unicode.Is(_%s%s, r):\n\t\treturn (*_%sRuneRange)(_%s%s)\n", prefix, key, prefix, prefix, key)
227 }
228 fmt.Fprintf(output, "\tdefault:\n\t\treturn nil\n")
229 fmt.Fprintf(output, "\t}\n")
230 fmt.Fprintf(output, "}\n")
231
232 fmt.Fprintf(output, "func (rng *_%sRuneRange) String() string {\n", prefix)
233 fmt.Fprintf(output, "\tswitch (*unicode.RangeTable)(rng) {\n")
234 for _, key := range prNames {
235 fmt.Fprintf(output, "\tcase _%s%s:\n\t\treturn %q\n", prefix, key, key)
236 }
237 fmt.Fprintf(output, "\tdefault:\n\t\treturn \"Other\"\n")
238 fmt.Fprintf(output, "\t}\n")
239 fmt.Fprintf(output, "}\n")
240}
241
// generateRangeTable returns Go source for a *unicode.RangeTable composite
// literal equivalent to rt. The R16 and R32 fields are emitted only when the
// corresponding slice is non-nil; LatinOffset is always emitted.
//
// The text is assembled in a strings.Builder so that the cost stays linear
// in the number of ranges.
func generateRangeTable(rt *unicode.RangeTable) string {
	var b strings.Builder
	b.WriteString("&unicode.RangeTable{\n")
	if rt.R16 != nil {
		b.WriteString("\tR16: []unicode.Range16{\n")
		for _, r16 := range rt.R16 {
			// %#v prints the range as a Go composite literal.
			fmt.Fprintf(&b, "\t\t%#v,\n", r16)
		}
		b.WriteString("\t},\n")
	}
	if rt.R32 != nil {
		b.WriteString("\tR32: []unicode.Range32{\n")
		for _, r32 := range rt.R32 {
			fmt.Fprintf(&b, "\t\t%#v,\n", r32)
		}
		b.WriteString("\t},\n")
	}
	fmt.Fprintf(&b, "\t\tLatinOffset: %d,\n", rt.LatinOffset)
	b.WriteString("}\n")
	return b.String()
}
262
// fileHeader is the preamble written at the top of the generated file. It is
// a fmt format string: main passes the -url flag value for its single %s
// verb. After the "generated" notice it supplies the package clause and the
// import of "unicode" that the generated tables refer to.
const fileHeader = `// Generated by running
// maketables --url=%s
// DO NOT EDIT

package textseg

import(
	"unicode"
)
`
273
274func setupOutput() {
275 output = bufio.NewWriter(startGofmt())
276}
277
278// startGofmt connects output to a gofmt process if -output is set.
279func startGofmt() io.Writer {
280 if *outputFile == "" {
281 return os.Stdout
282 }
283 stdout, err := os.Create(*outputFile)
284 if err != nil {
285 log.Fatal(err)
286 }
287 // Pipe output to gofmt.
288 gofmt := exec.Command("gofmt")
289 fd, err := gofmt.StdinPipe()
290 if err != nil {
291 log.Fatal(err)
292 }
293 gofmt.Stdout = stdout
294 gofmt.Stderr = os.Stderr
295 err = gofmt.Start()
296 if err != nil {
297 log.Fatal(err)
298 }
299 return fd
300}
301
302func flushOutput() {
303 err := output.Flush()
304 if err != nil {
305 log.Fatal(err)
306 }
307}