diff options
Diffstat (limited to 'vendor/golang.org/x/text/unicode/norm/maketables.go')
-rw-r--r-- | vendor/golang.org/x/text/unicode/norm/maketables.go | 976 |
1 files changed, 976 insertions, 0 deletions
diff --git a/vendor/golang.org/x/text/unicode/norm/maketables.go b/vendor/golang.org/x/text/unicode/norm/maketables.go new file mode 100644 index 0000000..338c395 --- /dev/null +++ b/vendor/golang.org/x/text/unicode/norm/maketables.go | |||
@@ -0,0 +1,976 @@ | |||
1 | // Copyright 2011 The Go Authors. All rights reserved. | ||
2 | // Use of this source code is governed by a BSD-style | ||
3 | // license that can be found in the LICENSE file. | ||
4 | |||
5 | // +build ignore | ||
6 | |||
7 | // Normalization table generator. | ||
8 | // Data read from the web. | ||
9 | // See forminfo.go for a description of the trie values associated with each rune. | ||
10 | |||
11 | package main | ||
12 | |||
13 | import ( | ||
14 | "bytes" | ||
15 | "flag" | ||
16 | "fmt" | ||
17 | "io" | ||
18 | "log" | ||
19 | "sort" | ||
20 | "strconv" | ||
21 | "strings" | ||
22 | |||
23 | "golang.org/x/text/internal/gen" | ||
24 | "golang.org/x/text/internal/triegen" | ||
25 | "golang.org/x/text/internal/ucd" | ||
26 | ) | ||
27 | |||
28 | func main() { | ||
29 | gen.Init() | ||
30 | loadUnicodeData() | ||
31 | compactCCC() | ||
32 | loadCompositionExclusions() | ||
33 | completeCharFields(FCanonical) | ||
34 | completeCharFields(FCompatibility) | ||
35 | computeNonStarterCounts() | ||
36 | verifyComputed() | ||
37 | printChars() | ||
38 | testDerived() | ||
39 | printTestdata() | ||
40 | makeTables() | ||
41 | } | ||
42 | |||
43 | var ( | ||
44 | tablelist = flag.String("tables", | ||
45 | "all", | ||
46 | "comma-separated list of which tables to generate; "+ | ||
47 | "can be 'decomp', 'recomp', 'info' and 'all'") | ||
48 | test = flag.Bool("test", | ||
49 | false, | ||
50 | "test existing tables against DerivedNormalizationProps and generate test data for regression testing") | ||
51 | verbose = flag.Bool("verbose", | ||
52 | false, | ||
53 | "write data to stdout as it is parsed") | ||
54 | ) | ||
55 | |||
56 | const MaxChar = 0x10FFFF // anything above this shouldn't exist | ||
57 | |||
58 | // Quick Check properties of runes allow us to quickly | ||
59 | // determine whether a rune may occur in a normal form. | ||
60 | // For a given normal form, a rune may be guaranteed to occur | ||
61 | // verbatim (QC=Yes), may or may not combine with another | ||
62 | // rune (QC=Maybe), or may not occur (QC=No). | ||
63 | type QCResult int | ||
64 | |||
65 | const ( | ||
66 | QCUnknown QCResult = iota | ||
67 | QCYes | ||
68 | QCNo | ||
69 | QCMaybe | ||
70 | ) | ||
71 | |||
72 | func (r QCResult) String() string { | ||
73 | switch r { | ||
74 | case QCYes: | ||
75 | return "Yes" | ||
76 | case QCNo: | ||
77 | return "No" | ||
78 | case QCMaybe: | ||
79 | return "Maybe" | ||
80 | } | ||
81 | return "***UNKNOWN***" | ||
82 | } | ||
83 | |||
84 | const ( | ||
85 | FCanonical = iota // NFC or NFD | ||
86 | FCompatibility // NFKC or NFKD | ||
87 | FNumberOfFormTypes | ||
88 | ) | ||
89 | |||
90 | const ( | ||
91 | MComposed = iota // NFC or NFKC | ||
92 | MDecomposed // NFD or NFKD | ||
93 | MNumberOfModes | ||
94 | ) | ||
95 | |||
96 | // This contains only the properties we're interested in. | ||
97 | type Char struct { | ||
98 | name string | ||
99 | codePoint rune // if zero, this index is not a valid code point. | ||
100 | ccc uint8 // canonical combining class | ||
101 | origCCC uint8 | ||
102 | excludeInComp bool // from CompositionExclusions.txt | ||
103 | compatDecomp bool // it has a compatibility expansion | ||
104 | |||
105 | nTrailingNonStarters uint8 | ||
106 | nLeadingNonStarters uint8 // must be equal to trailing if non-zero | ||
107 | |||
108 | forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility | ||
109 | |||
110 | state State | ||
111 | } | ||
112 | |||
113 | var chars = make([]Char, MaxChar+1) | ||
114 | var cccMap = make(map[uint8]uint8) | ||
115 | |||
116 | func (c Char) String() string { | ||
117 | buf := new(bytes.Buffer) | ||
118 | |||
119 | fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name) | ||
120 | fmt.Fprintf(buf, " ccc: %v\n", c.ccc) | ||
121 | fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp) | ||
122 | fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp) | ||
123 | fmt.Fprintf(buf, " state: %v\n", c.state) | ||
124 | fmt.Fprintf(buf, " NFC:\n") | ||
125 | fmt.Fprint(buf, c.forms[FCanonical]) | ||
126 | fmt.Fprintf(buf, " NFKC:\n") | ||
127 | fmt.Fprint(buf, c.forms[FCompatibility]) | ||
128 | |||
129 | return buf.String() | ||
130 | } | ||
131 | |||
132 | // In UnicodeData.txt, some ranges are marked like this: | ||
133 | // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; | ||
134 | // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; | ||
135 | // parseCharacter keeps a state variable indicating the weirdness. | ||
136 | type State int | ||
137 | |||
138 | const ( | ||
139 | SNormal State = iota // known to be zero for the type | ||
140 | SFirst | ||
141 | SLast | ||
142 | SMissing | ||
143 | ) | ||
144 | |||
145 | var lastChar = rune('\u0000') | ||
146 | |||
147 | func (c Char) isValid() bool { | ||
148 | return c.codePoint != 0 && c.state != SMissing | ||
149 | } | ||
150 | |||
151 | type FormInfo struct { | ||
152 | quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed | ||
153 | verified [MNumberOfModes]bool // index: MComposed or MDecomposed | ||
154 | |||
155 | combinesForward bool // May combine with rune on the right | ||
156 | combinesBackward bool // May combine with rune on the left | ||
157 | isOneWay bool // Never appears in result | ||
158 | inDecomp bool // Some decompositions result in this char. | ||
159 | decomp Decomposition | ||
160 | expandedDecomp Decomposition | ||
161 | } | ||
162 | |||
163 | func (f FormInfo) String() string { | ||
164 | buf := bytes.NewBuffer(make([]byte, 0)) | ||
165 | |||
166 | fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed]) | ||
167 | fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed]) | ||
168 | fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward) | ||
169 | fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward) | ||
170 | fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay) | ||
171 | fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp) | ||
172 | fmt.Fprintf(buf, " decomposition: %X\n", f.decomp) | ||
173 | fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp) | ||
174 | |||
175 | return buf.String() | ||
176 | } | ||
177 | |||
178 | type Decomposition []rune | ||
179 | |||
180 | func parseDecomposition(s string, skipfirst bool) (a []rune, err error) { | ||
181 | decomp := strings.Split(s, " ") | ||
182 | if len(decomp) > 0 && skipfirst { | ||
183 | decomp = decomp[1:] | ||
184 | } | ||
185 | for _, d := range decomp { | ||
186 | point, err := strconv.ParseUint(d, 16, 64) | ||
187 | if err != nil { | ||
188 | return a, err | ||
189 | } | ||
190 | a = append(a, rune(point)) | ||
191 | } | ||
192 | return a, nil | ||
193 | } | ||
194 | |||
195 | func loadUnicodeData() { | ||
196 | f := gen.OpenUCDFile("UnicodeData.txt") | ||
197 | defer f.Close() | ||
198 | p := ucd.New(f) | ||
199 | for p.Next() { | ||
200 | r := p.Rune(ucd.CodePoint) | ||
201 | char := &chars[r] | ||
202 | |||
203 | char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass)) | ||
204 | decmap := p.String(ucd.DecompMapping) | ||
205 | |||
206 | exp, err := parseDecomposition(decmap, false) | ||
207 | isCompat := false | ||
208 | if err != nil { | ||
209 | if len(decmap) > 0 { | ||
210 | exp, err = parseDecomposition(decmap, true) | ||
211 | if err != nil { | ||
212 | log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err) | ||
213 | } | ||
214 | isCompat = true | ||
215 | } | ||
216 | } | ||
217 | |||
218 | char.name = p.String(ucd.Name) | ||
219 | char.codePoint = r | ||
220 | char.forms[FCompatibility].decomp = exp | ||
221 | if !isCompat { | ||
222 | char.forms[FCanonical].decomp = exp | ||
223 | } else { | ||
224 | char.compatDecomp = true | ||
225 | } | ||
226 | if len(decmap) > 0 { | ||
227 | char.forms[FCompatibility].decomp = exp | ||
228 | } | ||
229 | } | ||
230 | if err := p.Err(); err != nil { | ||
231 | log.Fatal(err) | ||
232 | } | ||
233 | } | ||
234 | |||
235 | // compactCCC converts the sparse set of CCC values to a continguous one, | ||
236 | // reducing the number of bits needed from 8 to 6. | ||
237 | func compactCCC() { | ||
238 | m := make(map[uint8]uint8) | ||
239 | for i := range chars { | ||
240 | c := &chars[i] | ||
241 | m[c.ccc] = 0 | ||
242 | } | ||
243 | cccs := []int{} | ||
244 | for v, _ := range m { | ||
245 | cccs = append(cccs, int(v)) | ||
246 | } | ||
247 | sort.Ints(cccs) | ||
248 | for i, c := range cccs { | ||
249 | cccMap[uint8(i)] = uint8(c) | ||
250 | m[uint8(c)] = uint8(i) | ||
251 | } | ||
252 | for i := range chars { | ||
253 | c := &chars[i] | ||
254 | c.origCCC = c.ccc | ||
255 | c.ccc = m[c.ccc] | ||
256 | } | ||
257 | if len(m) >= 1<<6 { | ||
258 | log.Fatalf("too many difference CCC values: %d >= 64", len(m)) | ||
259 | } | ||
260 | } | ||
261 | |||
262 | // CompositionExclusions.txt has form: | ||
263 | // 0958 # ... | ||
264 | // See http://unicode.org/reports/tr44/ for full explanation | ||
265 | func loadCompositionExclusions() { | ||
266 | f := gen.OpenUCDFile("CompositionExclusions.txt") | ||
267 | defer f.Close() | ||
268 | p := ucd.New(f) | ||
269 | for p.Next() { | ||
270 | c := &chars[p.Rune(0)] | ||
271 | if c.excludeInComp { | ||
272 | log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint) | ||
273 | } | ||
274 | c.excludeInComp = true | ||
275 | } | ||
276 | if e := p.Err(); e != nil { | ||
277 | log.Fatal(e) | ||
278 | } | ||
279 | } | ||
280 | |||
281 | // hasCompatDecomp returns true if any of the recursive | ||
282 | // decompositions contains a compatibility expansion. | ||
283 | // In this case, the character may not occur in NFK*. | ||
284 | func hasCompatDecomp(r rune) bool { | ||
285 | c := &chars[r] | ||
286 | if c.compatDecomp { | ||
287 | return true | ||
288 | } | ||
289 | for _, d := range c.forms[FCompatibility].decomp { | ||
290 | if hasCompatDecomp(d) { | ||
291 | return true | ||
292 | } | ||
293 | } | ||
294 | return false | ||
295 | } | ||
296 | |||
297 | // Hangul related constants. | ||
298 | const ( | ||
299 | HangulBase = 0xAC00 | ||
300 | HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28) | ||
301 | |||
302 | JamoLBase = 0x1100 | ||
303 | JamoLEnd = 0x1113 | ||
304 | JamoVBase = 0x1161 | ||
305 | JamoVEnd = 0x1176 | ||
306 | JamoTBase = 0x11A8 | ||
307 | JamoTEnd = 0x11C3 | ||
308 | |||
309 | JamoLVTCount = 19 * 21 * 28 | ||
310 | JamoTCount = 28 | ||
311 | ) | ||
312 | |||
313 | func isHangul(r rune) bool { | ||
314 | return HangulBase <= r && r < HangulEnd | ||
315 | } | ||
316 | |||
317 | func isHangulWithoutJamoT(r rune) bool { | ||
318 | if !isHangul(r) { | ||
319 | return false | ||
320 | } | ||
321 | r -= HangulBase | ||
322 | return r < JamoLVTCount && r%JamoTCount == 0 | ||
323 | } | ||
324 | |||
325 | func ccc(r rune) uint8 { | ||
326 | return chars[r].ccc | ||
327 | } | ||
328 | |||
329 | // Insert a rune in a buffer, ordered by Canonical Combining Class. | ||
330 | func insertOrdered(b Decomposition, r rune) Decomposition { | ||
331 | n := len(b) | ||
332 | b = append(b, 0) | ||
333 | cc := ccc(r) | ||
334 | if cc > 0 { | ||
335 | // Use bubble sort. | ||
336 | for ; n > 0; n-- { | ||
337 | if ccc(b[n-1]) <= cc { | ||
338 | break | ||
339 | } | ||
340 | b[n] = b[n-1] | ||
341 | } | ||
342 | } | ||
343 | b[n] = r | ||
344 | return b | ||
345 | } | ||
346 | |||
347 | // Recursively decompose. | ||
348 | func decomposeRecursive(form int, r rune, d Decomposition) Decomposition { | ||
349 | dcomp := chars[r].forms[form].decomp | ||
350 | if len(dcomp) == 0 { | ||
351 | return insertOrdered(d, r) | ||
352 | } | ||
353 | for _, c := range dcomp { | ||
354 | d = decomposeRecursive(form, c, d) | ||
355 | } | ||
356 | return d | ||
357 | } | ||
358 | |||
359 | func completeCharFields(form int) { | ||
360 | // Phase 0: pre-expand decomposition. | ||
361 | for i := range chars { | ||
362 | f := &chars[i].forms[form] | ||
363 | if len(f.decomp) == 0 { | ||
364 | continue | ||
365 | } | ||
366 | exp := make(Decomposition, 0) | ||
367 | for _, c := range f.decomp { | ||
368 | exp = decomposeRecursive(form, c, exp) | ||
369 | } | ||
370 | f.expandedDecomp = exp | ||
371 | } | ||
372 | |||
373 | // Phase 1: composition exclusion, mark decomposition. | ||
374 | for i := range chars { | ||
375 | c := &chars[i] | ||
376 | f := &c.forms[form] | ||
377 | |||
378 | // Marks script-specific exclusions and version restricted. | ||
379 | f.isOneWay = c.excludeInComp | ||
380 | |||
381 | // Singletons | ||
382 | f.isOneWay = f.isOneWay || len(f.decomp) == 1 | ||
383 | |||
384 | // Non-starter decompositions | ||
385 | if len(f.decomp) > 1 { | ||
386 | chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0 | ||
387 | f.isOneWay = f.isOneWay || chk | ||
388 | } | ||
389 | |||
390 | // Runes that decompose into more than two runes. | ||
391 | f.isOneWay = f.isOneWay || len(f.decomp) > 2 | ||
392 | |||
393 | if form == FCompatibility { | ||
394 | f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint) | ||
395 | } | ||
396 | |||
397 | for _, r := range f.decomp { | ||
398 | chars[r].forms[form].inDecomp = true | ||
399 | } | ||
400 | } | ||
401 | |||
402 | // Phase 2: forward and backward combining. | ||
403 | for i := range chars { | ||
404 | c := &chars[i] | ||
405 | f := &c.forms[form] | ||
406 | |||
407 | if !f.isOneWay && len(f.decomp) == 2 { | ||
408 | f0 := &chars[f.decomp[0]].forms[form] | ||
409 | f1 := &chars[f.decomp[1]].forms[form] | ||
410 | if !f0.isOneWay { | ||
411 | f0.combinesForward = true | ||
412 | } | ||
413 | if !f1.isOneWay { | ||
414 | f1.combinesBackward = true | ||
415 | } | ||
416 | } | ||
417 | if isHangulWithoutJamoT(rune(i)) { | ||
418 | f.combinesForward = true | ||
419 | } | ||
420 | } | ||
421 | |||
422 | // Phase 3: quick check values. | ||
423 | for i := range chars { | ||
424 | c := &chars[i] | ||
425 | f := &c.forms[form] | ||
426 | |||
427 | switch { | ||
428 | case len(f.decomp) > 0: | ||
429 | f.quickCheck[MDecomposed] = QCNo | ||
430 | case isHangul(rune(i)): | ||
431 | f.quickCheck[MDecomposed] = QCNo | ||
432 | default: | ||
433 | f.quickCheck[MDecomposed] = QCYes | ||
434 | } | ||
435 | switch { | ||
436 | case f.isOneWay: | ||
437 | f.quickCheck[MComposed] = QCNo | ||
438 | case (i & 0xffff00) == JamoLBase: | ||
439 | f.quickCheck[MComposed] = QCYes | ||
440 | if JamoLBase <= i && i < JamoLEnd { | ||
441 | f.combinesForward = true | ||
442 | } | ||
443 | if JamoVBase <= i && i < JamoVEnd { | ||
444 | f.quickCheck[MComposed] = QCMaybe | ||
445 | f.combinesBackward = true | ||
446 | f.combinesForward = true | ||
447 | } | ||
448 | if JamoTBase <= i && i < JamoTEnd { | ||
449 | f.quickCheck[MComposed] = QCMaybe | ||
450 | f.combinesBackward = true | ||
451 | } | ||
452 | case !f.combinesBackward: | ||
453 | f.quickCheck[MComposed] = QCYes | ||
454 | default: | ||
455 | f.quickCheck[MComposed] = QCMaybe | ||
456 | } | ||
457 | } | ||
458 | } | ||
459 | |||
460 | func computeNonStarterCounts() { | ||
461 | // Phase 4: leading and trailing non-starter count | ||
462 | for i := range chars { | ||
463 | c := &chars[i] | ||
464 | |||
465 | runes := []rune{rune(i)} | ||
466 | // We always use FCompatibility so that the CGJ insertion points do not | ||
467 | // change for repeated normalizations with different forms. | ||
468 | if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 { | ||
469 | runes = exp | ||
470 | } | ||
471 | // We consider runes that combine backwards to be non-starters for the | ||
472 | // purpose of Stream-Safe Text Processing. | ||
473 | for _, r := range runes { | ||
474 | if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { | ||
475 | break | ||
476 | } | ||
477 | c.nLeadingNonStarters++ | ||
478 | } | ||
479 | for i := len(runes) - 1; i >= 0; i-- { | ||
480 | if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { | ||
481 | break | ||
482 | } | ||
483 | c.nTrailingNonStarters++ | ||
484 | } | ||
485 | if c.nTrailingNonStarters > 3 { | ||
486 | log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes) | ||
487 | } | ||
488 | |||
489 | if isHangul(rune(i)) { | ||
490 | c.nTrailingNonStarters = 2 | ||
491 | if isHangulWithoutJamoT(rune(i)) { | ||
492 | c.nTrailingNonStarters = 1 | ||
493 | } | ||
494 | } | ||
495 | |||
496 | if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t { | ||
497 | log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t) | ||
498 | } | ||
499 | if t := c.nTrailingNonStarters; t > 3 { | ||
500 | log.Fatalf("%U: number of trailing non-starters is %d > 3", t) | ||
501 | } | ||
502 | } | ||
503 | } | ||
504 | |||
505 | func printBytes(w io.Writer, b []byte, name string) { | ||
506 | fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b)) | ||
507 | fmt.Fprintf(w, "var %s = [...]byte {", name) | ||
508 | for i, c := range b { | ||
509 | switch { | ||
510 | case i%64 == 0: | ||
511 | fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63) | ||
512 | case i%8 == 0: | ||
513 | fmt.Fprintf(w, "\n") | ||
514 | } | ||
515 | fmt.Fprintf(w, "0x%.2X, ", c) | ||
516 | } | ||
517 | fmt.Fprint(w, "\n}\n\n") | ||
518 | } | ||
519 | |||
520 | // See forminfo.go for format. | ||
521 | func makeEntry(f *FormInfo, c *Char) uint16 { | ||
522 | e := uint16(0) | ||
523 | if r := c.codePoint; HangulBase <= r && r < HangulEnd { | ||
524 | e |= 0x40 | ||
525 | } | ||
526 | if f.combinesForward { | ||
527 | e |= 0x20 | ||
528 | } | ||
529 | if f.quickCheck[MDecomposed] == QCNo { | ||
530 | e |= 0x4 | ||
531 | } | ||
532 | switch f.quickCheck[MComposed] { | ||
533 | case QCYes: | ||
534 | case QCNo: | ||
535 | e |= 0x10 | ||
536 | case QCMaybe: | ||
537 | e |= 0x18 | ||
538 | default: | ||
539 | log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed]) | ||
540 | } | ||
541 | e |= uint16(c.nTrailingNonStarters) | ||
542 | return e | ||
543 | } | ||
544 | |||
545 | // decompSet keeps track of unique decompositions, grouped by whether | ||
546 | // the decomposition is followed by a trailing and/or leading CCC. | ||
547 | type decompSet [7]map[string]bool | ||
548 | |||
549 | const ( | ||
550 | normalDecomp = iota | ||
551 | firstMulti | ||
552 | firstCCC | ||
553 | endMulti | ||
554 | firstLeadingCCC | ||
555 | firstCCCZeroExcept | ||
556 | firstStarterWithNLead | ||
557 | lastDecomp | ||
558 | ) | ||
559 | |||
560 | var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"} | ||
561 | |||
562 | func makeDecompSet() decompSet { | ||
563 | m := decompSet{} | ||
564 | for i := range m { | ||
565 | m[i] = make(map[string]bool) | ||
566 | } | ||
567 | return m | ||
568 | } | ||
569 | func (m *decompSet) insert(key int, s string) { | ||
570 | m[key][s] = true | ||
571 | } | ||
572 | |||
573 | func printCharInfoTables(w io.Writer) int { | ||
574 | mkstr := func(r rune, f *FormInfo) (int, string) { | ||
575 | d := f.expandedDecomp | ||
576 | s := string([]rune(d)) | ||
577 | if max := 1 << 6; len(s) >= max { | ||
578 | const msg = "%U: too many bytes in decomposition: %d >= %d" | ||
579 | log.Fatalf(msg, r, len(s), max) | ||
580 | } | ||
581 | head := uint8(len(s)) | ||
582 | if f.quickCheck[MComposed] != QCYes { | ||
583 | head |= 0x40 | ||
584 | } | ||
585 | if f.combinesForward { | ||
586 | head |= 0x80 | ||
587 | } | ||
588 | s = string([]byte{head}) + s | ||
589 | |||
590 | lccc := ccc(d[0]) | ||
591 | tccc := ccc(d[len(d)-1]) | ||
592 | cc := ccc(r) | ||
593 | if cc != 0 && lccc == 0 && tccc == 0 { | ||
594 | log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc) | ||
595 | } | ||
596 | if tccc < lccc && lccc != 0 { | ||
597 | const msg = "%U: lccc (%d) must be <= tcc (%d)" | ||
598 | log.Fatalf(msg, r, lccc, tccc) | ||
599 | } | ||
600 | index := normalDecomp | ||
601 | nTrail := chars[r].nTrailingNonStarters | ||
602 | nLead := chars[r].nLeadingNonStarters | ||
603 | if tccc > 0 || lccc > 0 || nTrail > 0 { | ||
604 | tccc <<= 2 | ||
605 | tccc |= nTrail | ||
606 | s += string([]byte{tccc}) | ||
607 | index = endMulti | ||
608 | for _, r := range d[1:] { | ||
609 | if ccc(r) == 0 { | ||
610 | index = firstCCC | ||
611 | } | ||
612 | } | ||
613 | if lccc > 0 || nLead > 0 { | ||
614 | s += string([]byte{lccc}) | ||
615 | if index == firstCCC { | ||
616 | log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r) | ||
617 | } | ||
618 | index = firstLeadingCCC | ||
619 | } | ||
620 | if cc != lccc { | ||
621 | if cc != 0 { | ||
622 | log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc) | ||
623 | } | ||
624 | index = firstCCCZeroExcept | ||
625 | } | ||
626 | } else if len(d) > 1 { | ||
627 | index = firstMulti | ||
628 | } | ||
629 | return index, s | ||
630 | } | ||
631 | |||
632 | decompSet := makeDecompSet() | ||
633 | const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail. | ||
634 | decompSet.insert(firstStarterWithNLead, nLeadStr) | ||
635 | |||
636 | // Store the uniqued decompositions in a byte buffer, | ||
637 | // preceded by their byte length. | ||
638 | for _, c := range chars { | ||
639 | for _, f := range c.forms { | ||
640 | if len(f.expandedDecomp) == 0 { | ||
641 | continue | ||
642 | } | ||
643 | if f.combinesBackward { | ||
644 | log.Fatalf("%U: combinesBackward and decompose", c.codePoint) | ||
645 | } | ||
646 | index, s := mkstr(c.codePoint, &f) | ||
647 | decompSet.insert(index, s) | ||
648 | } | ||
649 | } | ||
650 | |||
651 | decompositions := bytes.NewBuffer(make([]byte, 0, 10000)) | ||
652 | size := 0 | ||
653 | positionMap := make(map[string]uint16) | ||
654 | decompositions.WriteString("\000") | ||
655 | fmt.Fprintln(w, "const (") | ||
656 | for i, m := range decompSet { | ||
657 | sa := []string{} | ||
658 | for s := range m { | ||
659 | sa = append(sa, s) | ||
660 | } | ||
661 | sort.Strings(sa) | ||
662 | for _, s := range sa { | ||
663 | p := decompositions.Len() | ||
664 | decompositions.WriteString(s) | ||
665 | positionMap[s] = uint16(p) | ||
666 | } | ||
667 | if cname[i] != "" { | ||
668 | fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len()) | ||
669 | } | ||
670 | } | ||
671 | fmt.Fprintln(w, "maxDecomp = 0x8000") | ||
672 | fmt.Fprintln(w, ")") | ||
673 | b := decompositions.Bytes() | ||
674 | printBytes(w, b, "decomps") | ||
675 | size += len(b) | ||
676 | |||
677 | varnames := []string{"nfc", "nfkc"} | ||
678 | for i := 0; i < FNumberOfFormTypes; i++ { | ||
679 | trie := triegen.NewTrie(varnames[i]) | ||
680 | |||
681 | for r, c := range chars { | ||
682 | f := c.forms[i] | ||
683 | d := f.expandedDecomp | ||
684 | if len(d) != 0 { | ||
685 | _, key := mkstr(c.codePoint, &f) | ||
686 | trie.Insert(rune(r), uint64(positionMap[key])) | ||
687 | if c.ccc != ccc(d[0]) { | ||
688 | // We assume the lead ccc of a decomposition !=0 in this case. | ||
689 | if ccc(d[0]) == 0 { | ||
690 | log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc) | ||
691 | } | ||
692 | } | ||
693 | } else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward { | ||
694 | // Handle cases where it can't be detected that the nLead should be equal | ||
695 | // to nTrail. | ||
696 | trie.Insert(c.codePoint, uint64(positionMap[nLeadStr])) | ||
697 | } else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 { | ||
698 | trie.Insert(c.codePoint, uint64(0x8000|v)) | ||
699 | } | ||
700 | } | ||
701 | sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]})) | ||
702 | if err != nil { | ||
703 | log.Fatal(err) | ||
704 | } | ||
705 | size += sz | ||
706 | } | ||
707 | return size | ||
708 | } | ||
709 | |||
710 | func contains(sa []string, s string) bool { | ||
711 | for _, a := range sa { | ||
712 | if a == s { | ||
713 | return true | ||
714 | } | ||
715 | } | ||
716 | return false | ||
717 | } | ||
718 | |||
719 | func makeTables() { | ||
720 | w := &bytes.Buffer{} | ||
721 | |||
722 | size := 0 | ||
723 | if *tablelist == "" { | ||
724 | return | ||
725 | } | ||
726 | list := strings.Split(*tablelist, ",") | ||
727 | if *tablelist == "all" { | ||
728 | list = []string{"recomp", "info"} | ||
729 | } | ||
730 | |||
731 | // Compute maximum decomposition size. | ||
732 | max := 0 | ||
733 | for _, c := range chars { | ||
734 | if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max { | ||
735 | max = n | ||
736 | } | ||
737 | } | ||
738 | |||
739 | fmt.Fprintln(w, "const (") | ||
740 | fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.") | ||
741 | fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion()) | ||
742 | fmt.Fprintln(w) | ||
743 | fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform") | ||
744 | fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at") | ||
745 | fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that") | ||
746 | fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.") | ||
747 | fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max) | ||
748 | fmt.Fprintln(w, ")\n") | ||
749 | |||
750 | // Print the CCC remap table. | ||
751 | size += len(cccMap) | ||
752 | fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap)) | ||
753 | for i := 0; i < len(cccMap); i++ { | ||
754 | if i%8 == 0 { | ||
755 | fmt.Fprintln(w) | ||
756 | } | ||
757 | fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)]) | ||
758 | } | ||
759 | fmt.Fprintln(w, "\n}\n") | ||
760 | |||
761 | if contains(list, "info") { | ||
762 | size += printCharInfoTables(w) | ||
763 | } | ||
764 | |||
765 | if contains(list, "recomp") { | ||
766 | // Note that we use 32 bit keys, instead of 64 bit. | ||
767 | // This clips the bits of three entries, but we know | ||
768 | // this won't cause a collision. The compiler will catch | ||
769 | // any changes made to UnicodeData.txt that introduces | ||
770 | // a collision. | ||
771 | // Note that the recomposition map for NFC and NFKC | ||
772 | // are identical. | ||
773 | |||
774 | // Recomposition map | ||
775 | nrentries := 0 | ||
776 | for _, c := range chars { | ||
777 | f := c.forms[FCanonical] | ||
778 | if !f.isOneWay && len(f.decomp) > 0 { | ||
779 | nrentries++ | ||
780 | } | ||
781 | } | ||
782 | sz := nrentries * 8 | ||
783 | size += sz | ||
784 | fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz) | ||
785 | fmt.Fprintln(w, "var recompMap = map[uint32]rune{") | ||
786 | for i, c := range chars { | ||
787 | f := c.forms[FCanonical] | ||
788 | d := f.decomp | ||
789 | if !f.isOneWay && len(d) > 0 { | ||
790 | key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1])) | ||
791 | fmt.Fprintf(w, "0x%.8X: 0x%.4X,\n", key, i) | ||
792 | } | ||
793 | } | ||
794 | fmt.Fprintf(w, "}\n\n") | ||
795 | } | ||
796 | |||
797 | fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size) | ||
798 | gen.WriteVersionedGoFile("tables.go", "norm", w.Bytes()) | ||
799 | } | ||
800 | |||
801 | func printChars() { | ||
802 | if *verbose { | ||
803 | for _, c := range chars { | ||
804 | if !c.isValid() || c.state == SMissing { | ||
805 | continue | ||
806 | } | ||
807 | fmt.Println(c) | ||
808 | } | ||
809 | } | ||
810 | } | ||
811 | |||
812 | // verifyComputed does various consistency tests. | ||
813 | func verifyComputed() { | ||
814 | for i, c := range chars { | ||
815 | for _, f := range c.forms { | ||
816 | isNo := (f.quickCheck[MDecomposed] == QCNo) | ||
817 | if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) { | ||
818 | log.Fatalf("%U: NF*D QC must be No if rune decomposes", i) | ||
819 | } | ||
820 | |||
821 | isMaybe := f.quickCheck[MComposed] == QCMaybe | ||
822 | if f.combinesBackward != isMaybe { | ||
823 | log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i) | ||
824 | } | ||
825 | if len(f.decomp) > 0 && f.combinesForward && isMaybe { | ||
826 | log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i) | ||
827 | } | ||
828 | |||
829 | if len(f.expandedDecomp) != 0 { | ||
830 | continue | ||
831 | } | ||
832 | if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b { | ||
833 | // We accept these runes to be treated differently (it only affects | ||
834 | // segment breaking in iteration, most likely on improper use), but | ||
835 | // reconsider if more characters are added. | ||
836 | // U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;; | ||
837 | // U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;; | ||
838 | // U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;; | ||
839 | // U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;; | ||
840 | // U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;; | ||
841 | // U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;; | ||
842 | if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) { | ||
843 | log.Fatalf("%U: nLead was %v; want %v", i, a, b) | ||
844 | } | ||
845 | } | ||
846 | } | ||
847 | nfc := c.forms[FCanonical] | ||
848 | nfkc := c.forms[FCompatibility] | ||
849 | if nfc.combinesBackward != nfkc.combinesBackward { | ||
850 | log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint) | ||
851 | } | ||
852 | } | ||
853 | } | ||
854 | |||
855 | // Use values in DerivedNormalizationProps.txt to compare against the | ||
856 | // values we computed. | ||
857 | // DerivedNormalizationProps.txt has form: | ||
858 | // 00C0..00C5 ; NFD_QC; N # ... | ||
859 | // 0374 ; NFD_QC; N # ... | ||
860 | // See http://unicode.org/reports/tr44/ for full explanation | ||
861 | func testDerived() { | ||
862 | f := gen.OpenUCDFile("DerivedNormalizationProps.txt") | ||
863 | defer f.Close() | ||
864 | p := ucd.New(f) | ||
865 | for p.Next() { | ||
866 | r := p.Rune(0) | ||
867 | c := &chars[r] | ||
868 | |||
869 | var ftype, mode int | ||
870 | qt := p.String(1) | ||
871 | switch qt { | ||
872 | case "NFC_QC": | ||
873 | ftype, mode = FCanonical, MComposed | ||
874 | case "NFD_QC": | ||
875 | ftype, mode = FCanonical, MDecomposed | ||
876 | case "NFKC_QC": | ||
877 | ftype, mode = FCompatibility, MComposed | ||
878 | case "NFKD_QC": | ||
879 | ftype, mode = FCompatibility, MDecomposed | ||
880 | default: | ||
881 | continue | ||
882 | } | ||
883 | var qr QCResult | ||
884 | switch p.String(2) { | ||
885 | case "Y": | ||
886 | qr = QCYes | ||
887 | case "N": | ||
888 | qr = QCNo | ||
889 | case "M": | ||
890 | qr = QCMaybe | ||
891 | default: | ||
892 | log.Fatalf(`Unexpected quick check value "%s"`, p.String(2)) | ||
893 | } | ||
894 | if got := c.forms[ftype].quickCheck[mode]; got != qr { | ||
895 | log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr) | ||
896 | } | ||
897 | c.forms[ftype].verified[mode] = true | ||
898 | } | ||
899 | if err := p.Err(); err != nil { | ||
900 | log.Fatal(err) | ||
901 | } | ||
902 | // Any unspecified value must be QCYes. Verify this. | ||
903 | for i, c := range chars { | ||
904 | for j, fd := range c.forms { | ||
905 | for k, qr := range fd.quickCheck { | ||
906 | if !fd.verified[k] && qr != QCYes { | ||
907 | m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n" | ||
908 | log.Printf(m, i, j, k, qr, c.name) | ||
909 | } | ||
910 | } | ||
911 | } | ||
912 | } | ||
913 | } | ||
914 | |||
915 | var testHeader = `const ( | ||
916 | Yes = iota | ||
917 | No | ||
918 | Maybe | ||
919 | ) | ||
920 | |||
921 | type formData struct { | ||
922 | qc uint8 | ||
923 | combinesForward bool | ||
924 | decomposition string | ||
925 | } | ||
926 | |||
927 | type runeData struct { | ||
928 | r rune | ||
929 | ccc uint8 | ||
930 | nLead uint8 | ||
931 | nTrail uint8 | ||
932 | f [2]formData // 0: canonical; 1: compatibility | ||
933 | } | ||
934 | |||
935 | func f(qc uint8, cf bool, dec string) [2]formData { | ||
936 | return [2]formData{{qc, cf, dec}, {qc, cf, dec}} | ||
937 | } | ||
938 | |||
939 | func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData { | ||
940 | return [2]formData{{qc, cf, d}, {qck, cfk, dk}} | ||
941 | } | ||
942 | |||
943 | var testData = []runeData{ | ||
944 | ` | ||
945 | |||
946 | func printTestdata() { | ||
947 | type lastInfo struct { | ||
948 | ccc uint8 | ||
949 | nLead uint8 | ||
950 | nTrail uint8 | ||
951 | f string | ||
952 | } | ||
953 | |||
954 | last := lastInfo{} | ||
955 | w := &bytes.Buffer{} | ||
956 | fmt.Fprintf(w, testHeader) | ||
957 | for r, c := range chars { | ||
958 | f := c.forms[FCanonical] | ||
959 | qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) | ||
960 | f = c.forms[FCompatibility] | ||
961 | qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) | ||
962 | s := "" | ||
963 | if d == dk && qc == qck && cf == cfk { | ||
964 | s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d) | ||
965 | } else { | ||
966 | s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk) | ||
967 | } | ||
968 | current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s} | ||
969 | if last != current { | ||
970 | fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s) | ||
971 | last = current | ||
972 | } | ||
973 | } | ||
974 | fmt.Fprintln(w, "}") | ||
975 | gen.WriteVersionedGoFile("data_test.go", "norm", w.Bytes()) | ||
976 | } | ||