diff options
Diffstat (limited to 'vendor/golang.org/x/text/unicode/norm/forminfo.go')
-rw-r--r-- | vendor/golang.org/x/text/unicode/norm/forminfo.go | 259 |
1 files changed, 259 insertions, 0 deletions
diff --git a/vendor/golang.org/x/text/unicode/norm/forminfo.go b/vendor/golang.org/x/text/unicode/norm/forminfo.go new file mode 100644 index 0000000..e67e765 --- /dev/null +++ b/vendor/golang.org/x/text/unicode/norm/forminfo.go | |||
@@ -0,0 +1,259 @@ | |||
1 | // Copyright 2011 The Go Authors. All rights reserved. | ||
2 | // Use of this source code is governed by a BSD-style | ||
3 | // license that can be found in the LICENSE file. | ||
4 | |||
5 | package norm | ||
6 | |||
7 | // This file contains Form-specific logic and wrappers for data in tables.go. | ||
8 | |||
9 | // Rune info is stored in a separate trie per composing form. A composing form | ||
10 | // and its corresponding decomposing form share the same trie. Each trie maps | ||
11 | // a rune to a uint16. The values take two forms. For v >= 0x8000: | ||
12 | // bits | ||
13 | // 15: 1 (inverse of NFD_QC bit of qcInfo) | ||
14 | // 13..8: qcInfo (see below). isYesD is always true (no decomposition). | ||
15 | // 7..0: ccc (compressed CCC value). | ||
16 | // For v < 0x8000, the respective rune has a decomposition and v is an index | ||
17 | // into a byte array of UTF-8 decomposition sequences and additional info and | ||
18 | // has the form: | ||
19 | // <header> <decomp_byte>* [<tccc> [<lccc>]] | ||
20 | // The header contains the number of bytes in the decomposition (excluding this | ||
21 | // length byte). The two most significant bits of this length byte correspond | ||
22 | // to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1. | ||
23 | // The byte sequence is followed by a trailing and leading CCC if the values | ||
24 | // for these are not zero. The value of v determines which ccc are appended | ||
25 | // to the sequences. For v < firstCCC, there are none, for v >= firstCCC, | ||
26 | // the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC | ||
27 | // there is an additional leading ccc. The value of tccc itself is the | ||
28 | // trailing CCC shifted left 2 bits. The two least-significant bits of tccc | ||
29 | // are the number of trailing non-starters. | ||
30 | |||
// Bit masks applied to qcInfo values and to the header byte that precedes
// each decomposition sequence.
const (
	// qcInfoMask clears all but the relevant bits in a qcInfo.
	qcInfoMask = 0x3F
	// headerLenMask extracts the length value from the header byte.
	headerLenMask = 0x3F
	// headerFlagsMask extracts the qcInfo bits from the header byte.
	headerFlagsMask = 0xC0
)
36 | |||
37 | // Properties provides access to normalization properties of a rune. | ||
38 | type Properties struct { | ||
39 | pos uint8 // start position in reorderBuffer; used in composition.go | ||
40 | size uint8 // length of UTF-8 encoding of this rune | ||
41 | ccc uint8 // leading canonical combining class (ccc if not decomposition) | ||
42 | tccc uint8 // trailing canonical combining class (ccc if not decomposition) | ||
43 | nLead uint8 // number of leading non-starters. | ||
44 | flags qcInfo // quick check flags | ||
45 | index uint16 | ||
46 | } | ||
47 | |||
48 | // functions dispatchable per form | ||
49 | type lookupFunc func(b input, i int) Properties | ||
50 | |||
51 | // formInfo holds Form-specific functions and tables. | ||
52 | type formInfo struct { | ||
53 | form Form | ||
54 | composing, compatibility bool // form type | ||
55 | info lookupFunc | ||
56 | nextMain iterFunc | ||
57 | } | ||
58 | |||
59 | var formTable = []*formInfo{{ | ||
60 | form: NFC, | ||
61 | composing: true, | ||
62 | compatibility: false, | ||
63 | info: lookupInfoNFC, | ||
64 | nextMain: nextComposed, | ||
65 | }, { | ||
66 | form: NFD, | ||
67 | composing: false, | ||
68 | compatibility: false, | ||
69 | info: lookupInfoNFC, | ||
70 | nextMain: nextDecomposed, | ||
71 | }, { | ||
72 | form: NFKC, | ||
73 | composing: true, | ||
74 | compatibility: true, | ||
75 | info: lookupInfoNFKC, | ||
76 | nextMain: nextComposed, | ||
77 | }, { | ||
78 | form: NFKD, | ||
79 | composing: false, | ||
80 | compatibility: true, | ||
81 | info: lookupInfoNFKC, | ||
82 | nextMain: nextDecomposed, | ||
83 | }} | ||
84 | |||
85 | // We do not distinguish between boundaries for NFC, NFD, etc. to avoid | ||
86 | // unexpected behavior for the user. For example, in NFD, there is a boundary | ||
87 | // after 'a'. However, 'a' might combine with modifiers, so from the application's | ||
88 | // perspective it is not a good boundary. We will therefore always use the | ||
89 | // boundaries for the combining variants. | ||
90 | |||
91 | // BoundaryBefore returns true if this rune starts a new segment and | ||
92 | // cannot combine with any rune on the left. | ||
93 | func (p Properties) BoundaryBefore() bool { | ||
94 | if p.ccc == 0 && !p.combinesBackward() { | ||
95 | return true | ||
96 | } | ||
97 | // We assume that the CCC of the first character in a decomposition | ||
98 | // is always non-zero if different from info.ccc and that we can return | ||
99 | // false at this point. This is verified by maketables. | ||
100 | return false | ||
101 | } | ||
102 | |||
103 | // BoundaryAfter returns true if runes cannot combine with or otherwise | ||
104 | // interact with this or previous runes. | ||
105 | func (p Properties) BoundaryAfter() bool { | ||
106 | // TODO: loosen these conditions. | ||
107 | return p.isInert() | ||
108 | } | ||
109 | |||
// qcInfo packs quick check data in the six low bits of a byte:
//
//	5:    Combines forward  (0 == false, 1 == true)
//	4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//	2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//	1..0: Number of trailing non-starters.
//
// When all of these bits are zero, the character is inert, meaning it is
// never influenced by normalization.
type qcInfo uint8
119 | |||
120 | func (p Properties) isYesC() bool { return p.flags&0x10 == 0 } | ||
121 | func (p Properties) isYesD() bool { return p.flags&0x4 == 0 } | ||
122 | |||
123 | func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 } | ||
124 | func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe | ||
125 | func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD | ||
126 | |||
127 | func (p Properties) isInert() bool { | ||
128 | return p.flags&qcInfoMask == 0 && p.ccc == 0 | ||
129 | } | ||
130 | |||
131 | func (p Properties) multiSegment() bool { | ||
132 | return p.index >= firstMulti && p.index < endMulti | ||
133 | } | ||
134 | |||
135 | func (p Properties) nLeadingNonStarters() uint8 { | ||
136 | return p.nLead | ||
137 | } | ||
138 | |||
139 | func (p Properties) nTrailingNonStarters() uint8 { | ||
140 | return uint8(p.flags & 0x03) | ||
141 | } | ||
142 | |||
143 | // Decomposition returns the decomposition for the underlying rune | ||
144 | // or nil if there is none. | ||
145 | func (p Properties) Decomposition() []byte { | ||
146 | // TODO: create the decomposition for Hangul? | ||
147 | if p.index == 0 { | ||
148 | return nil | ||
149 | } | ||
150 | i := p.index | ||
151 | n := decomps[i] & headerLenMask | ||
152 | i++ | ||
153 | return decomps[i : i+uint16(n)] | ||
154 | } | ||
155 | |||
156 | // Size returns the length of UTF-8 encoding of the rune. | ||
157 | func (p Properties) Size() int { | ||
158 | return int(p.size) | ||
159 | } | ||
160 | |||
161 | // CCC returns the canonical combining class of the underlying rune. | ||
162 | func (p Properties) CCC() uint8 { | ||
163 | if p.index >= firstCCCZeroExcept { | ||
164 | return 0 | ||
165 | } | ||
166 | return ccc[p.ccc] | ||
167 | } | ||
168 | |||
169 | // LeadCCC returns the CCC of the first rune in the decomposition. | ||
170 | // If there is no decomposition, LeadCCC equals CCC. | ||
171 | func (p Properties) LeadCCC() uint8 { | ||
172 | return ccc[p.ccc] | ||
173 | } | ||
174 | |||
175 | // TrailCCC returns the CCC of the last rune in the decomposition. | ||
176 | // If there is no decomposition, TrailCCC equals CCC. | ||
177 | func (p Properties) TrailCCC() uint8 { | ||
178 | return ccc[p.tccc] | ||
179 | } | ||
180 | |||
181 | // Recomposition | ||
182 | // We use 32-bit keys instead of 64-bit for the two codepoint keys. | ||
183 | // This clips off the bits of three entries, but we know this will not | ||
184 | // result in a collision. In the unlikely event that changes to | ||
185 | // UnicodeData.txt introduce collisions, the compiler will catch it. | ||
186 | // Note that the recomposition map for NFC and NFKC are identical. | ||
187 | |||
188 | // combine returns the combined rune or 0 if it doesn't exist. | ||
189 | func combine(a, b rune) rune { | ||
190 | key := uint32(uint16(a))<<16 + uint32(uint16(b)) | ||
191 | return recompMap[key] | ||
192 | } | ||
193 | |||
194 | func lookupInfoNFC(b input, i int) Properties { | ||
195 | v, sz := b.charinfoNFC(i) | ||
196 | return compInfo(v, sz) | ||
197 | } | ||
198 | |||
199 | func lookupInfoNFKC(b input, i int) Properties { | ||
200 | v, sz := b.charinfoNFKC(i) | ||
201 | return compInfo(v, sz) | ||
202 | } | ||
203 | |||
204 | // Properties returns properties for the first rune in s. | ||
205 | func (f Form) Properties(s []byte) Properties { | ||
206 | if f == NFC || f == NFD { | ||
207 | return compInfo(nfcData.lookup(s)) | ||
208 | } | ||
209 | return compInfo(nfkcData.lookup(s)) | ||
210 | } | ||
211 | |||
212 | // PropertiesString returns properties for the first rune in s. | ||
213 | func (f Form) PropertiesString(s string) Properties { | ||
214 | if f == NFC || f == NFD { | ||
215 | return compInfo(nfcData.lookupString(s)) | ||
216 | } | ||
217 | return compInfo(nfkcData.lookupString(s)) | ||
218 | } | ||
219 | |||
220 | // compInfo converts the information contained in v and sz | ||
221 | // to a Properties. See the comment at the top of the file | ||
222 | // for more information on the format. | ||
223 | func compInfo(v uint16, sz int) Properties { | ||
224 | if v == 0 { | ||
225 | return Properties{size: uint8(sz)} | ||
226 | } else if v >= 0x8000 { | ||
227 | p := Properties{ | ||
228 | size: uint8(sz), | ||
229 | ccc: uint8(v), | ||
230 | tccc: uint8(v), | ||
231 | flags: qcInfo(v >> 8), | ||
232 | } | ||
233 | if p.ccc > 0 || p.combinesBackward() { | ||
234 | p.nLead = uint8(p.flags & 0x3) | ||
235 | } | ||
236 | return p | ||
237 | } | ||
238 | // has decomposition | ||
239 | h := decomps[v] | ||
240 | f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4 | ||
241 | p := Properties{size: uint8(sz), flags: f, index: v} | ||
242 | if v >= firstCCC { | ||
243 | v += uint16(h&headerLenMask) + 1 | ||
244 | c := decomps[v] | ||
245 | p.tccc = c >> 2 | ||
246 | p.flags |= qcInfo(c & 0x3) | ||
247 | if v >= firstLeadingCCC { | ||
248 | p.nLead = c & 0x3 | ||
249 | if v >= firstStarterWithNLead { | ||
250 | // We were tricked. Remove the decomposition. | ||
251 | p.flags &= 0x03 | ||
252 | p.index = 0 | ||
253 | return p | ||
254 | } | ||
255 | p.ccc = decomps[v+1] | ||
256 | } | ||
257 | } | ||
258 | return p | ||
259 | } | ||