]>
Commit | Line | Data |
---|---|---|
15c0b25d AP |
1 | // Copyright 2011 The Go Authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style | |
3 | // license that can be found in the LICENSE file. | |
4 | ||
5 | package norm | |
6 | ||
107c1cdb ND |
7 | import "encoding/binary" |
8 | ||
15c0b25d AP |
9 | // This file contains Form-specific logic and wrappers for data in tables.go. |
10 | ||
11 | // Rune info is stored in a separate trie per composing form. A composing form | |
12 | // and its corresponding decomposing form share the same trie. Each trie maps | |
13 | // a rune to a uint16. The values take two forms. For v >= 0x8000: | |
14 | // bits | |
15 | // 15: 1 (inverse of NFD_QC bit of qcInfo) | |
16 | // 13..7: qcInfo (see below). isYesD is always true (no decompostion). | |
17 | // 6..0: ccc (compressed CCC value). | |
18 | // For v < 0x8000, the respective rune has a decomposition and v is an index | |
19 | // into a byte array of UTF-8 decomposition sequences and additional info and | |
20 | // has the form: | |
21 | // <header> <decomp_byte>* [<tccc> [<lccc>]] | |
22 | // The header contains the number of bytes in the decomposition (excluding this | |
23 | // length byte). The two most significant bits of this length byte correspond | |
24 | // to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1. | |
25 | // The byte sequence is followed by a trailing and leading CCC if the values | |
26 | // for these are not zero. The value of v determines which ccc are appended | |
27 | // to the sequences. For v < firstCCC, there are none, for v >= firstCCC, | |
28 | // the sequence is followed by a trailing ccc, and for v >= firstLeadingCC | |
29 | // there is an additional leading ccc. The value of tccc itself is the | |
30 | // trailing CCC shifted left 2 bits. The two least-significant bits of tccc | |
31 | // are the number of trailing non-starters. | |
32 | ||
33 | const ( | |
34 | qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo | |
35 | headerLenMask = 0x3F // extract the length value from the header byte | |
36 | headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte | |
37 | ) | |
38 | ||
39 | // Properties provides access to normalization properties of a rune. | |
40 | type Properties struct { | |
41 | pos uint8 // start position in reorderBuffer; used in composition.go | |
42 | size uint8 // length of UTF-8 encoding of this rune | |
43 | ccc uint8 // leading canonical combining class (ccc if not decomposition) | |
44 | tccc uint8 // trailing canonical combining class (ccc if not decomposition) | |
45 | nLead uint8 // number of leading non-starters. | |
46 | flags qcInfo // quick check flags | |
47 | index uint16 | |
48 | } | |
49 | ||
50 | // functions dispatchable per form | |
51 | type lookupFunc func(b input, i int) Properties | |
52 | ||
53 | // formInfo holds Form-specific functions and tables. | |
54 | type formInfo struct { | |
55 | form Form | |
56 | composing, compatibility bool // form type | |
57 | info lookupFunc | |
58 | nextMain iterFunc | |
59 | } | |
60 | ||
61 | var formTable = []*formInfo{{ | |
62 | form: NFC, | |
63 | composing: true, | |
64 | compatibility: false, | |
65 | info: lookupInfoNFC, | |
66 | nextMain: nextComposed, | |
67 | }, { | |
68 | form: NFD, | |
69 | composing: false, | |
70 | compatibility: false, | |
71 | info: lookupInfoNFC, | |
72 | nextMain: nextDecomposed, | |
73 | }, { | |
74 | form: NFKC, | |
75 | composing: true, | |
76 | compatibility: true, | |
77 | info: lookupInfoNFKC, | |
78 | nextMain: nextComposed, | |
79 | }, { | |
80 | form: NFKD, | |
81 | composing: false, | |
82 | compatibility: true, | |
83 | info: lookupInfoNFKC, | |
84 | nextMain: nextDecomposed, | |
85 | }} | |
86 | ||
87 | // We do not distinguish between boundaries for NFC, NFD, etc. to avoid | |
88 | // unexpected behavior for the user. For example, in NFD, there is a boundary | |
89 | // after 'a'. However, 'a' might combine with modifiers, so from the application's | |
90 | // perspective it is not a good boundary. We will therefore always use the | |
91 | // boundaries for the combining variants. | |
92 | ||
93 | // BoundaryBefore returns true if this rune starts a new segment and | |
94 | // cannot combine with any rune on the left. | |
95 | func (p Properties) BoundaryBefore() bool { | |
96 | if p.ccc == 0 && !p.combinesBackward() { | |
97 | return true | |
98 | } | |
99 | // We assume that the CCC of the first character in a decomposition | |
100 | // is always non-zero if different from info.ccc and that we can return | |
101 | // false at this point. This is verified by maketables. | |
102 | return false | |
103 | } | |
104 | ||
105 | // BoundaryAfter returns true if runes cannot combine with or otherwise | |
106 | // interact with this or previous runes. | |
107 | func (p Properties) BoundaryAfter() bool { | |
108 | // TODO: loosen these conditions. | |
109 | return p.isInert() | |
110 | } | |
111 | ||
112 | // We pack quick check data in 4 bits: | |
113 | // 5: Combines forward (0 == false, 1 == true) | |
114 | // 4..3: NFC_QC Yes(00), No (10), or Maybe (11) | |
115 | // 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition. | |
116 | // 1..0: Number of trailing non-starters. | |
117 | // | |
118 | // When all 4 bits are zero, the character is inert, meaning it is never | |
119 | // influenced by normalization. | |
120 | type qcInfo uint8 | |
121 | ||
122 | func (p Properties) isYesC() bool { return p.flags&0x10 == 0 } | |
123 | func (p Properties) isYesD() bool { return p.flags&0x4 == 0 } | |
124 | ||
125 | func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 } | |
126 | func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe | |
127 | func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD | |
128 | ||
129 | func (p Properties) isInert() bool { | |
130 | return p.flags&qcInfoMask == 0 && p.ccc == 0 | |
131 | } | |
132 | ||
133 | func (p Properties) multiSegment() bool { | |
134 | return p.index >= firstMulti && p.index < endMulti | |
135 | } | |
136 | ||
137 | func (p Properties) nLeadingNonStarters() uint8 { | |
138 | return p.nLead | |
139 | } | |
140 | ||
141 | func (p Properties) nTrailingNonStarters() uint8 { | |
142 | return uint8(p.flags & 0x03) | |
143 | } | |
144 | ||
145 | // Decomposition returns the decomposition for the underlying rune | |
146 | // or nil if there is none. | |
147 | func (p Properties) Decomposition() []byte { | |
148 | // TODO: create the decomposition for Hangul? | |
149 | if p.index == 0 { | |
150 | return nil | |
151 | } | |
152 | i := p.index | |
153 | n := decomps[i] & headerLenMask | |
154 | i++ | |
155 | return decomps[i : i+uint16(n)] | |
156 | } | |
157 | ||
158 | // Size returns the length of UTF-8 encoding of the rune. | |
159 | func (p Properties) Size() int { | |
160 | return int(p.size) | |
161 | } | |
162 | ||
163 | // CCC returns the canonical combining class of the underlying rune. | |
164 | func (p Properties) CCC() uint8 { | |
165 | if p.index >= firstCCCZeroExcept { | |
166 | return 0 | |
167 | } | |
168 | return ccc[p.ccc] | |
169 | } | |
170 | ||
171 | // LeadCCC returns the CCC of the first rune in the decomposition. | |
172 | // If there is no decomposition, LeadCCC equals CCC. | |
173 | func (p Properties) LeadCCC() uint8 { | |
174 | return ccc[p.ccc] | |
175 | } | |
176 | ||
177 | // TrailCCC returns the CCC of the last rune in the decomposition. | |
178 | // If there is no decomposition, TrailCCC equals CCC. | |
179 | func (p Properties) TrailCCC() uint8 { | |
180 | return ccc[p.tccc] | |
181 | } | |
182 | ||
107c1cdb ND |
183 | func buildRecompMap() { |
184 | recompMap = make(map[uint32]rune, len(recompMapPacked)/8) | |
185 | var buf [8]byte | |
186 | for i := 0; i < len(recompMapPacked); i += 8 { | |
187 | copy(buf[:], recompMapPacked[i:i+8]) | |
188 | key := binary.BigEndian.Uint32(buf[:4]) | |
189 | val := binary.BigEndian.Uint32(buf[4:]) | |
190 | recompMap[key] = rune(val) | |
191 | } | |
192 | } | |
193 | ||
15c0b25d AP |
194 | // Recomposition |
195 | // We use 32-bit keys instead of 64-bit for the two codepoint keys. | |
196 | // This clips off the bits of three entries, but we know this will not | |
197 | // result in a collision. In the unlikely event that changes to | |
198 | // UnicodeData.txt introduce collisions, the compiler will catch it. | |
199 | // Note that the recomposition map for NFC and NFKC are identical. | |
200 | ||
201 | // combine returns the combined rune or 0 if it doesn't exist. | |
107c1cdb ND |
202 | // |
203 | // The caller is responsible for calling | |
204 | // recompMapOnce.Do(buildRecompMap) sometime before this is called. | |
15c0b25d AP |
205 | func combine(a, b rune) rune { |
206 | key := uint32(uint16(a))<<16 + uint32(uint16(b)) | |
107c1cdb ND |
207 | if recompMap == nil { |
208 | panic("caller error") // see func comment | |
209 | } | |
15c0b25d AP |
210 | return recompMap[key] |
211 | } | |
212 | ||
213 | func lookupInfoNFC(b input, i int) Properties { | |
214 | v, sz := b.charinfoNFC(i) | |
215 | return compInfo(v, sz) | |
216 | } | |
217 | ||
218 | func lookupInfoNFKC(b input, i int) Properties { | |
219 | v, sz := b.charinfoNFKC(i) | |
220 | return compInfo(v, sz) | |
221 | } | |
222 | ||
223 | // Properties returns properties for the first rune in s. | |
224 | func (f Form) Properties(s []byte) Properties { | |
225 | if f == NFC || f == NFD { | |
226 | return compInfo(nfcData.lookup(s)) | |
227 | } | |
228 | return compInfo(nfkcData.lookup(s)) | |
229 | } | |
230 | ||
231 | // PropertiesString returns properties for the first rune in s. | |
232 | func (f Form) PropertiesString(s string) Properties { | |
233 | if f == NFC || f == NFD { | |
234 | return compInfo(nfcData.lookupString(s)) | |
235 | } | |
236 | return compInfo(nfkcData.lookupString(s)) | |
237 | } | |
238 | ||
239 | // compInfo converts the information contained in v and sz | |
240 | // to a Properties. See the comment at the top of the file | |
241 | // for more information on the format. | |
242 | func compInfo(v uint16, sz int) Properties { | |
243 | if v == 0 { | |
244 | return Properties{size: uint8(sz)} | |
245 | } else if v >= 0x8000 { | |
246 | p := Properties{ | |
247 | size: uint8(sz), | |
248 | ccc: uint8(v), | |
249 | tccc: uint8(v), | |
250 | flags: qcInfo(v >> 8), | |
251 | } | |
252 | if p.ccc > 0 || p.combinesBackward() { | |
253 | p.nLead = uint8(p.flags & 0x3) | |
254 | } | |
255 | return p | |
256 | } | |
257 | // has decomposition | |
258 | h := decomps[v] | |
259 | f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4 | |
260 | p := Properties{size: uint8(sz), flags: f, index: v} | |
261 | if v >= firstCCC { | |
262 | v += uint16(h&headerLenMask) + 1 | |
263 | c := decomps[v] | |
264 | p.tccc = c >> 2 | |
265 | p.flags |= qcInfo(c & 0x3) | |
266 | if v >= firstLeadingCCC { | |
267 | p.nLead = c & 0x3 | |
268 | if v >= firstStarterWithNLead { | |
269 | // We were tricked. Remove the decomposition. | |
270 | p.flags &= 0x03 | |
271 | p.index = 0 | |
272 | return p | |
273 | } | |
274 | p.ccc = decomps[v+1] | |
275 | } | |
276 | } | |
277 | return p | |
278 | } |