]>
Commit | Line | Data |
---|---|---|
1 | // Copyright 2011 The Go Authors. All rights reserved. | |
2 | // Use of this source code is governed by a BSD-style | |
3 | // license that can be found in the LICENSE file. | |
4 | ||
5 | package norm | |
6 | ||
7 | import ( | |
8 | "fmt" | |
9 | "unicode/utf8" | |
10 | ) | |
11 | ||
12 | // MaxSegmentSize is the maximum size of a byte buffer needed to consider any | |
13 | // sequence of starter and non-starter runes for the purpose of normalization. | |
14 | const MaxSegmentSize = maxByteBufferSize | |
15 | ||
16 | // An Iter iterates over a string or byte slice, while normalizing it | |
17 | // to a given Form. | |
18 | type Iter struct { | |
19 | rb reorderBuffer | |
20 | buf [maxByteBufferSize]byte | |
21 | info Properties // first character saved from previous iteration | |
22 | next iterFunc // implementation of next depends on form | |
23 | asciiF iterFunc | |
24 | ||
25 | p int // current position in input source | |
26 | multiSeg []byte // remainder of multi-segment decomposition | |
27 | } | |
28 | ||
29 | type iterFunc func(*Iter) []byte | |
30 | ||
31 | // Init initializes i to iterate over src after normalizing it to Form f. | |
32 | func (i *Iter) Init(f Form, src []byte) { | |
33 | i.p = 0 | |
34 | if len(src) == 0 { | |
35 | i.setDone() | |
36 | i.rb.nsrc = 0 | |
37 | return | |
38 | } | |
39 | i.multiSeg = nil | |
40 | i.rb.init(f, src) | |
41 | i.next = i.rb.f.nextMain | |
42 | i.asciiF = nextASCIIBytes | |
43 | i.info = i.rb.f.info(i.rb.src, i.p) | |
44 | i.rb.ss.first(i.info) | |
45 | } | |
46 | ||
47 | // InitString initializes i to iterate over src after normalizing it to Form f. | |
48 | func (i *Iter) InitString(f Form, src string) { | |
49 | i.p = 0 | |
50 | if len(src) == 0 { | |
51 | i.setDone() | |
52 | i.rb.nsrc = 0 | |
53 | return | |
54 | } | |
55 | i.multiSeg = nil | |
56 | i.rb.initString(f, src) | |
57 | i.next = i.rb.f.nextMain | |
58 | i.asciiF = nextASCIIString | |
59 | i.info = i.rb.f.info(i.rb.src, i.p) | |
60 | i.rb.ss.first(i.info) | |
61 | } | |
62 | ||
63 | // Seek sets the segment to be returned by the next call to Next to start | |
64 | // at position p. It is the responsibility of the caller to set p to the | |
65 | // start of a segment. | |
66 | func (i *Iter) Seek(offset int64, whence int) (int64, error) { | |
67 | var abs int64 | |
68 | switch whence { | |
69 | case 0: | |
70 | abs = offset | |
71 | case 1: | |
72 | abs = int64(i.p) + offset | |
73 | case 2: | |
74 | abs = int64(i.rb.nsrc) + offset | |
75 | default: | |
76 | return 0, fmt.Errorf("norm: invalid whence") | |
77 | } | |
78 | if abs < 0 { | |
79 | return 0, fmt.Errorf("norm: negative position") | |
80 | } | |
81 | if int(abs) >= i.rb.nsrc { | |
82 | i.setDone() | |
83 | return int64(i.p), nil | |
84 | } | |
85 | i.p = int(abs) | |
86 | i.multiSeg = nil | |
87 | i.next = i.rb.f.nextMain | |
88 | i.info = i.rb.f.info(i.rb.src, i.p) | |
89 | i.rb.ss.first(i.info) | |
90 | return abs, nil | |
91 | } | |
92 | ||
93 | // returnSlice returns a slice of the underlying input type as a byte slice. | |
94 | // If the underlying is of type []byte, it will simply return a slice. | |
95 | // If the underlying is of type string, it will copy the slice to the buffer | |
96 | // and return that. | |
97 | func (i *Iter) returnSlice(a, b int) []byte { | |
98 | if i.rb.src.bytes == nil { | |
99 | return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] | |
100 | } | |
101 | return i.rb.src.bytes[a:b] | |
102 | } | |
103 | ||
104 | // Pos returns the byte position at which the next call to Next will commence processing. | |
105 | func (i *Iter) Pos() int { | |
106 | return i.p | |
107 | } | |
108 | ||
109 | func (i *Iter) setDone() { | |
110 | i.next = nextDone | |
111 | i.p = i.rb.nsrc | |
112 | } | |
113 | ||
114 | // Done returns true if there is no more input to process. | |
115 | func (i *Iter) Done() bool { | |
116 | return i.p >= i.rb.nsrc | |
117 | } | |
118 | ||
119 | // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. | |
120 | // For any input a and b for which f(a) == f(b), subsequent calls | |
121 | // to Next will return the same segments. | |
122 | // Modifying runes are grouped together with the preceding starter, if such a starter exists. | |
123 | // Although not guaranteed, n will typically be the smallest possible n. | |
124 | func (i *Iter) Next() []byte { | |
125 | return i.next(i) | |
126 | } | |
127 | ||
128 | func nextASCIIBytes(i *Iter) []byte { | |
129 | p := i.p + 1 | |
130 | if p >= i.rb.nsrc { | |
131 | p0 := i.p | |
132 | i.setDone() | |
133 | return i.rb.src.bytes[p0:p] | |
134 | } | |
135 | if i.rb.src.bytes[p] < utf8.RuneSelf { | |
136 | p0 := i.p | |
137 | i.p = p | |
138 | return i.rb.src.bytes[p0:p] | |
139 | } | |
140 | i.info = i.rb.f.info(i.rb.src, i.p) | |
141 | i.next = i.rb.f.nextMain | |
142 | return i.next(i) | |
143 | } | |
144 | ||
145 | func nextASCIIString(i *Iter) []byte { | |
146 | p := i.p + 1 | |
147 | if p >= i.rb.nsrc { | |
148 | i.buf[0] = i.rb.src.str[i.p] | |
149 | i.setDone() | |
150 | return i.buf[:1] | |
151 | } | |
152 | if i.rb.src.str[p] < utf8.RuneSelf { | |
153 | i.buf[0] = i.rb.src.str[i.p] | |
154 | i.p = p | |
155 | return i.buf[:1] | |
156 | } | |
157 | i.info = i.rb.f.info(i.rb.src, i.p) | |
158 | i.next = i.rb.f.nextMain | |
159 | return i.next(i) | |
160 | } | |
161 | ||
162 | func nextHangul(i *Iter) []byte { | |
163 | p := i.p | |
164 | next := p + hangulUTF8Size | |
165 | if next >= i.rb.nsrc { | |
166 | i.setDone() | |
167 | } else if i.rb.src.hangul(next) == 0 { | |
168 | i.rb.ss.next(i.info) | |
169 | i.info = i.rb.f.info(i.rb.src, i.p) | |
170 | i.next = i.rb.f.nextMain | |
171 | return i.next(i) | |
172 | } | |
173 | i.p = next | |
174 | return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] | |
175 | } | |
176 | ||
177 | func nextDone(i *Iter) []byte { | |
178 | return nil | |
179 | } | |
180 | ||
181 | // nextMulti is used for iterating over multi-segment decompositions | |
182 | // for decomposing normal forms. | |
183 | func nextMulti(i *Iter) []byte { | |
184 | j := 0 | |
185 | d := i.multiSeg | |
186 | // skip first rune | |
187 | for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { | |
188 | } | |
189 | for j < len(d) { | |
190 | info := i.rb.f.info(input{bytes: d}, j) | |
191 | if info.BoundaryBefore() { | |
192 | i.multiSeg = d[j:] | |
193 | return d[:j] | |
194 | } | |
195 | j += int(info.size) | |
196 | } | |
197 | // treat last segment as normal decomposition | |
198 | i.next = i.rb.f.nextMain | |
199 | return i.next(i) | |
200 | } | |
201 | ||
202 | // nextMultiNorm is used for iterating over multi-segment decompositions | |
203 | // for composing normal forms. | |
204 | func nextMultiNorm(i *Iter) []byte { | |
205 | j := 0 | |
206 | d := i.multiSeg | |
207 | for j < len(d) { | |
208 | info := i.rb.f.info(input{bytes: d}, j) | |
209 | if info.BoundaryBefore() { | |
210 | i.rb.compose() | |
211 | seg := i.buf[:i.rb.flushCopy(i.buf[:])] | |
212 | i.rb.insertUnsafe(input{bytes: d}, j, info) | |
213 | i.multiSeg = d[j+int(info.size):] | |
214 | return seg | |
215 | } | |
216 | i.rb.insertUnsafe(input{bytes: d}, j, info) | |
217 | j += int(info.size) | |
218 | } | |
219 | i.multiSeg = nil | |
220 | i.next = nextComposed | |
221 | return doNormComposed(i) | |
222 | } | |
223 | ||
224 | // nextDecomposed is the implementation of Next for forms NFD and NFKD. | |
225 | func nextDecomposed(i *Iter) (next []byte) { | |
226 | outp := 0 | |
227 | inCopyStart, outCopyStart := i.p, 0 | |
228 | for { | |
229 | if sz := int(i.info.size); sz <= 1 { | |
230 | i.rb.ss = 0 | |
231 | p := i.p | |
232 | i.p++ // ASCII or illegal byte. Either way, advance by 1. | |
233 | if i.p >= i.rb.nsrc { | |
234 | i.setDone() | |
235 | return i.returnSlice(p, i.p) | |
236 | } else if i.rb.src._byte(i.p) < utf8.RuneSelf { | |
237 | i.next = i.asciiF | |
238 | return i.returnSlice(p, i.p) | |
239 | } | |
240 | outp++ | |
241 | } else if d := i.info.Decomposition(); d != nil { | |
242 | // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero. | |
243 | // Case 1: there is a leftover to copy. In this case the decomposition | |
244 | // must begin with a modifier and should always be appended. | |
245 | // Case 2: no leftover. Simply return d if followed by a ccc == 0 value. | |
246 | p := outp + len(d) | |
247 | if outp > 0 { | |
248 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) | |
249 | // TODO: this condition should not be possible, but we leave it | |
250 | // in for defensive purposes. | |
251 | if p > len(i.buf) { | |
252 | return i.buf[:outp] | |
253 | } | |
254 | } else if i.info.multiSegment() { | |
255 | // outp must be 0 as multi-segment decompositions always | |
256 | // start a new segment. | |
257 | if i.multiSeg == nil { | |
258 | i.multiSeg = d | |
259 | i.next = nextMulti | |
260 | return nextMulti(i) | |
261 | } | |
262 | // We are in the last segment. Treat as normal decomposition. | |
263 | d = i.multiSeg | |
264 | i.multiSeg = nil | |
265 | p = len(d) | |
266 | } | |
267 | prevCC := i.info.tccc | |
268 | if i.p += sz; i.p >= i.rb.nsrc { | |
269 | i.setDone() | |
270 | i.info = Properties{} // Force BoundaryBefore to succeed. | |
271 | } else { | |
272 | i.info = i.rb.f.info(i.rb.src, i.p) | |
273 | } | |
274 | switch i.rb.ss.next(i.info) { | |
275 | case ssOverflow: | |
276 | i.next = nextCGJDecompose | |
277 | fallthrough | |
278 | case ssStarter: | |
279 | if outp > 0 { | |
280 | copy(i.buf[outp:], d) | |
281 | return i.buf[:p] | |
282 | } | |
283 | return d | |
284 | } | |
285 | copy(i.buf[outp:], d) | |
286 | outp = p | |
287 | inCopyStart, outCopyStart = i.p, outp | |
288 | if i.info.ccc < prevCC { | |
289 | goto doNorm | |
290 | } | |
291 | continue | |
292 | } else if r := i.rb.src.hangul(i.p); r != 0 { | |
293 | outp = decomposeHangul(i.buf[:], r) | |
294 | i.p += hangulUTF8Size | |
295 | inCopyStart, outCopyStart = i.p, outp | |
296 | if i.p >= i.rb.nsrc { | |
297 | i.setDone() | |
298 | break | |
299 | } else if i.rb.src.hangul(i.p) != 0 { | |
300 | i.next = nextHangul | |
301 | return i.buf[:outp] | |
302 | } | |
303 | } else { | |
304 | p := outp + sz | |
305 | if p > len(i.buf) { | |
306 | break | |
307 | } | |
308 | outp = p | |
309 | i.p += sz | |
310 | } | |
311 | if i.p >= i.rb.nsrc { | |
312 | i.setDone() | |
313 | break | |
314 | } | |
315 | prevCC := i.info.tccc | |
316 | i.info = i.rb.f.info(i.rb.src, i.p) | |
317 | if v := i.rb.ss.next(i.info); v == ssStarter { | |
318 | break | |
319 | } else if v == ssOverflow { | |
320 | i.next = nextCGJDecompose | |
321 | break | |
322 | } | |
323 | if i.info.ccc < prevCC { | |
324 | goto doNorm | |
325 | } | |
326 | } | |
327 | if outCopyStart == 0 { | |
328 | return i.returnSlice(inCopyStart, i.p) | |
329 | } else if inCopyStart < i.p { | |
330 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) | |
331 | } | |
332 | return i.buf[:outp] | |
333 | doNorm: | |
334 | // Insert what we have decomposed so far in the reorderBuffer. | |
335 | // As we will only reorder, there will always be enough room. | |
336 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) | |
337 | i.rb.insertDecomposed(i.buf[0:outp]) | |
338 | return doNormDecomposed(i) | |
339 | } | |
340 | ||
341 | func doNormDecomposed(i *Iter) []byte { | |
342 | for { | |
343 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | |
344 | if i.p += int(i.info.size); i.p >= i.rb.nsrc { | |
345 | i.setDone() | |
346 | break | |
347 | } | |
348 | i.info = i.rb.f.info(i.rb.src, i.p) | |
349 | if i.info.ccc == 0 { | |
350 | break | |
351 | } | |
352 | if s := i.rb.ss.next(i.info); s == ssOverflow { | |
353 | i.next = nextCGJDecompose | |
354 | break | |
355 | } | |
356 | } | |
357 | // new segment or too many combining characters: exit normalization | |
358 | return i.buf[:i.rb.flushCopy(i.buf[:])] | |
359 | } | |
360 | ||
361 | func nextCGJDecompose(i *Iter) []byte { | |
362 | i.rb.ss = 0 | |
363 | i.rb.insertCGJ() | |
364 | i.next = nextDecomposed | |
365 | i.rb.ss.first(i.info) | |
366 | buf := doNormDecomposed(i) | |
367 | return buf | |
368 | } | |
369 | ||
370 | // nextComposed is the implementation of Next for forms NFC and NFKC. | |
371 | func nextComposed(i *Iter) []byte { | |
372 | outp, startp := 0, i.p | |
373 | var prevCC uint8 | |
374 | for { | |
375 | if !i.info.isYesC() { | |
376 | goto doNorm | |
377 | } | |
378 | prevCC = i.info.tccc | |
379 | sz := int(i.info.size) | |
380 | if sz == 0 { | |
381 | sz = 1 // illegal rune: copy byte-by-byte | |
382 | } | |
383 | p := outp + sz | |
384 | if p > len(i.buf) { | |
385 | break | |
386 | } | |
387 | outp = p | |
388 | i.p += sz | |
389 | if i.p >= i.rb.nsrc { | |
390 | i.setDone() | |
391 | break | |
392 | } else if i.rb.src._byte(i.p) < utf8.RuneSelf { | |
393 | i.rb.ss = 0 | |
394 | i.next = i.asciiF | |
395 | break | |
396 | } | |
397 | i.info = i.rb.f.info(i.rb.src, i.p) | |
398 | if v := i.rb.ss.next(i.info); v == ssStarter { | |
399 | break | |
400 | } else if v == ssOverflow { | |
401 | i.next = nextCGJCompose | |
402 | break | |
403 | } | |
404 | if i.info.ccc < prevCC { | |
405 | goto doNorm | |
406 | } | |
407 | } | |
408 | return i.returnSlice(startp, i.p) | |
409 | doNorm: | |
410 | // reset to start position | |
411 | i.p = startp | |
412 | i.info = i.rb.f.info(i.rb.src, i.p) | |
413 | i.rb.ss.first(i.info) | |
414 | if i.info.multiSegment() { | |
415 | d := i.info.Decomposition() | |
416 | info := i.rb.f.info(input{bytes: d}, 0) | |
417 | i.rb.insertUnsafe(input{bytes: d}, 0, info) | |
418 | i.multiSeg = d[int(info.size):] | |
419 | i.next = nextMultiNorm | |
420 | return nextMultiNorm(i) | |
421 | } | |
422 | i.rb.ss.first(i.info) | |
423 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | |
424 | return doNormComposed(i) | |
425 | } | |
426 | ||
427 | func doNormComposed(i *Iter) []byte { | |
428 | // First rune should already be inserted. | |
429 | for { | |
430 | if i.p += int(i.info.size); i.p >= i.rb.nsrc { | |
431 | i.setDone() | |
432 | break | |
433 | } | |
434 | i.info = i.rb.f.info(i.rb.src, i.p) | |
435 | if s := i.rb.ss.next(i.info); s == ssStarter { | |
436 | break | |
437 | } else if s == ssOverflow { | |
438 | i.next = nextCGJCompose | |
439 | break | |
440 | } | |
441 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | |
442 | } | |
443 | i.rb.compose() | |
444 | seg := i.buf[:i.rb.flushCopy(i.buf[:])] | |
445 | return seg | |
446 | } | |
447 | ||
448 | func nextCGJCompose(i *Iter) []byte { | |
449 | i.rb.ss = 0 // instead of first | |
450 | i.rb.insertCGJ() | |
451 | i.next = nextComposed | |
452 | // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter, | |
453 | // even if they are not. This is particularly dubious for U+FF9E and UFF9A. | |
454 | // If we ever change that, insert a check here. | |
455 | i.rb.ss.first(i.info) | |
456 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | |
457 | return doNormComposed(i) | |
458 | } |