1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
12 // MaxSegmentSize is the maximum size of a byte buffer needed to consider any
13 // sequence of starter and non-starter runes for the purpose of normalization.
14 const MaxSegmentSize = maxByteBufferSize // the same constant sizes Iter.buf
16 // An Iter iterates over a string or byte slice, while normalizing it
20 buf [maxByteBufferSize]byte // scratch space; segments that are not subslices of the input are returned as slices of it
21 info Properties // first character saved from previous iteration
22 next iterFunc // implementation of next depends on form
25 p int // current position in input source
26 multiSeg []byte // remainder of multi-segment decomposition
29 type iterFunc func(*Iter) []byte // signature of the functions stored in Iter.next and Iter.asciiF
31 // Init initializes i to iterate over src after normalizing it to Form f.
32 func (i *Iter) Init(f Form, src []byte) {
41 i.next = i.rb.f.nextMain // begin with nextMain of the form held in i.rb
42 i.asciiF = nextASCIIBytes // ASCII handler that reads i.rb.src.bytes
43 i.info = i.rb.f.info(i.rb.src, i.p) // properties of the input at position i.p
47 // InitString initializes i to iterate over src after normalizing it to Form f.
48 func (i *Iter) InitString(f Form, src string) {
56 i.rb.initString(f, src)
57 i.next = i.rb.f.nextMain // begin with nextMain of the form held in i.rb
58 i.asciiF = nextASCIIString // ASCII handler that reads i.rb.src.str
59 i.info = i.rb.f.info(i.rb.src, i.p) // properties of the input at position i.p
63 // Seek sets the segment to be returned by the next call to Next to start
64 // at the position computed from offset and whence. It is the responsibility
65 // of the caller to make that position the start of a segment.
66 func (i *Iter) Seek(offset int64, whence int) (int64, error) {
72 abs = int64(i.p) + offset // offset counted from the current position i.p
74 abs = int64(i.rb.nsrc) + offset // offset counted from i.rb.nsrc, the bound Done compares i.p against
76 return 0, fmt.Errorf("norm: invalid whence")
79 return 0, fmt.Errorf("norm: negative position")
81 if int(abs) >= i.rb.nsrc { // NOTE(review): int(abs) narrows an int64; where int is 32 bits a large value could wrap — confirm
83 return int64(i.p), nil // reports i.p rather than abs when the target is at or past i.rb.nsrc
87 i.next = i.rb.f.nextMain // reset i.next to nextMain of the form held in i.rb
88 i.info = i.rb.f.info(i.rb.src, i.p) // reload properties at i.p
93 // returnSlice returns a slice of the underlying input type as a byte slice.
94 // If the underlying is of type []byte, it will simply return a slice.
95 // If the underlying is of type string, it will copy the slice to the buffer
97 func (i *Iter) returnSlice(a, b int) []byte {
98 if i.rb.src.bytes == nil { // no []byte input, so the data is read from src.str
99 return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] // copy stops at len(i.buf), so the result never exceeds len(i.buf) bytes
101 return i.rb.src.bytes[a:b] // subslice of the input; no copy is made
104 // Pos returns the byte position at which the next call to Next will commence processing.
105 func (i *Iter) Pos() int {
109 func (i *Iter) setDone() {
114 // Done returns true if there is no more input to process.
115 func (i *Iter) Done() bool {
116 return i.p >= i.rb.nsrc // true once the position is at or past i.rb.nsrc
119 // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
120 // For any input a and b for which f(a) == f(b), subsequent calls
121 // to Next will return the same segments.
122 // Modifying runes are grouped together with the preceding starter, if such a starter exists.
123 // Although not guaranteed, n will typically be the smallest possible n.
124 func (i *Iter) Next() []byte {
128 func nextASCIIBytes(i *Iter) []byte { // iterFunc for []byte input; installed as i.asciiF by Init
133 return i.rb.src.bytes[p0:p] // the segment is a subslice of the input; nothing is copied
135 if i.rb.src.bytes[p] < utf8.RuneSelf { // the byte at p is ASCII
138 return i.rb.src.bytes[p0:p]
140 i.info = i.rb.f.info(i.rb.src, i.p) // reload properties at i.p
141 i.next = i.rb.f.nextMain // hand iteration back to the form's nextMain
145 func nextASCIIString(i *Iter) []byte { // iterFunc for string input; installed as i.asciiF by InitString
148 i.buf[0] = i.rb.src.str[i.p] // stage the byte at i.p in i.buf
152 if i.rb.src.str[p] < utf8.RuneSelf { // the byte at p is ASCII
153 i.buf[0] = i.rb.src.str[i.p] // stage the byte at i.p in i.buf
157 i.info = i.rb.f.info(i.rb.src, i.p) // reload properties at i.p
158 i.next = i.rb.f.nextMain // hand iteration back to the form's nextMain
162 func nextHangul(i *Iter) []byte {
164 next := p + hangulUTF8Size // position hangulUTF8Size bytes after p
165 if next >= i.rb.nsrc { // next is at or past i.rb.nsrc
167 } else if i.rb.src.hangul(next) == 0 { // presumably 0 means no Hangul syllable at next — confirm against hangul
169 i.info = i.rb.f.info(i.rb.src, i.p) // reload properties at i.p
170 i.next = i.rb.f.nextMain // hand iteration back to the form's nextMain
174 return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] // decomposeHangul's result is used as the segment length in i.buf
177 func nextDone(i *Iter) []byte {
181 // nextMulti is used for iterating over multi-segment decompositions
182 // for decomposing normal forms.
183 func nextMulti(i *Iter) []byte {
187 for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { // advance j to the next rune start in d
190 info := i.rb.f.info(input{bytes: d}, j) // properties at offset j of d
191 if info.BoundaryBefore() {
197 // treat last segment as normal decomposition
198 i.next = i.rb.f.nextMain // subsequent i.next calls go to the form's nextMain
202 // nextMultiNorm is used for iterating over multi-segment decompositions
203 // for composing normal forms.
204 func nextMultiNorm(i *Iter) []byte {
208 info := i.rb.f.info(input{bytes: d}, j) // properties at offset j of d
209 if info.BoundaryBefore() {
211 seg := i.buf[:i.rb.flushCopy(i.buf[:])] // flushCopy's result is used as the segment length in i.buf
212 i.rb.insertUnsafe(input{bytes: d}, j, info)
213 i.multiSeg = d[j+int(info.size):] // what follows the rune at j is saved in i.multiSeg
216 i.rb.insertUnsafe(input{bytes: d}, j, info)
220 i.next = nextComposed // subsequent i.next calls go to nextComposed
221 return doNormComposed(i)
224 // nextDecomposed is the implementation of Next for forms NFD and NFKD.
225 func nextDecomposed(i *Iter) (next []byte) {
227 inCopyStart, outCopyStart := i.p, 0 // input offset where the pending span begins, and the i.buf offset used when it is copied
229 if sz := int(i.info.size); sz <= 1 {
232 i.p++ // ASCII or illegal byte. Either way, advance by 1.
233 if i.p >= i.rb.nsrc {
235 return i.returnSlice(p, i.p)
236 } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
238 return i.returnSlice(p, i.p)
241 } else if d := i.info.Decomposition(); d != nil {
242 // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
243 // Case 1: there is a leftover to copy. In this case the decomposition
244 // must begin with a modifier and should always be appended.
245 // Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
248 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
249 // TODO: this condition should not be possible, but we leave it
250 // in for defensive purposes.
254 } else if i.info.multiSegment() {
255 // outp must be 0 as multi-segment decompositions always
256 // start a new segment.
257 if i.multiSeg == nil {
262 // We are in the last segment. Treat as normal decomposition.
267 prevCC := i.info.tccc // saved before i.info is replaced below
268 if i.p += sz; i.p >= i.rb.nsrc {
270 i.info = Properties{} // Force BoundaryBefore to succeed.
272 i.info = i.rb.f.info(i.rb.src, i.p)
274 switch i.rb.ss.next(i.info) {
276 i.next = nextCGJDecompose // subsequent i.next calls go to nextCGJDecompose
280 copy(i.buf[outp:], d)
285 copy(i.buf[outp:], d)
287 inCopyStart, outCopyStart = i.p, outp // restart the pending span at the current input and output offsets
288 if i.info.ccc < prevCC { // the new i.info.ccc is below the tccc saved in prevCC
292 } else if r := i.rb.src.hangul(i.p); r != 0 {
293 outp = decomposeHangul(i.buf[:], r) // decomposeHangul's result becomes the output offset
294 i.p += hangulUTF8Size
295 inCopyStart, outCopyStart = i.p, outp // restart the pending span at the current input and output offsets
296 if i.p >= i.rb.nsrc {
299 } else if i.rb.src.hangul(i.p) != 0 {
311 if i.p >= i.rb.nsrc {
315 prevCC := i.info.tccc // saved before i.info is replaced below
316 i.info = i.rb.f.info(i.rb.src, i.p)
317 if v := i.rb.ss.next(i.info); v == ssStarter {
319 } else if v == ssOverflow {
320 i.next = nextCGJDecompose // subsequent i.next calls go to nextCGJDecompose
323 if i.info.ccc < prevCC { // the new i.info.ccc is below the tccc saved in prevCC
327 if outCopyStart == 0 {
328 return i.returnSlice(inCopyStart, i.p) // output copy offset is still 0: the segment is the input span itself
329 } else if inCopyStart < i.p {
330 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
334 // Insert what we have decomposed so far in the reorderBuffer.
335 // As we will only reorder, there will always be enough room.
336 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
337 i.rb.insertDecomposed(i.buf[0:outp])
338 return doNormDecomposed(i)
341 func doNormDecomposed(i *Iter) []byte {
343 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
344 if i.p += int(i.info.size); i.p >= i.rb.nsrc { // step past the current rune; true when i.p reaches i.rb.nsrc
348 i.info = i.rb.f.info(i.rb.src, i.p) // properties at the new i.p
352 if s := i.rb.ss.next(i.info); s == ssOverflow {
353 i.next = nextCGJDecompose // on ssOverflow the next call goes through nextCGJDecompose
357 // new segment or too many combining characters: exit normalization
358 return i.buf[:i.rb.flushCopy(i.buf[:])] // flushCopy's result is used as the segment length in i.buf
361 func nextCGJDecompose(i *Iter) []byte { // installed as i.next when i.rb.ss.next reports ssOverflow during decomposition
364 i.next = nextDecomposed // subsequent i.next calls return to nextDecomposed
365 i.rb.ss.first(i.info)
366 buf := doNormDecomposed(i)
370 // nextComposed is the implementation of Next for forms NFC and NFKC.
371 func nextComposed(i *Iter) []byte {
372 outp, startp := 0, i.p // startp records the input offset at which this call began
375 if !i.info.isYesC() {
379 sz := int(i.info.size)
381 sz = 1 // illegal rune: copy byte-by-byte
389 if i.p >= i.rb.nsrc {
392 } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
397 i.info = i.rb.f.info(i.rb.src, i.p)
398 if v := i.rb.ss.next(i.info); v == ssStarter {
400 } else if v == ssOverflow {
401 i.next = nextCGJCompose // on ssOverflow the next call goes through nextCGJCompose
404 if i.info.ccc < prevCC {
408 return i.returnSlice(startp, i.p) // the segment is the input span from startp to i.p
410 // reset to start position
412 i.info = i.rb.f.info(i.rb.src, i.p)
413 i.rb.ss.first(i.info)
414 if i.info.multiSegment() {
415 d := i.info.Decomposition()
416 info := i.rb.f.info(input{bytes: d}, 0) // properties at offset 0 of the decomposition d
417 i.rb.insertUnsafe(input{bytes: d}, 0, info)
418 i.multiSeg = d[int(info.size):] // the rest of d after its first info.size bytes is saved for nextMultiNorm
419 i.next = nextMultiNorm
420 return nextMultiNorm(i)
422 i.rb.ss.first(i.info) // NOTE(review): ss.first was already called with i.info a few lines up and no visible line reassigns i.info in between — looks redundant; confirm
423 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
424 return doNormComposed(i)
427 func doNormComposed(i *Iter) []byte {
428 // First rune should already be inserted.
430 if i.p += int(i.info.size); i.p >= i.rb.nsrc { // step past the current rune; true when i.p reaches i.rb.nsrc
434 i.info = i.rb.f.info(i.rb.src, i.p) // properties at the new i.p
435 if s := i.rb.ss.next(i.info); s == ssStarter {
437 } else if s == ssOverflow {
438 i.next = nextCGJCompose // on ssOverflow the next call goes through nextCGJCompose
441 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
444 seg := i.buf[:i.rb.flushCopy(i.buf[:])] // flushCopy's result is used as the segment length in i.buf
448 func nextCGJCompose(i *Iter) []byte { // installed as i.next when i.rb.ss.next reports ssOverflow during composition
449 i.rb.ss = 0 // instead of first
451 i.next = nextComposed // subsequent i.next calls return to nextComposed
452 // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
453 // even if they are not. This is particularly dubious for U+FF9E and U+FF9A.
454 // If we ever change that, insert a check here. NOTE(review): U+FF9A may be a typo for U+FF9F (the halfwidth sound marks are U+FF9E and U+FF9F) — confirm.
455 i.rb.ss.first(i.info)
456 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
457 return doNormComposed(i)