]>
Commit | Line | Data |
---|---|---|
15c0b25d AP |
1 | // Copyright 2011 The Go Authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style | |
3 | // license that can be found in the LICENSE file. | |
4 | ||
5 | // Note: the file data_test.go that is generated should not be checked in. | |
6 | //go:generate go run maketables.go triegen.go | |
7 | //go:generate go test -tags test | |
8 | ||
9 | // Package norm contains types and functions for normalizing Unicode strings. | |
10 | package norm // import "golang.org/x/text/unicode/norm" | |
11 | ||
12 | import ( | |
13 | "unicode/utf8" | |
14 | ||
15 | "golang.org/x/text/transform" | |
16 | ) | |
17 | ||
18 | // A Form denotes a canonical representation of Unicode code points. | |
19 | // The Unicode-defined normalization and equivalence forms are: | |
20 | // | |
21 | // NFC Unicode Normalization Form C | |
22 | // NFD Unicode Normalization Form D | |
23 | // NFKC Unicode Normalization Form KC | |
24 | // NFKD Unicode Normalization Form KD | |
25 | // | |
26 | // For a Form f, this documentation uses the notation f(x) to mean | |
27 | // the bytes or string x converted to the given form. | |
28 | // A position n in x is called a boundary if conversion to the form can | |
29 | // proceed independently on both sides: | |
30 | // f(x) == append(f(x[0:n]), f(x[n:])...) | |
31 | // | |
107c1cdb ND |
32 | // References: https://unicode.org/reports/tr15/ and |
33 | // https://unicode.org/notes/tn5/. | |
15c0b25d AP |
34 | type Form int |
35 | ||
36 | const ( | |
37 | NFC Form = iota | |
38 | NFD | |
39 | NFKC | |
40 | NFKD | |
41 | ) | |
42 | ||
43 | // Bytes returns f(b). May return b if f(b) = b. | |
44 | func (f Form) Bytes(b []byte) []byte { | |
45 | src := inputBytes(b) | |
46 | ft := formTable[f] | |
47 | n, ok := ft.quickSpan(src, 0, len(b), true) | |
48 | if ok { | |
49 | return b | |
50 | } | |
51 | out := make([]byte, n, len(b)) | |
52 | copy(out, b[0:n]) | |
53 | rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush} | |
54 | return doAppendInner(&rb, n) | |
55 | } | |
56 | ||
57 | // String returns f(s). | |
58 | func (f Form) String(s string) string { | |
59 | src := inputString(s) | |
60 | ft := formTable[f] | |
61 | n, ok := ft.quickSpan(src, 0, len(s), true) | |
62 | if ok { | |
63 | return s | |
64 | } | |
65 | out := make([]byte, n, len(s)) | |
66 | copy(out, s[0:n]) | |
67 | rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush} | |
68 | return string(doAppendInner(&rb, n)) | |
69 | } | |
70 | ||
71 | // IsNormal returns true if b == f(b). | |
72 | func (f Form) IsNormal(b []byte) bool { | |
73 | src := inputBytes(b) | |
74 | ft := formTable[f] | |
75 | bp, ok := ft.quickSpan(src, 0, len(b), true) | |
76 | if ok { | |
77 | return true | |
78 | } | |
79 | rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)} | |
80 | rb.setFlusher(nil, cmpNormalBytes) | |
81 | for bp < len(b) { | |
82 | rb.out = b[bp:] | |
83 | if bp = decomposeSegment(&rb, bp, true); bp < 0 { | |
84 | return false | |
85 | } | |
86 | bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true) | |
87 | } | |
88 | return true | |
89 | } | |
90 | ||
91 | func cmpNormalBytes(rb *reorderBuffer) bool { | |
92 | b := rb.out | |
93 | for i := 0; i < rb.nrune; i++ { | |
94 | info := rb.rune[i] | |
95 | if int(info.size) > len(b) { | |
96 | return false | |
97 | } | |
98 | p := info.pos | |
99 | pe := p + info.size | |
100 | for ; p < pe; p++ { | |
101 | if b[0] != rb.byte[p] { | |
102 | return false | |
103 | } | |
104 | b = b[1:] | |
105 | } | |
106 | } | |
107 | return true | |
108 | } | |
109 | ||
110 | // IsNormalString returns true if s == f(s). | |
111 | func (f Form) IsNormalString(s string) bool { | |
112 | src := inputString(s) | |
113 | ft := formTable[f] | |
114 | bp, ok := ft.quickSpan(src, 0, len(s), true) | |
115 | if ok { | |
116 | return true | |
117 | } | |
118 | rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)} | |
119 | rb.setFlusher(nil, func(rb *reorderBuffer) bool { | |
120 | for i := 0; i < rb.nrune; i++ { | |
121 | info := rb.rune[i] | |
122 | if bp+int(info.size) > len(s) { | |
123 | return false | |
124 | } | |
125 | p := info.pos | |
126 | pe := p + info.size | |
127 | for ; p < pe; p++ { | |
128 | if s[bp] != rb.byte[p] { | |
129 | return false | |
130 | } | |
131 | bp++ | |
132 | } | |
133 | } | |
134 | return true | |
135 | }) | |
136 | for bp < len(s) { | |
137 | if bp = decomposeSegment(&rb, bp, true); bp < 0 { | |
138 | return false | |
139 | } | |
140 | bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true) | |
141 | } | |
142 | return true | |
143 | } | |
144 | ||
145 | // patchTail fixes a case where a rune may be incorrectly normalized | |
146 | // if it is followed by illegal continuation bytes. It returns the | |
147 | // patched buffer and whether the decomposition is still in progress. | |
148 | func patchTail(rb *reorderBuffer) bool { | |
149 | info, p := lastRuneStart(&rb.f, rb.out) | |
150 | if p == -1 || info.size == 0 { | |
151 | return true | |
152 | } | |
153 | end := p + int(info.size) | |
154 | extra := len(rb.out) - end | |
155 | if extra > 0 { | |
156 | // Potentially allocating memory. However, this only | |
157 | // happens with ill-formed UTF-8. | |
158 | x := make([]byte, 0) | |
159 | x = append(x, rb.out[len(rb.out)-extra:]...) | |
160 | rb.out = rb.out[:end] | |
161 | decomposeToLastBoundary(rb) | |
162 | rb.doFlush() | |
163 | rb.out = append(rb.out, x...) | |
164 | return false | |
165 | } | |
166 | buf := rb.out[p:] | |
167 | rb.out = rb.out[:p] | |
168 | decomposeToLastBoundary(rb) | |
169 | if s := rb.ss.next(info); s == ssStarter { | |
170 | rb.doFlush() | |
171 | rb.ss.first(info) | |
172 | } else if s == ssOverflow { | |
173 | rb.doFlush() | |
174 | rb.insertCGJ() | |
175 | rb.ss = 0 | |
176 | } | |
177 | rb.insertUnsafe(inputBytes(buf), 0, info) | |
178 | return true | |
179 | } | |
180 | ||
181 | func appendQuick(rb *reorderBuffer, i int) int { | |
182 | if rb.nsrc == i { | |
183 | return i | |
184 | } | |
185 | end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true) | |
186 | rb.out = rb.src.appendSlice(rb.out, i, end) | |
187 | return end | |
188 | } | |
189 | ||
190 | // Append returns f(append(out, b...)). | |
191 | // The buffer out must be nil, empty, or equal to f(out). | |
192 | func (f Form) Append(out []byte, src ...byte) []byte { | |
193 | return f.doAppend(out, inputBytes(src), len(src)) | |
194 | } | |
195 | ||
196 | func (f Form) doAppend(out []byte, src input, n int) []byte { | |
197 | if n == 0 { | |
198 | return out | |
199 | } | |
200 | ft := formTable[f] | |
201 | // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer. | |
202 | if len(out) == 0 { | |
203 | p, _ := ft.quickSpan(src, 0, n, true) | |
204 | out = src.appendSlice(out, 0, p) | |
205 | if p == n { | |
206 | return out | |
207 | } | |
208 | rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush} | |
209 | return doAppendInner(&rb, p) | |
210 | } | |
211 | rb := reorderBuffer{f: *ft, src: src, nsrc: n} | |
212 | return doAppend(&rb, out, 0) | |
213 | } | |
214 | ||
215 | func doAppend(rb *reorderBuffer, out []byte, p int) []byte { | |
216 | rb.setFlusher(out, appendFlush) | |
217 | src, n := rb.src, rb.nsrc | |
218 | doMerge := len(out) > 0 | |
219 | if q := src.skipContinuationBytes(p); q > p { | |
220 | // Move leading non-starters to destination. | |
221 | rb.out = src.appendSlice(rb.out, p, q) | |
222 | p = q | |
223 | doMerge = patchTail(rb) | |
224 | } | |
225 | fd := &rb.f | |
226 | if doMerge { | |
227 | var info Properties | |
228 | if p < n { | |
229 | info = fd.info(src, p) | |
230 | if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 { | |
231 | if p == 0 { | |
232 | decomposeToLastBoundary(rb) | |
233 | } | |
234 | p = decomposeSegment(rb, p, true) | |
235 | } | |
236 | } | |
237 | if info.size == 0 { | |
238 | rb.doFlush() | |
239 | // Append incomplete UTF-8 encoding. | |
240 | return src.appendSlice(rb.out, p, n) | |
241 | } | |
242 | if rb.nrune > 0 { | |
243 | return doAppendInner(rb, p) | |
244 | } | |
245 | } | |
246 | p = appendQuick(rb, p) | |
247 | return doAppendInner(rb, p) | |
248 | } | |
249 | ||
250 | func doAppendInner(rb *reorderBuffer, p int) []byte { | |
251 | for n := rb.nsrc; p < n; { | |
252 | p = decomposeSegment(rb, p, true) | |
253 | p = appendQuick(rb, p) | |
254 | } | |
255 | return rb.out | |
256 | } | |
257 | ||
258 | // AppendString returns f(append(out, []byte(s))). | |
259 | // The buffer out must be nil, empty, or equal to f(out). | |
260 | func (f Form) AppendString(out []byte, src string) []byte { | |
261 | return f.doAppend(out, inputString(src), len(src)) | |
262 | } | |
263 | ||
264 | // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]). | |
265 | // It is not guaranteed to return the largest such n. | |
266 | func (f Form) QuickSpan(b []byte) int { | |
267 | n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true) | |
268 | return n | |
269 | } | |
270 | ||
271 | // Span implements transform.SpanningTransformer. It returns a boundary n such | |
272 | // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n. | |
273 | func (f Form) Span(b []byte, atEOF bool) (n int, err error) { | |
274 | n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF) | |
275 | if n < len(b) { | |
276 | if !ok { | |
277 | err = transform.ErrEndOfSpan | |
278 | } else { | |
279 | err = transform.ErrShortSrc | |
280 | } | |
281 | } | |
282 | return n, err | |
283 | } | |
284 | ||
285 | // SpanString returns a boundary n such that s[0:n] == f(s[0:n]). | |
286 | // It is not guaranteed to return the largest such n. | |
287 | func (f Form) SpanString(s string, atEOF bool) (n int, err error) { | |
288 | n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF) | |
289 | if n < len(s) { | |
290 | if !ok { | |
291 | err = transform.ErrEndOfSpan | |
292 | } else { | |
293 | err = transform.ErrShortSrc | |
294 | } | |
295 | } | |
296 | return n, err | |
297 | } | |
298 | ||
299 | // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and | |
300 | // whether any non-normalized parts were found. If atEOF is false, n will | |
301 | // not point past the last segment if this segment might be become | |
302 | // non-normalized by appending other runes. | |
303 | func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) { | |
304 | var lastCC uint8 | |
305 | ss := streamSafe(0) | |
306 | lastSegStart := i | |
307 | for n = end; i < n; { | |
308 | if j := src.skipASCII(i, n); i != j { | |
309 | i = j | |
310 | lastSegStart = i - 1 | |
311 | lastCC = 0 | |
312 | ss = 0 | |
313 | continue | |
314 | } | |
315 | info := f.info(src, i) | |
316 | if info.size == 0 { | |
317 | if atEOF { | |
318 | // include incomplete runes | |
319 | return n, true | |
320 | } | |
321 | return lastSegStart, true | |
322 | } | |
323 | // This block needs to be before the next, because it is possible to | |
324 | // have an overflow for runes that are starters (e.g. with U+FF9E). | |
325 | switch ss.next(info) { | |
326 | case ssStarter: | |
327 | lastSegStart = i | |
328 | case ssOverflow: | |
329 | return lastSegStart, false | |
330 | case ssSuccess: | |
331 | if lastCC > info.ccc { | |
332 | return lastSegStart, false | |
333 | } | |
334 | } | |
335 | if f.composing { | |
336 | if !info.isYesC() { | |
337 | break | |
338 | } | |
339 | } else { | |
340 | if !info.isYesD() { | |
341 | break | |
342 | } | |
343 | } | |
344 | lastCC = info.ccc | |
345 | i += int(info.size) | |
346 | } | |
347 | if i == n { | |
348 | if !atEOF { | |
349 | n = lastSegStart | |
350 | } | |
351 | return n, true | |
352 | } | |
353 | return lastSegStart, false | |
354 | } | |
355 | ||
356 | // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]). | |
357 | // It is not guaranteed to return the largest such n. | |
358 | func (f Form) QuickSpanString(s string) int { | |
359 | n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true) | |
360 | return n | |
361 | } | |
362 | ||
363 | // FirstBoundary returns the position i of the first boundary in b | |
364 | // or -1 if b contains no boundary. | |
365 | func (f Form) FirstBoundary(b []byte) int { | |
366 | return f.firstBoundary(inputBytes(b), len(b)) | |
367 | } | |
368 | ||
369 | func (f Form) firstBoundary(src input, nsrc int) int { | |
370 | i := src.skipContinuationBytes(0) | |
371 | if i >= nsrc { | |
372 | return -1 | |
373 | } | |
374 | fd := formTable[f] | |
375 | ss := streamSafe(0) | |
376 | // We should call ss.first here, but we can't as the first rune is | |
377 | // skipped already. This means FirstBoundary can't really determine | |
378 | // CGJ insertion points correctly. Luckily it doesn't have to. | |
379 | for { | |
380 | info := fd.info(src, i) | |
381 | if info.size == 0 { | |
382 | return -1 | |
383 | } | |
384 | if s := ss.next(info); s != ssSuccess { | |
385 | return i | |
386 | } | |
387 | i += int(info.size) | |
388 | if i >= nsrc { | |
389 | if !info.BoundaryAfter() && !ss.isMax() { | |
390 | return -1 | |
391 | } | |
392 | return nsrc | |
393 | } | |
394 | } | |
395 | } | |
396 | ||
397 | // FirstBoundaryInString returns the position i of the first boundary in s | |
398 | // or -1 if s contains no boundary. | |
399 | func (f Form) FirstBoundaryInString(s string) int { | |
400 | return f.firstBoundary(inputString(s), len(s)) | |
401 | } | |
402 | ||
403 | // NextBoundary reports the index of the boundary between the first and next | |
404 | // segment in b or -1 if atEOF is false and there are not enough bytes to | |
405 | // determine this boundary. | |
406 | func (f Form) NextBoundary(b []byte, atEOF bool) int { | |
407 | return f.nextBoundary(inputBytes(b), len(b), atEOF) | |
408 | } | |
409 | ||
410 | // NextBoundaryInString reports the index of the boundary between the first and | |
411 | // next segment in b or -1 if atEOF is false and there are not enough bytes to | |
412 | // determine this boundary. | |
413 | func (f Form) NextBoundaryInString(s string, atEOF bool) int { | |
414 | return f.nextBoundary(inputString(s), len(s), atEOF) | |
415 | } | |
416 | ||
417 | func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int { | |
418 | if nsrc == 0 { | |
419 | if atEOF { | |
420 | return 0 | |
421 | } | |
422 | return -1 | |
423 | } | |
424 | fd := formTable[f] | |
425 | info := fd.info(src, 0) | |
426 | if info.size == 0 { | |
427 | if atEOF { | |
428 | return 1 | |
429 | } | |
430 | return -1 | |
431 | } | |
432 | ss := streamSafe(0) | |
433 | ss.first(info) | |
434 | ||
435 | for i := int(info.size); i < nsrc; i += int(info.size) { | |
436 | info = fd.info(src, i) | |
437 | if info.size == 0 { | |
438 | if atEOF { | |
439 | return i | |
440 | } | |
441 | return -1 | |
442 | } | |
443 | // TODO: Using streamSafe to determine the boundary isn't the same as | |
444 | // using BoundaryBefore. Determine which should be used. | |
445 | if s := ss.next(info); s != ssSuccess { | |
446 | return i | |
447 | } | |
448 | } | |
449 | if !atEOF && !info.BoundaryAfter() && !ss.isMax() { | |
450 | return -1 | |
451 | } | |
452 | return nsrc | |
453 | } | |
454 | ||
455 | // LastBoundary returns the position i of the last boundary in b | |
456 | // or -1 if b contains no boundary. | |
457 | func (f Form) LastBoundary(b []byte) int { | |
458 | return lastBoundary(formTable[f], b) | |
459 | } | |
460 | ||
461 | func lastBoundary(fd *formInfo, b []byte) int { | |
462 | i := len(b) | |
463 | info, p := lastRuneStart(fd, b) | |
464 | if p == -1 { | |
465 | return -1 | |
466 | } | |
467 | if info.size == 0 { // ends with incomplete rune | |
468 | if p == 0 { // starts with incomplete rune | |
469 | return -1 | |
470 | } | |
471 | i = p | |
472 | info, p = lastRuneStart(fd, b[:i]) | |
473 | if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter | |
474 | return i | |
475 | } | |
476 | } | |
477 | if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8 | |
478 | return i | |
479 | } | |
480 | if info.BoundaryAfter() { | |
481 | return i | |
482 | } | |
483 | ss := streamSafe(0) | |
484 | v := ss.backwards(info) | |
485 | for i = p; i >= 0 && v != ssStarter; i = p { | |
486 | info, p = lastRuneStart(fd, b[:i]) | |
487 | if v = ss.backwards(info); v == ssOverflow { | |
488 | break | |
489 | } | |
490 | if p+int(info.size) != i { | |
491 | if p == -1 { // no boundary found | |
492 | return -1 | |
493 | } | |
494 | return i // boundary after an illegal UTF-8 encoding | |
495 | } | |
496 | } | |
497 | return i | |
498 | } | |
499 | ||
500 | // decomposeSegment scans the first segment in src into rb. It inserts 0x034f | |
501 | // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters | |
502 | // and returns the number of bytes consumed from src or iShortDst or iShortSrc. | |
503 | func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int { | |
504 | // Force one character to be consumed. | |
505 | info := rb.f.info(rb.src, sp) | |
506 | if info.size == 0 { | |
507 | return 0 | |
508 | } | |
509 | if s := rb.ss.next(info); s == ssStarter { | |
510 | // TODO: this could be removed if we don't support merging. | |
511 | if rb.nrune > 0 { | |
512 | goto end | |
513 | } | |
514 | } else if s == ssOverflow { | |
515 | rb.insertCGJ() | |
516 | goto end | |
517 | } | |
518 | if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { | |
519 | return int(err) | |
520 | } | |
521 | for { | |
522 | sp += int(info.size) | |
523 | if sp >= rb.nsrc { | |
524 | if !atEOF && !info.BoundaryAfter() { | |
525 | return int(iShortSrc) | |
526 | } | |
527 | break | |
528 | } | |
529 | info = rb.f.info(rb.src, sp) | |
530 | if info.size == 0 { | |
531 | if !atEOF { | |
532 | return int(iShortSrc) | |
533 | } | |
534 | break | |
535 | } | |
536 | if s := rb.ss.next(info); s == ssStarter { | |
537 | break | |
538 | } else if s == ssOverflow { | |
539 | rb.insertCGJ() | |
540 | break | |
541 | } | |
542 | if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { | |
543 | return int(err) | |
544 | } | |
545 | } | |
546 | end: | |
547 | if !rb.doFlush() { | |
548 | return int(iShortDst) | |
549 | } | |
550 | return sp | |
551 | } | |
552 | ||
553 | // lastRuneStart returns the runeInfo and position of the last | |
554 | // rune in buf or the zero runeInfo and -1 if no rune was found. | |
555 | func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) { | |
556 | p := len(buf) - 1 | |
557 | for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { | |
558 | } | |
559 | if p < 0 { | |
560 | return Properties{}, -1 | |
561 | } | |
562 | return fd.info(inputBytes(buf), p), p | |
563 | } | |
564 | ||
565 | // decomposeToLastBoundary finds an open segment at the end of the buffer | |
566 | // and scans it into rb. Returns the buffer minus the last segment. | |
567 | func decomposeToLastBoundary(rb *reorderBuffer) { | |
568 | fd := &rb.f | |
569 | info, i := lastRuneStart(fd, rb.out) | |
570 | if int(info.size) != len(rb.out)-i { | |
571 | // illegal trailing continuation bytes | |
572 | return | |
573 | } | |
574 | if info.BoundaryAfter() { | |
575 | return | |
576 | } | |
577 | var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order | |
578 | padd := 0 | |
579 | ss := streamSafe(0) | |
580 | p := len(rb.out) | |
581 | for { | |
582 | add[padd] = info | |
583 | v := ss.backwards(info) | |
584 | if v == ssOverflow { | |
585 | // Note that if we have an overflow, it the string we are appending to | |
586 | // is not correctly normalized. In this case the behavior is undefined. | |
587 | break | |
588 | } | |
589 | padd++ | |
590 | p -= int(info.size) | |
591 | if v == ssStarter || p < 0 { | |
592 | break | |
593 | } | |
594 | info, i = lastRuneStart(fd, rb.out[:p]) | |
595 | if int(info.size) != p-i { | |
596 | break | |
597 | } | |
598 | } | |
599 | rb.ss = ss | |
600 | // Copy bytes for insertion as we may need to overwrite rb.out. | |
601 | var buf [maxBufferSize * utf8.UTFMax]byte | |
602 | cp := buf[:copy(buf[:], rb.out[p:])] | |
603 | rb.out = rb.out[:p] | |
604 | for padd--; padd >= 0; padd-- { | |
605 | info = add[padd] | |
606 | rb.insertUnsafe(inputBytes(cp), 0, info) | |
607 | cp = cp[info.size:] | |
608 | } | |
609 | } |