8 // Generated from grapheme_clusters.rl. DO NOT EDIT
10 # (except that you are actually in grapheme_clusters.rl here, so edit away!)
// Error is a package-level error value whose message is "invalid UTF8 text".
// NOTE(review): this excerpt does not show where it is returned; presumably
// the scanner reports it for input that is not valid UTF-8 — confirm.
16 var Error = errors.New("invalid UTF8 text")
18 // ScanGraphemeClusters is a split function for bufio.Scanner that splits
19 // on grapheme cluster boundaries.
20 func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) {
26 cs := 0 // Current State
27 p := 0 // "Pointer" into data
28 pe := len(data) // End-of-data "pointer"
34 // Make Go compiler happy
44 include GraphemeCluster "grapheme_clusters_table.rl";
55 return endPos+1, data[startPos:endPos+1], nil
58 ZWJGlue = ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?)?;
59 AnyExtender = Extend | ZWJGlue | SpacingMark;
60 Extension = AnyExtender*;
61 ReplacementChar = (0xEF 0xBF 0xBD);
64 ControlSeq = Control | ReplacementChar;
66 L+ (((LV? V+ | LVT) T*)?|LV?) |
72 EmojiSeq = (E_Base | E_Base_GAZ) Extend* E_Modifier? Extension;
73 ZWJSeq = ZWJGlue Extension;
74 EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension;
76 UTF8Cont = 0x80 .. 0xBF;
79 0xC0..0xDF . UTF8Cont |
80 0xE0..0xEF . UTF8Cont . UTF8Cont |
81 0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
84 # OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension
85 OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|E_Base|E_Base_GAZ|ZWJ|Regional_Indicator|Prepend)) Extension;
87 # PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break
88 PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?;
90 CRLFTok = CRLFSeq >start @end;
91 ControlTok = ControlSeq >start @end;
92 HangulTok = HangulSeq >start @end;
93 EmojiTok = EmojiSeq >start @end;
94 ZWJTok = ZWJSeq >start @end;
95 EmojiFlagTok = EmojiFlagSeq >start @end;
96 OtherTok = OtherSeq >start @end;
97 PrependTok = PrependSeq >start @end;
105 EmojiFlagTok => emit;
109 # any single valid UTF-8 character would also be valid per spec,
110 # but we'll handle that separately after the loop so we can deal
111 # with requesting more bytes if we're not at EOF.
118 // If we fall out here then we were unable to complete a sequence.
119 // If we weren't able to complete a sequence then either we've
120 // reached the end of a partial buffer (so there's more data to come)
121 // or we have an isolated symbol that would normally be part of a
122 // grapheme cluster but has appeared in isolation here.
129 // Just take the first UTF-8 sequence and return that.
130 _, seqLen := utf8.DecodeRune(data)
131 return seqLen, data[:seqLen], nil