diff options
Diffstat (limited to 'vendor/github.com/apparentlymart/go-textseg/textseg/grapheme_clusters.rl')
-rw-r--r-- | vendor/github.com/apparentlymart/go-textseg/textseg/grapheme_clusters.rl | 132 |
1 files changed, 132 insertions, 0 deletions
diff --git a/vendor/github.com/apparentlymart/go-textseg/textseg/grapheme_clusters.rl b/vendor/github.com/apparentlymart/go-textseg/textseg/grapheme_clusters.rl new file mode 100644 index 0000000..003ffbf --- /dev/null +++ b/vendor/github.com/apparentlymart/go-textseg/textseg/grapheme_clusters.rl | |||
@@ -0,0 +1,132 @@ | |||
1 | package textseg | ||
2 | |||
3 | import ( | ||
4 | "errors" | ||
5 | "unicode/utf8" | ||
6 | ) | ||
7 | |||
8 | // Generated from grapheme_clusters.rl. DO NOT EDIT | ||
9 | %%{ | ||
10 | # (except you are actually in grapheme_clusters.rl here, so edit away!) | ||
11 | |||
12 | machine graphclust; | ||
13 | write data; | ||
14 | }%% | ||
15 | |||
// Error is a package-level error value carrying the message "invalid UTF8 text".
// ScanGraphemeClusters in this file never returns it (all of its return paths
// report a nil error), so whatever uses it is not visible here.
16 | var Error = errors.New("invalid UTF8 text") | ||
17 | |||
18 | // ScanGraphemeClusters is a split function for bufio.Scanner that splits | ||
19 | // on grapheme cluster boundaries. | ||
//
// Empty data yields (0, nil, nil). If one of the token rules in the Ragel
// scanner below completes, its emit action returns the matched bytes together
// with the number of bytes to advance. If no rule completes, the function
// returns (0, nil, nil) when atEOF is false, which asks the Scanner for more
// data, and otherwise returns the first UTF-8 sequence of data on its own.
// Every return path visible in this file reports a nil error.
20 | func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) { | ||
21 | if len(data) == 0 { | ||
22 | return 0, nil, nil | ||
23 | } | ||
24 | |||
25 | // Ragel state | ||
26 | cs := 0 // Current State | ||
27 | p := 0 // "Pointer" into data | ||
28 | pe := len(data) // End-of-data "pointer" | ||
// ts, te and act are the token-start, token-end and last-action variables
// that a Ragel scanner block (|* ... *|) requires the host code to declare.
29 | ts := 0 | ||
30 | te := 0 | ||
31 | act := 0 | ||
// NOTE(review): eof is set to pe unconditionally, i.e. also when atEOF is
// false. Confirm against the generated code that this cannot emit a cluster
// at the end of a partial buffer that later input would have extended.
32 | eof := pe | ||
33 | |||
34 | // Make Go compiler happy: blank-assign the Ragel variables so they count as used. | ||
35 | _ = ts | ||
36 | _ = te | ||
37 | _ = act | ||
38 | _ = eof | ||
39 | |||
// startPos and endPos are written by the start and end actions below and read
// by emit. endPos holds the index of a matched byte (inclusive), which is why
// emit uses endPos+1 for both the advance count and the slice bound.
40 | startPos := 0 | ||
41 | endPos := 0 | ||
42 | |||
43 | %%{ | ||
44 | include GraphemeCluster "grapheme_clusters_table.rl"; | ||
45 | |||
46 | action start { | ||
47 | startPos = p | ||
48 | } | ||
49 | |||
50 | action end { | ||
51 | endPos = p | ||
52 | } | ||
53 | |||
54 | action emit { | ||
55 | return endPos+1, data[startPos:endPos+1], nil | ||
56 | } | ||
57 | |||
58 | ZWJGlue = ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?)?; | ||
59 | AnyExtender = Extend | ZWJGlue | SpacingMark; | ||
60 | Extension = AnyExtender*; | ||
61 | ReplacementChar = (0xEF 0xBF 0xBD); | ||
62 | |||
63 | CRLFSeq = CR LF; | ||
64 | ControlSeq = Control | ReplacementChar; | ||
65 | HangulSeq = ( | ||
66 | L+ (((LV? V+ | LVT) T*)?|LV?) | | ||
67 | LV V* T* | | ||
68 | V+ T* | | ||
69 | LVT T* | | ||
70 | T+ | ||
71 | ) Extension; | ||
72 | EmojiSeq = (E_Base | E_Base_GAZ) Extend* E_Modifier? Extension; | ||
73 | ZWJSeq = ZWJGlue Extension; | ||
74 | EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension; | ||
75 | |||
76 | UTF8Cont = 0x80 .. 0xBF; | ||
77 | AnyUTF8 = ( | ||
78 | 0x00..0x7F | | ||
79 | 0xC0..0xDF . UTF8Cont | | ||
80 | 0xE0..0xEF . UTF8Cont . UTF8Cont | | ||
81 | 0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont | ||
82 | ); | ||
83 | |||
84 | # OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension | ||
85 | OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|E_Base|E_Base_GAZ|ZWJ|Regional_Indicator|Prepend)) Extension; | ||
86 | |||
87 | # PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break | ||
88 | PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?; | ||
89 | |||
90 | CRLFTok = CRLFSeq >start @end; | ||
91 | ControlTok = ControlSeq >start @end; | ||
92 | HangulTok = HangulSeq >start @end; | ||
93 | EmojiTok = EmojiSeq >start @end; | ||
94 | ZWJTok = ZWJSeq >start @end; | ||
95 | EmojiFlagTok = EmojiFlagSeq >start @end; | ||
96 | OtherTok = OtherSeq >start @end; | ||
97 | PrependTok = PrependSeq >start @end; | ||
98 | |||
99 | main := |* | ||
100 | CRLFTok => emit; | ||
101 | ControlTok => emit; | ||
102 | HangulTok => emit; | ||
103 | EmojiTok => emit; | ||
104 | ZWJTok => emit; | ||
105 | EmojiFlagTok => emit; | ||
106 | PrependTok => emit; | ||
107 | OtherTok => emit; | ||
108 | |||
109 | # any single valid UTF-8 character would also be valid per spec, | ||
110 | # but we'll handle that separately after the loop so we can deal | ||
111 | # with requesting more bytes if we're not at EOF. | ||
112 | *|; | ||
113 | |||
114 | write init; | ||
115 | write exec; | ||
116 | }%% | ||
117 | |||
118 | // If we fall out here then we were unable to complete a sequence. | ||
119 | // If we weren't able to complete a sequence then either we've | ||
120 | // reached the end of a partial buffer (so there's more data to come) | ||
121 | // or we have an isolated symbol that would normally be part of a | ||
122 | // grapheme cluster but has appeared in isolation here. | ||
123 | |||
124 | if !atEOF { | ||
125 | // Request more | ||
126 | return 0, nil, nil | ||
127 | } | ||
128 | |||
129 | // Just take the first UTF-8 sequence and return that. | ||
// data is non-empty here (checked on entry). For an invalid encoding
// utf8.DecodeRune reports a width of 1, so a single byte is returned.
130 | _, seqLen := utf8.DecodeRune(data) | ||
131 | return seqLen, data[:seqLen], nil | ||
132 | } | ||