]> git.immae.eu Git - github/fretlink/terraform-provider-statuscake.git/blob - vendor/github.com/apparentlymart/go-textseg/textseg/grapheme_clusters.rl
deps: github.com/hashicorp/terraform@sdk-v0.11-with-go-modules
[github/fretlink/terraform-provider-statuscake.git] / vendor / github.com / apparentlymart / go-textseg / textseg / grapheme_clusters.rl
1 package textseg
2
3 import (
4 "errors"
5 "unicode/utf8"
6 )
7
8 // Generated from grapheme_clusters.rl. DO NOT EDIT
//
// The Ragel section that follows names the state machine "graphclust" and
// contains the "write data" directive, which marks the point where Ragel
// places the machine's data in the generated Go file.
9 %%{
10 # (except you are actually in grapheme_clusters.rl here, so edit away!)
11
12 machine graphclust;
13 write data;
14 }%%
15
// Error is an exported sentinel error value carrying the message
// "invalid UTF8 text".
// NOTE(review): no return path visible in this file produces it; confirm
// whether other files in the package (or callers) still rely on it.
16 var Error = errors.New("invalid UTF8 text")
17
18 // ScanGraphemeClusters is a split function for bufio.Scanner that splits
19 // on grapheme cluster boundaries.
//
// data is the input buffered so far; atEOF is true when no further input
// will follow. The three results are the number of bytes to advance, the
// token (the bytes of one grapheme cluster) and an error. Every return
// path visible in this function passes a nil error. A result of
// (0, nil, nil) means no token could be produced from data yet.
20 func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) {
// Nothing buffered, so there is no token to produce.
21 if len(data) == 0 {
22 return 0, nil, nil
23 }
24
25 // Ragel state
26 cs := 0 // Current State
27 p := 0 // "Pointer" into data
28 pe := len(data) // End-of-data "pointer"
// ts, te and act are declared for the scanner section (|* ... *|) below.
// NOTE(review): presumably token start, token end and last matched
// action, per Ragel's scanner conventions; confirm against the Ragel guide.
29 ts := 0
30 te := 0
31 act := 0
// eof is set to the same offset as pe, i.e. the end of the current buffer.
32 eof := pe
33
34 // Make Go compiler happy
35 _ = ts
36 _ = te
37 _ = act
38 _ = eof
39
// Offsets into data written by the "start" and "end" actions below.
// endPos is the index of the token's last byte: the "emit" action
// slices data[startPos:endPos+1] and advances by endPos+1.
40 startPos := 0
41 endPos := 0
42
// The Ragel section below includes character classes from
// grapheme_clusters_table.rl and defines one pattern per kind of
// sequence: CR LF, control characters (plus the U+FFFD byte sequence
// EF BF BD), Hangul sequences, emoji sequences, ZWJ sequences,
// regional-indicator sequences, prepend sequences, and a catch-all
// "OtherSeq". Each *Tok pattern attaches the "start" and "end" actions,
// which copy p into startPos and endPos. Every alternative in the
// scanner runs "emit", which returns from this function with the
// matched bytes. Control only reaches the code after the section when
// no "emit" has returned.
43 %%{
44 include GraphemeCluster "grapheme_clusters_table.rl";
45
46 action start {
47 startPos = p
48 }
49
50 action end {
51 endPos = p
52 }
53
54 action emit {
55 return endPos+1, data[startPos:endPos+1], nil
56 }
57
58 ZWJGlue = ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?)?;
59 AnyExtender = Extend | ZWJGlue | SpacingMark;
60 Extension = AnyExtender*;
61 ReplacementChar = (0xEF 0xBF 0xBD);
62
63 CRLFSeq = CR LF;
64 ControlSeq = Control | ReplacementChar;
65 HangulSeq = (
66 L+ (((LV? V+ | LVT) T*)?|LV?) |
67 LV V* T* |
68 V+ T* |
69 LVT T* |
70 T+
71 ) Extension;
72 EmojiSeq = (E_Base | E_Base_GAZ) Extend* E_Modifier? Extension;
73 ZWJSeq = ZWJGlue Extension;
74 EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension;
75
76 UTF8Cont = 0x80 .. 0xBF;
77 AnyUTF8 = (
78 0x00..0x7F |
79 0xC0..0xDF . UTF8Cont |
80 0xE0..0xEF . UTF8Cont . UTF8Cont |
81 0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
82 );
83
84 # OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension
85 OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|E_Base|E_Base_GAZ|ZWJ|Regional_Indicator|Prepend)) Extension;
86
87 # PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break
88 PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?;
89
90 CRLFTok = CRLFSeq >start @end;
91 ControlTok = ControlSeq >start @end;
92 HangulTok = HangulSeq >start @end;
93 EmojiTok = EmojiSeq >start @end;
94 ZWJTok = ZWJSeq >start @end;
95 EmojiFlagTok = EmojiFlagSeq >start @end;
96 OtherTok = OtherSeq >start @end;
97 PrependTok = PrependSeq >start @end;
98
99 main := |*
100 CRLFTok => emit;
101 ControlTok => emit;
102 HangulTok => emit;
103 EmojiTok => emit;
104 ZWJTok => emit;
105 EmojiFlagTok => emit;
106 PrependTok => emit;
107 OtherTok => emit;
108
109 # any single valid UTF-8 character would also be valid per spec,
110 # but we'll handle that separately after the loop so we can deal
111 # with requesting more bytes if we're not at EOF.
112 *|;
113
114 write init;
115 write exec;
116 }%%
117
118 // If we fall out here then we were unable to complete a sequence.
119 // If we weren't able to complete a sequence then either we've
120 // reached the end of a partial buffer (so there's more data to come)
121 // or we have an isolated symbol that would normally be part of a
122 // grapheme cluster but has appeared in isolation here.
123
// More input may still arrive, so consume nothing and return no token;
// the decision is deferred until the buffer has grown.
124 if !atEOF {
125 // Request more
126 return 0, nil, nil
127 }
128
129 // Just take the first UTF-8 sequence and return that.
// data is non-empty here (checked at the top), and utf8.DecodeRune
// reports a width of 1 for an invalid encoding, so this final return
// always advances by at least one byte.
130 _, seqLen := utf8.DecodeRune(data)
131 return seqLen, data[:seqLen], nil
132 }