]>
Commit | Line | Data |
---|---|---|
15c0b25d AP |
1 | |
2 | package hclsyntax | |
3 | ||
4 | import ( | |
5 | "bytes" | |
6 | ||
7 | "github.com/hashicorp/hcl2/hcl" | |
8 | ) | |
9 | ||
10 | // This file is generated from scan_tokens.rl. DO NOT EDIT. | |
11 | %%{ | |
107c1cdb | 12 | # (except when you are actually in scan_tokens.rl here, so edit away!) |
15c0b25d AP |
13 | |
14 | machine hcltok; | |
15 | write data; | |
16 | }%% | |
17 | ||
18 | func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []Token { | |
107c1cdb ND |
19 | stripData := stripUTF8BOM(data) |
20 | start.Byte += len(data) - len(stripData) | |
21 | data = stripData | |
22 | ||
15c0b25d | 23 | f := &tokenAccum{ |
107c1cdb ND |
24 | Filename: filename, |
25 | Bytes: data, | |
26 | Pos: start, | |
27 | StartByte: start.Byte, | |
15c0b25d AP |
28 | } |
29 | ||
30 | %%{ | |
31 | include UnicodeDerived "unicode_derived.rl"; | |
32 | ||
33 | UTF8Cont = 0x80 .. 0xBF; | |
34 | AnyUTF8 = ( | |
35 | 0x00..0x7F | | |
36 | 0xC0..0xDF . UTF8Cont | | |
37 | 0xE0..0xEF . UTF8Cont . UTF8Cont | | |
38 | 0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont | |
39 | ); | |
40 | BrokenUTF8 = any - AnyUTF8; | |
41 | ||
42 | NumberLitContinue = (digit|'.'|('e'|'E') ('+'|'-')? digit); | |
43 | NumberLit = digit ("" | (NumberLitContinue - '.') | (NumberLitContinue* (NumberLitContinue - '.'))); | |
44 | Ident = (ID_Start | '_') (ID_Continue | '-')*; | |
45 | ||
46 | # Symbols that just represent themselves are handled as a single rule. | |
107c1cdb | 47 | SelfToken = "[" | "]" | "(" | ")" | "." | "," | "*" | "/" | "%" | "+" | "-" | "=" | "<" | ">" | "!" | "?" | ":" | "\n" | "&" | "|" | "~" | "^" | ";" | "`" | "'"; |
15c0b25d AP |
48 | |
49 | EqualOp = "=="; | |
50 | NotEqual = "!="; | |
51 | GreaterThanEqual = ">="; | |
52 | LessThanEqual = "<="; | |
53 | LogicalAnd = "&&"; | |
54 | LogicalOr = "||"; | |
55 | ||
56 | Ellipsis = "..."; | |
57 | FatArrow = "=>"; | |
58 | ||
59 | Newline = '\r' ? '\n'; | |
60 | EndOfLine = Newline; | |
61 | ||
62 | BeginStringTmpl = '"'; | |
63 | BeginHeredocTmpl = '<<' ('-')? Ident Newline; | |
64 | ||
65 | Comment = ( | |
107c1cdb ND |
66 | # The :>> operator in these is a "finish-guarded concatenation", |
67 | # which terminates the sequence on its left when it completes | |
68 | # the sequence on its right. | |
69 | # In the single-line comment cases this is allowing us to make | |
70 | # the trailing EndOfLine optional while still having the overall | |
71 | # pattern terminate. In the multi-line case it ensures that | |
72 | # the first comment in the file ends at the first */, rather than | |
73 | # gobbling up all of the "any*" until the _final_ */ in the file. | |
74 | ("#" (any - EndOfLine)* :>> EndOfLine?) | | |
75 | ("//" (any - EndOfLine)* :>> EndOfLine?) | | |
76 | ("/*" any* :>> "*/") | |
15c0b25d AP |
77 | ); |
78 | ||
79 | # Note: hclwrite assumes that only ASCII spaces appear between tokens, | |
80 | # and uses this assumption to recreate the spaces between tokens by | |
81 | # looking at byte offset differences. This means it will produce | |
82 | # incorrect results in the presence of tabs, but that's acceptable | |
83 | # because the canonical style (which hclwrite itself can impose | |
84 | # automatically) is to never use tabs. | |
85 | Spaces = (' ' | 0x09)+; | |
86 | ||
87 | action beginStringTemplate { | |
88 | token(TokenOQuote); | |
89 | fcall stringTemplate; | |
90 | } | |
91 | ||
92 | action endStringTemplate { | |
93 | token(TokenCQuote); | |
94 | fret; | |
95 | } | |
96 | ||
97 | action beginHeredocTemplate { | |
98 | token(TokenOHeredoc); | |
99 | // the token is currently the whole heredoc introducer, like | |
100 | // <<EOT or <<-EOT, followed by a newline. We want to extract | |
101 | // just the "EOT" portion that we'll use as the closing marker. | |
102 | ||
103 | marker := data[ts+2:te-1] | |
104 | if marker[0] == '-' { | |
105 | marker = marker[1:] | |
106 | } | |
107 | if marker[len(marker)-1] == '\r' { | |
108 | marker = marker[:len(marker)-1] | |
109 | } | |
110 | ||
111 | heredocs = append(heredocs, heredocInProgress{ | |
112 | Marker: marker, | |
113 | StartOfLine: true, | |
114 | }) | |
115 | ||
116 | fcall heredocTemplate; | |
117 | } | |
118 | ||
119 | action heredocLiteralEOL { | |
120 | // This action is called specifically when a heredoc literal | |
121 | // ends with a newline character. | |
122 | ||
123 | // This might actually be our end marker. | |
124 | topdoc := &heredocs[len(heredocs)-1] | |
125 | if topdoc.StartOfLine { | |
126 | maybeMarker := bytes.TrimSpace(data[ts:te]) | |
127 | if bytes.Equal(maybeMarker, topdoc.Marker) { | |
128 | // We actually emit two tokens here: the end-of-heredoc | |
129 | // marker first, and then separately the newline that | |
130 | // follows it. This then avoids issues with the closing | |
131 | // marker consuming a newline that would normally be used | |
132 | // to mark the end of an attribute definition. | |
133 | // We might have either a \n sequence or an \r\n sequence | |
134 | // here, so we must handle both. | |
135 | nls := te-1 | |
136 | nle := te | |
137 | te-- | |
138 | if data[te-1] == '\r' { | |
139 | // back up one more byte | |
140 | nls-- | |
141 | te-- | |
142 | } | |
143 | token(TokenCHeredoc); | |
144 | ts = nls | |
145 | te = nle | |
146 | token(TokenNewline); | |
147 | heredocs = heredocs[:len(heredocs)-1] | |
148 | fret; | |
149 | } | |
150 | } | |
151 | ||
152 | topdoc.StartOfLine = true; | |
153 | token(TokenStringLit); | |
154 | } | |
155 | ||
156 | action heredocLiteralMidline { | |
157 | // This action is called when a heredoc literal _doesn't_ end | |
158 | // with a newline character, e.g. because we're about to enter | |
159 | // an interpolation sequence. | |
160 | heredocs[len(heredocs)-1].StartOfLine = false; | |
161 | token(TokenStringLit); | |
162 | } | |
163 | ||
164 | action bareTemplateLiteral { | |
165 | token(TokenStringLit); | |
166 | } | |
167 | ||
168 | action beginTemplateInterp { | |
169 | token(TokenTemplateInterp); | |
170 | braces++; | |
171 | retBraces = append(retBraces, braces); | |
172 | if len(heredocs) > 0 { | |
173 | heredocs[len(heredocs)-1].StartOfLine = false; | |
174 | } | |
175 | fcall main; | |
176 | } | |
177 | ||
178 | action beginTemplateControl { | |
179 | token(TokenTemplateControl); | |
180 | braces++; | |
181 | retBraces = append(retBraces, braces); | |
182 | if len(heredocs) > 0 { | |
183 | heredocs[len(heredocs)-1].StartOfLine = false; | |
184 | } | |
185 | fcall main; | |
186 | } | |
187 | ||
188 | action openBrace { | |
189 | token(TokenOBrace); | |
190 | braces++; | |
191 | } | |
192 | ||
193 | action closeBrace { | |
194 | if len(retBraces) > 0 && retBraces[len(retBraces)-1] == braces { | |
195 | token(TokenTemplateSeqEnd); | |
196 | braces--; | |
197 | retBraces = retBraces[0:len(retBraces)-1] | |
198 | fret; | |
199 | } else { | |
200 | token(TokenCBrace); | |
201 | braces--; | |
202 | } | |
203 | } | |
204 | ||
205 | action closeTemplateSeqEatWhitespace { | |
206 | // Only consume from the retBraces stack and return if we are at | |
207 | // a suitable brace nesting level, otherwise things will get | |
208 | // confused. (Not entering this branch indicates a syntax error, | |
209 | // which we will catch in the parser.) | |
210 | if len(retBraces) > 0 && retBraces[len(retBraces)-1] == braces { | |
211 | token(TokenTemplateSeqEnd); | |
212 | braces--; | |
213 | retBraces = retBraces[0:len(retBraces)-1] | |
214 | fret; | |
215 | } else { | |
216 | // We intentionally generate a TokenTemplateSeqEnd here, | |
217 | // even though the user apparently wanted a brace, because | |
218 | // we want to allow the parser to catch the incorrect use | |
219 | // of a ~} to balance a generic opening brace, rather than | |
220 | // a template sequence. | |
221 | token(TokenTemplateSeqEnd); | |
222 | braces--; | |
223 | } | |
224 | } | |
225 | ||
226 | TemplateInterp = "${" ("~")?; | |
227 | TemplateControl = "%{" ("~")?; | |
228 | EndStringTmpl = '"'; | |
107c1cdb ND |
229 | NewlineChars = ("\r"|"\n"); |
230 | NewlineCharsSeq = NewlineChars+; | |
231 | StringLiteralChars = (AnyUTF8 - NewlineChars); | |
232 | TemplateIgnoredNonBrace = (^'{' %{ fhold; }); | |
233 | TemplateNotInterp = '$' (TemplateIgnoredNonBrace | TemplateInterp); | |
234 | TemplateNotControl = '%' (TemplateIgnoredNonBrace | TemplateControl); | |
235 | QuotedStringLiteralWithEsc = ('\\' StringLiteralChars) | (StringLiteralChars - ("$" | '%' | '"' | "\\")); | |
15c0b25d | 236 | TemplateStringLiteral = ( |
107c1cdb ND |
237 | (TemplateNotInterp) | |
238 | (TemplateNotControl) | | |
239 | (QuotedStringLiteralWithEsc)+ | |
240 | ); | |
15c0b25d | 241 | HeredocStringLiteral = ( |
107c1cdb ND |
242 | (TemplateNotInterp) | |
243 | (TemplateNotControl) | | |
244 | (StringLiteralChars - ("$" | '%'))* | |
245 | ); | |
15c0b25d | 246 | BareStringLiteral = ( |
107c1cdb ND |
247 | (TemplateNotInterp) | |
248 | (TemplateNotControl) | | |
249 | (StringLiteralChars - ("$" | '%'))* | |
250 | ) Newline?; | |
15c0b25d AP |
251 | |
252 | stringTemplate := |* | |
253 | TemplateInterp => beginTemplateInterp; | |
254 | TemplateControl => beginTemplateControl; | |
255 | EndStringTmpl => endStringTemplate; | |
256 | TemplateStringLiteral => { token(TokenQuotedLit); }; | |
107c1cdb | 257 | NewlineCharsSeq => { token(TokenQuotedNewline); }; |
15c0b25d AP |
258 | AnyUTF8 => { token(TokenInvalid); }; |
259 | BrokenUTF8 => { token(TokenBadUTF8); }; | |
260 | *|; | |
261 | ||
262 | heredocTemplate := |* | |
263 | TemplateInterp => beginTemplateInterp; | |
264 | TemplateControl => beginTemplateControl; | |
265 | HeredocStringLiteral EndOfLine => heredocLiteralEOL; | |
266 | HeredocStringLiteral => heredocLiteralMidline; | |
267 | BrokenUTF8 => { token(TokenBadUTF8); }; | |
268 | *|; | |
269 | ||
270 | bareTemplate := |* | |
271 | TemplateInterp => beginTemplateInterp; | |
272 | TemplateControl => beginTemplateControl; | |
273 | BareStringLiteral => bareTemplateLiteral; | |
274 | BrokenUTF8 => { token(TokenBadUTF8); }; | |
275 | *|; | |
276 | ||
277 | identOnly := |* | |
278 | Ident => { token(TokenIdent) }; | |
279 | BrokenUTF8 => { token(TokenBadUTF8) }; | |
280 | AnyUTF8 => { token(TokenInvalid) }; | |
281 | *|; | |
282 | ||
283 | main := |* | |
284 | Spaces => {}; | |
285 | NumberLit => { token(TokenNumberLit) }; | |
286 | Ident => { token(TokenIdent) }; | |
287 | ||
288 | Comment => { token(TokenComment) }; | |
289 | Newline => { token(TokenNewline) }; | |
290 | ||
291 | EqualOp => { token(TokenEqualOp); }; | |
292 | NotEqual => { token(TokenNotEqual); }; | |
293 | GreaterThanEqual => { token(TokenGreaterThanEq); }; | |
294 | LessThanEqual => { token(TokenLessThanEq); }; | |
295 | LogicalAnd => { token(TokenAnd); }; | |
296 | LogicalOr => { token(TokenOr); }; | |
297 | Ellipsis => { token(TokenEllipsis); }; | |
298 | FatArrow => { token(TokenFatArrow); }; | |
299 | SelfToken => { selfToken() }; | |
300 | ||
301 | "{" => openBrace; | |
302 | "}" => closeBrace; | |
303 | ||
304 | "~}" => closeTemplateSeqEatWhitespace; | |
305 | ||
306 | BeginStringTmpl => beginStringTemplate; | |
307 | BeginHeredocTmpl => beginHeredocTemplate; | |
308 | ||
309 | BrokenUTF8 => { token(TokenBadUTF8) }; | |
310 | AnyUTF8 => { token(TokenInvalid) }; | |
311 | *|; | |
312 | ||
313 | }%% | |
314 | ||
315 | // Ragel state | |
316 | p := 0 // "Pointer" into data | |
317 | pe := len(data) // End-of-data "pointer" | |
318 | ts := 0 | |
319 | te := 0 | |
320 | act := 0 | |
321 | eof := pe | |
322 | var stack []int | |
323 | var top int | |
324 | ||
325 | var cs int // current state | |
326 | switch mode { | |
327 | case scanNormal: | |
328 | cs = hcltok_en_main | |
329 | case scanTemplate: | |
330 | cs = hcltok_en_bareTemplate | |
331 | case scanIdentOnly: | |
332 | cs = hcltok_en_identOnly | |
333 | default: | |
334 | panic("invalid scanMode") | |
335 | } | |
336 | ||
337 | braces := 0 | |
338 | var retBraces []int // stack of brace levels that cause us to use fret | |
339 | var heredocs []heredocInProgress // stack of heredocs we're currently processing | |
340 | ||
341 | %%{ | |
342 | prepush { | |
343 | stack = append(stack, 0); | |
344 | } | |
345 | postpop { | |
346 | stack = stack[:len(stack)-1]; | |
347 | } | |
348 | }%% | |
349 | ||
350 | // Make Go compiler happy | |
351 | _ = ts | |
352 | _ = te | |
353 | _ = act | |
354 | _ = eof | |
355 | ||
356 | token := func (ty TokenType) { | |
357 | f.emitToken(ty, ts, te) | |
358 | } | |
359 | selfToken := func () { | |
360 | b := data[ts:te] | |
361 | if len(b) != 1 { | |
362 | // should never happen | |
363 | panic("selfToken only works for single-character tokens") | |
364 | } | |
365 | f.emitToken(TokenType(b[0]), ts, te) | |
366 | } | |
367 | ||
368 | %%{ | |
369 | write init nocs; | |
370 | write exec; | |
371 | }%% | |
372 | ||
373 | // If we fall out here without being in a final state then we've | |
374 | // encountered something that the scanner can't match, which we'll | |
375 | // deal with as an invalid token. | |
376 | if cs < hcltok_first_final { | |
377 | if mode == scanTemplate && len(stack) == 0 { | |
378 | // If we're scanning a bare template then any straggling | |
379 | // top-level stuff is actually literal string, rather than | |
380 | // invalid. This handles the case where the template ends | |
381 | // with a single "$" or "%", which trips us up because we | |
382 | // want to see another character to decide if it's a sequence | |
383 | // or an escape. | |
384 | f.emitToken(TokenStringLit, ts, len(data)) | |
385 | } else { | |
386 | f.emitToken(TokenInvalid, ts, len(data)) | |
387 | } | |
388 | } | |
389 | ||
390 | // We always emit a synthetic EOF token at the end, since it gives the | |
391 | // parser position information for an "unexpected EOF" diagnostic. | |
392 | f.emitToken(TokenEOF, len(data), len(data)) | |
393 | ||
394 | return f.Tokens | |
395 | } |