7 "github.com/hashicorp/hil/ast"
10 // Scan returns a channel that receives Tokens from the given input string.
12 // The scanner's job is just to partition the string into meaningful parts.
13 // It doesn't do any transformation of the raw input string, so the caller
14 // must deal with any further interpretation required, such as parsing INTEGER
15 // tokens into real ints, or dealing with escape sequences in LITERAL or
18 // Strings in the returned tokens are slices from the original string.
20 // startPos should be set to ast.InitPos unless the caller knows that
21 // this interpolation string is part of a larger file and knows the position
22 // of the first character in that larger file.
23 func Scan(s string, startPos ast.Pos) <-chan *Token {
24 ch := make(chan *Token)
25 go scan(s, ch, startPos)
29 func scan(s string, ch chan<- *Token, pos ast.Pos) {
30 // 'remain' starts off as the whole string but we gradually
31 // slice off the front of it as we work our way through.
34 // nesting keeps track of how many ${ .. } sequences we are
35 // inside, so we can recognize the minor differences in syntax
36 // between outer string literals (LITERAL tokens) and quoted
37 // string literals (STRING tokens).
40 // We're going to flip back and forth between parsing literals/strings
41 // and parsing interpolation sequences ${ .. } until we reach EOF or
42 // some INVALID token.
46 // Literal string processing first, since the beginning of
47 // a string is always outside of an interpolation sequence.
48 literalVal, terminator := scanLiteral(remain, pos, nesting > 0)
50 if len(literalVal) > 0 {
60 remain = remain[len(literalVal):]
64 remain = remain[len(terminator.Content):]
66 // Safe to use len() here because none of the terminator tokens
67 // can contain multi-byte UTF-8 sequences.
68 pos.Column = pos.Column + len(terminator.Content)
70 switch terminator.Type {
72 // Synthetic EOF after invalid token, since further scanning
73 // is likely to just produce more garbage.
86 // nothing special to do
88 // Should never happen
89 panic("invalid string/literal terminator")
92 // Now we do the processing of the insides of ${ .. } sequences.
93 // This loop terminates when we encounter either a closing } or
94 // an opening ", which will cause us to return to literal processing.
98 token, size, newPos := scanInterpolationToken(remain, pos)
100 remain = remain[size:]
105 // Synthetic EOF after invalid token, since further scanning
106 // is likely to just produce more garbage.
115 // (though a syntax error that we'll catch in the parser)
120 // Can happen if there are unbalanced ${ and } sequences
121 // in the input, which we'll catch in the parser.
126 // Beginning of nested quoted string
135 // Returns the token found at the start of the given string, followed by
136 // the number of bytes that were consumed from the string and the adjusted
139 // Note that the number of bytes consumed can be more than the length of
140 // the returned token contents if the string begins with whitespace, since
141 // it will be silently consumed before reading the token.
142 func scanInterpolationToken(s string, startPos ast.Pos) (*Token, int, ast.Pos) {
146 // Consume whitespace, if any
147 for len(s) > 0 && byteIsSpace(s[0]) {
158 // Unexpected EOF during sequence
171 case '(', ')', '[', ']', ',', '.', '+', '-', '*', '/', '%', '?', ':':
172 // Easy punctuation symbols that don't have any special meaning
173 // during scanning, and that stand for themselves in the
174 // TokenType enumeration.
176 Type: TokenType(next),
193 if len(s) >= 2 && s[:2] == "!=" {
207 if len(s) >= 2 && s[:2] == "<=" {
221 if len(s) >= 2 && s[:2] == ">=" {
235 if len(s) >= 2 && s[:2] == "==" {
242 // A single equals is not a valid operator
250 if len(s) >= 2 && s[:2] == "&&" {
264 if len(s) >= 2 && s[:2] == "||" {
278 if next >= '0' && next <= '9' {
279 num, numType := scanNumber(s)
285 } else if stringStartsWithIdentifier(s) {
286 ident, runeLen := scanIdentifier(s)
287 tokenType := IDENTIFIER
288 if ident == "true" || ident == "false" {
296 // Skip usual token handling because it doesn't
297 // know how to deal with multi-byte UTF-8 sequences.
298 pos.Column = pos.Column + runeLen
299 return token, size + len(ident), pos
301 _, byteLen := utf8.DecodeRuneInString(s)
304 Content: s[:byteLen],
307 // Skip usual token handling because it doesn't
308 // know how to deal with multi-byte UTF-8 sequences.
309 pos.Column = pos.Column + 1
310 return token, size + byteLen, pos
314 // Here we assume that the token content contains no multi-byte UTF-8 sequences,
315 // because we dealt with UTF-8 characters as a special case where
317 size = size + len(token.Content)
318 pos.Column = pos.Column + len(token.Content)
320 return token, size, pos
323 // Returns the (possibly-empty) prefix of the given string that represents
324 // a literal, followed by the token that marks the end of the literal.
325 func scanLiteral(s string, startPos ast.Pos, nested bool) (string, *Token) {
328 var terminator *Token
331 if litLen >= len(s) {
333 // We've ended in the middle of a quoted string,
334 // which means this token is actually invalid.
351 if next == '$' && len(s) > litLen+1 {
352 follow := s[litLen+1]
357 Content: s[litLen : litLen+2],
360 pos.Column = pos.Column + 2
362 } else if follow == '$' {
363 // Double-$ escapes the special processing of $,
364 // so we will consume both characters here.
365 pos.Column = pos.Column + 2
371 // special handling that applies only to quoted strings
376 Content: s[litLen : litLen+1],
379 pos.Column = pos.Column + 1
383 // Escaped quote marks do not terminate the string.
385 // All we do here in the scanner is avoid terminating a string
386 // due to an escaped quote. The parser is responsible for the
387 // full handling of escape sequences, since it's able to produce
388 // better error messages than we can produce in here.
389 if next == '\\' && len(s) > litLen+1 {
390 follow := s[litLen+1]
393 // \" escapes the special processing of ",
394 // so we will consume both characters here.
395 pos.Column = pos.Column + 2
409 // "Column" measures runes, so we need to actually consume
410 // a valid UTF-8 character here.
411 _, size := utf8.DecodeRuneInString(s[litLen:])
412 litLen = litLen + size
417 return s[:litLen], terminator
420 // scanNumber returns the extent of the prefix of the string that represents
421 // a valid number, along with what type of number it represents: INTEGER or FLOAT.
423 // scanNumber does only basic character analysis: numbers consist of digits
424 // and periods, with at least one period signalling a FLOAT. It's the parser's
425 // responsibility to validate the form and range of the number, such as ensuring
426 // that a FLOAT actually contains only one period, etc.
427 func scanNumber(s string) (string, TokenType) {
432 if byteLen >= len(s) {
437 if next != '.' && (next < '0' || next > '9') {
438 // If our last value was a period, then we're not a float,
439 // we're just an integer that ends in a period.
440 if period == byteLen-1 {
449 // If we've already seen a period, break out
461 return s[:byteLen], numType
464 // scanIdentifier returns the extent of the prefix of the string that
465 // represents a valid identifier, along with the length of that prefix
468 // Identifiers may contain utf8-encoded non-Latin letters, which will
469 // cause the returned "rune length" to be shorter than the byte length
470 // of the returned string.
471 func scanIdentifier(s string) (string, int) {
475 if byteLen >= len(s) {
479 nextRune, size := utf8.DecodeRuneInString(s[byteLen:])
480 if !(nextRune == '_' ||
484 unicode.IsNumber(nextRune) ||
485 unicode.IsLetter(nextRune) ||
486 unicode.IsMark(nextRune)) {
490 // If we reach a star, it must be between periods to be part
491 // of the same identifier.
492 if nextRune == '*' && s[byteLen-1] != '.' {
496 // If our previous character was a star, then the current must
497 // be a period. Otherwise, undo that and exit.
498 if byteLen > 0 && s[byteLen-1] == '*' && nextRune != '.' {
500 if s[byteLen-1] == '.' {
507 byteLen = byteLen + size
508 runeLen = runeLen + 1
511 return s[:byteLen], runeLen
514 // byteIsSpace implements a restrictive interpretation of spaces that includes
515 // only what's valid inside interpolation sequences: spaces, tabs, carriage returns, newlines.
516 func byteIsSpace(b byte) bool {
518 case ' ', '\t', '\r', '\n':
525 // stringStartsWithIdentifier returns true if the given string begins with
526 // a character that is a legal start of an identifier: an underscore or
527 // any character that Unicode considers to be a letter.
528 func stringStartsWithIdentifier(s string) bool {
535 // Easy ASCII cases first
536 if (first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z') || first == '_' {
540 // If our first byte begins a UTF-8 sequence then the sequence might
541 // be a unicode letter.
542 if utf8.RuneStart(first) {
543 firstRune, _ := utf8.DecodeRuneInString(s)
544 if unicode.IsLetter(firstRune) {