package scanner import ( "unicode" "unicode/utf8" "github.com/hashicorp/hil/ast" ) // Scan returns a channel that recieves Tokens from the given input string. // // The scanner's job is just to partition the string into meaningful parts. // It doesn't do any transformation of the raw input string, so the caller // must deal with any further interpretation required, such as parsing INTEGER // tokens into real ints, or dealing with escape sequences in LITERAL or // STRING tokens. // // Strings in the returned tokens are slices from the original string. // // startPos should be set to ast.InitPos unless the caller knows that // this interpolation string is part of a larger file and knows the position // of the first character in that larger file. func Scan(s string, startPos ast.Pos) <-chan *Token { ch := make(chan *Token) go scan(s, ch, startPos) return ch } func scan(s string, ch chan<- *Token, pos ast.Pos) { // 'remain' starts off as the whole string but we gradually // slice of the front of it as we work our way through. remain := s // nesting keeps track of how many ${ .. } sequences we are // inside, so we can recognize the minor differences in syntax // between outer string literals (LITERAL tokens) and quoted // string literals (STRING tokens). nesting := 0 // We're going to flip back and forth between parsing literals/strings // and parsing interpolation sequences ${ .. } until we reach EOF or // some INVALID token. All: for { startPos := pos // Literal string processing first, since the beginning of // a string is always outside of an interpolation sequence. literalVal, terminator := scanLiteral(remain, pos, nesting > 0) if len(literalVal) > 0 { litType := LITERAL if nesting > 0 { litType = STRING } ch <- &Token{ Type: litType, Content: literalVal, Pos: startPos, } remain = remain[len(literalVal):] } ch <- terminator remain = remain[len(terminator.Content):] pos = terminator.Pos // Safe to use len() here because none of the terminator tokens // can contain UTF-8 sequences. pos.Column = pos.Column + len(terminator.Content) switch terminator.Type { case INVALID: // Synthetic EOF after invalid token, since further scanning // is likely to just produce more garbage. ch <- &Token{ Type: EOF, Content: "", Pos: pos, } break All case EOF: // All done! break All case BEGIN: nesting++ case CQUOTE: // nothing special to do default: // Should never happen panic("invalid string/literal terminator") } // Now we do the processing of the insides of ${ .. } sequences. // This loop terminates when we encounter either a closing } or // an opening ", which will cause us to return to literal processing. Interpolation: for { token, size, newPos := scanInterpolationToken(remain, pos) ch <- token remain = remain[size:] pos = newPos switch token.Type { case INVALID: // Synthetic EOF after invalid token, since further scanning // is likely to just produce more garbage. ch <- &Token{ Type: EOF, Content: "", Pos: pos, } break All case EOF: // All done // (though a syntax error that we'll catch in the parser) break All case END: nesting-- if nesting < 0 { // Can happen if there are unbalanced ${ and } sequences // in the input, which we'll catch in the parser. nesting = 0 } break Interpolation case OQUOTE: // Beginning of nested quoted string break Interpolation } } } close(ch) } // Returns the token found at the start of the given string, followed by // the number of bytes that were consumed from the string and the adjusted // source position. // // Note that the number of bytes consumed can be more than the length of // the returned token contents if the string begins with whitespace, since // it will be silently consumed before reading the token. func scanInterpolationToken(s string, startPos ast.Pos) (*Token, int, ast.Pos) { pos := startPos size := 0 // Consume whitespace, if any for len(s) > 0 && byteIsSpace(s[0]) { if s[0] == '\n' { pos.Column = 1 pos.Line++ } else { pos.Column++ } size++ s = s[1:] } // Unexpected EOF during sequence if len(s) == 0 { return &Token{ Type: EOF, Content: "", Pos: pos, }, size, pos } next := s[0] var token *Token switch next { case '(', ')', '[', ']', ',', '.', '+', '-', '*', '/', '%', '?', ':': // Easy punctuation symbols that don't have any special meaning // during scanning, and that stand for themselves in the // TokenType enumeration. token = &Token{ Type: TokenType(next), Content: s[:1], Pos: pos, } case '}': token = &Token{ Type: END, Content: s[:1], Pos: pos, } case '"': token = &Token{ Type: OQUOTE, Content: s[:1], Pos: pos, } case '!': if len(s) >= 2 && s[:2] == "!=" { token = &Token{ Type: NOTEQUAL, Content: s[:2], Pos: pos, } } else { token = &Token{ Type: BANG, Content: s[:1], Pos: pos, } } case '<': if len(s) >= 2 && s[:2] == "<=" { token = &Token{ Type: LTE, Content: s[:2], Pos: pos, } } else { token = &Token{ Type: LT, Content: s[:1], Pos: pos, } } case '>': if len(s) >= 2 && s[:2] == ">=" { token = &Token{ Type: GTE, Content: s[:2], Pos: pos, } } else { token = &Token{ Type: GT, Content: s[:1], Pos: pos, } } case '=': if len(s) >= 2 && s[:2] == "==" { token = &Token{ Type: EQUAL, Content: s[:2], Pos: pos, } } else { // A single equals is not a valid operator token = &Token{ Type: INVALID, Content: s[:1], Pos: pos, } } case '&': if len(s) >= 2 && s[:2] == "&&" { token = &Token{ Type: AND, Content: s[:2], Pos: pos, } } else { token = &Token{ Type: INVALID, Content: s[:1], Pos: pos, } } case '|': if len(s) >= 2 && s[:2] == "||" { token = &Token{ Type: OR, Content: s[:2], Pos: pos, } } else { token = &Token{ Type: INVALID, Content: s[:1], Pos: pos, } } default: if next >= '0' && next <= '9' { num, numType := scanNumber(s) token = &Token{ Type: numType, Content: num, Pos: pos, } } else if stringStartsWithIdentifier(s) { ident, runeLen := scanIdentifier(s) tokenType := IDENTIFIER if ident == "true" || ident == "false" { tokenType = BOOL } token = &Token{ Type: tokenType, Content: ident, Pos: pos, } // Skip usual token handling because it doesn't // know how to deal with UTF-8 sequences. pos.Column = pos.Column + runeLen return token, size + len(ident), pos } else { _, byteLen := utf8.DecodeRuneInString(s) token = &Token{ Type: INVALID, Content: s[:byteLen], Pos: pos, } // Skip usual token handling because it doesn't // know how to deal with UTF-8 sequences. pos.Column = pos.Column + 1 return token, size + byteLen, pos } } // Here we assume that the token content contains no UTF-8 sequences, // because we dealt with UTF-8 characters as a special case where // necessary above. size = size + len(token.Content) pos.Column = pos.Column + len(token.Content) return token, size, pos } // Returns the (possibly-empty) prefix of the given string that represents // a literal, followed by the token that marks the end of the literal. func scanLiteral(s string, startPos ast.Pos, nested bool) (string, *Token) { litLen := 0 pos := startPos var terminator *Token for { if litLen >= len(s) { if nested { // We've ended in the middle of a quoted string, // which means this token is actually invalid. return "", &Token{ Type: INVALID, Content: s, Pos: startPos, } } terminator = &Token{ Type: EOF, Content: "", Pos: pos, } break } next := s[litLen] if next == '$' && len(s) > litLen+1 { follow := s[litLen+1] if follow == '{' { terminator = &Token{ Type: BEGIN, Content: s[litLen : litLen+2], Pos: pos, } pos.Column = pos.Column + 2 break } else if follow == '$' { // Double-$ escapes the special processing of $, // so we will consume both characters here. pos.Column = pos.Column + 2 litLen = litLen + 2 continue } } // special handling that applies only to quoted strings if nested { if next == '"' { terminator = &Token{ Type: CQUOTE, Content: s[litLen : litLen+1], Pos: pos, } pos.Column = pos.Column + 1 break } // Escaped quote marks do not terminate the string. // // All we do here in the scanner is avoid terminating a string // due to an escaped quote. The parser is responsible for the // full handling of escape sequences, since it's able to produce // better error messages than we can produce in here. if next == '\\' && len(s) > litLen+1 { follow := s[litLen+1] if follow == '"' { // \" escapes the special processing of ", // so we will consume both characters here. pos.Column = pos.Column + 2 litLen = litLen + 2 continue } else if follow == '\\' { // \\ escapes \ // so we will consume both characters here. pos.Column = pos.Column + 2 litLen = litLen + 2 continue } } } if next == '\n' { pos.Column = 1 pos.Line++ litLen++ } else { pos.Column++ // "Column" measures runes, so we need to actually consume // a valid UTF-8 character here. _, size := utf8.DecodeRuneInString(s[litLen:]) litLen = litLen + size } } return s[:litLen], terminator } // scanNumber returns the extent of the prefix of the string that represents // a valid number, along with what type of number it represents: INT or FLOAT. // // scanNumber does only basic character analysis: numbers consist of digits // and periods, with at least one period signalling a FLOAT. It's the parser's // responsibility to validate the form and range of the number, such as ensuring // that a FLOAT actually contains only one period, etc. func scanNumber(s string) (string, TokenType) { period := -1 byteLen := 0 numType := INTEGER for { if byteLen >= len(s) { break } next := s[byteLen] if next != '.' && (next < '0' || next > '9') { // If our last value was a period, then we're not a float, // we're just an integer that ends in a period. if period == byteLen-1 { byteLen-- numType = INTEGER } break } if next == '.' { // If we've already seen a period, break out if period >= 0 { break } period = byteLen numType = FLOAT } byteLen++ } return s[:byteLen], numType } // scanIdentifier returns the extent of the prefix of the string that // represents a valid identifier, along with the length of that prefix // in runes. // // Identifiers may contain utf8-encoded non-Latin letters, which will // cause the returned "rune length" to be shorter than the byte length // of the returned string. func scanIdentifier(s string) (string, int) { byteLen := 0 runeLen := 0 for { if byteLen >= len(s) { break } nextRune, size := utf8.DecodeRuneInString(s[byteLen:]) if !(nextRune == '_' || nextRune == '-' || nextRune == '.' || nextRune == '*' || unicode.IsNumber(nextRune) || unicode.IsLetter(nextRune) || unicode.IsMark(nextRune)) { break } // If we reach a star, it must be between periods to be part // of the same identifier. if nextRune == '*' && s[byteLen-1] != '.' { break } // If our previous character was a star, then the current must // be period. Otherwise, undo that and exit. if byteLen > 0 && s[byteLen-1] == '*' && nextRune != '.' { byteLen-- if s[byteLen-1] == '.' { byteLen-- } break } byteLen = byteLen + size runeLen = runeLen + 1 } return s[:byteLen], runeLen } // byteIsSpace implements a restrictive interpretation of spaces that includes // only what's valid inside interpolation sequences: spaces, tabs, newlines. func byteIsSpace(b byte) bool { switch b { case ' ', '\t', '\r', '\n': return true default: return false } } // stringStartsWithIdentifier returns true if the given string begins with // a character that is a legal start of an identifier: an underscore or // any character that Unicode considers to be a letter. func stringStartsWithIdentifier(s string) bool { if len(s) == 0 { return false } first := s[0] // Easy ASCII cases first if (first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z') || first == '_' { return true } // If our first byte begins a UTF-8 sequence then the sequence might // be a unicode letter. if utf8.RuneStart(first) { firstRune, _ := utf8.DecodeRuneInString(s) if unicode.IsLetter(firstRune) { return true } } return false }