aboutsummaryrefslogblamecommitdiffhomepage
path: root/vendor/github.com/hashicorp/hil/scanner/scanner.go
blob: 86085de018fa2d8455f13f0eb90be4e0e8eaa834 (plain) (tree)












































































































































































































































































































































































































                                                                                                





                                                                                   
























































































































































                                                                                             
package scanner

import (
	"unicode"
	"unicode/utf8"

	"github.com/hashicorp/hil/ast"
)

// Scan returns a channel that receives Tokens from the given input string.
//
// The scanner's job is just to partition the string into meaningful parts.
// It doesn't do any transformation of the raw input string, so the caller
// must deal with any further interpretation required, such as parsing INTEGER
// tokens into real ints, or dealing with escape sequences in LITERAL or
// STRING tokens.
//
// Strings in the returned tokens are slices from the original string.
//
// startPos should be set to ast.InitPos unless the caller knows that
// this interpolation string is part of a larger file and knows the position
// of the first character in that larger file.
//
// The returned channel is unbuffered and is closed by the scanning
// goroutine after the final EOF token is sent, so the caller must drain
// it completely or the goroutine will block forever.
func Scan(s string, startPos ast.Pos) <-chan *Token {
	ch := make(chan *Token)
	go scan(s, ch, startPos)
	return ch
}

// scan is the goroutine body behind Scan. It walks the input string s,
// sending each token on ch, and closes ch after the final EOF token
// (which may be synthetic, following an INVALID token) has been sent.
// pos tracks the source position of the next unconsumed byte.
func scan(s string, ch chan<- *Token, pos ast.Pos) {
	// 'remain' starts off as the whole string but we gradually
	// slice off the front of it as we work our way through.
	remain := s

	// nesting keeps track of how many ${ .. } sequences we are
	// inside, so we can recognize the minor differences in syntax
	// between outer string literals (LITERAL tokens) and quoted
	// string literals (STRING tokens).
	nesting := 0

	// We're going to flip back and forth between parsing literals/strings
	// and parsing interpolation sequences ${ .. } until we reach EOF or
	// some INVALID token.
All:
	for {
		startPos := pos
		// Literal string processing first, since the beginning of
		// a string is always outside of an interpolation sequence.
		literalVal, terminator := scanLiteral(remain, pos, nesting > 0)

		if len(literalVal) > 0 {
			litType := LITERAL
			if nesting > 0 {
				// Inside ${ .. } we're scanning the body of a
				// quoted string rather than an outer literal.
				litType = STRING
			}
			ch <- &Token{
				Type:    litType,
				Content: literalVal,
				Pos:     startPos,
			}
			remain = remain[len(literalVal):]
		}

		ch <- terminator
		remain = remain[len(terminator.Content):]
		pos = terminator.Pos
		// Safe to use len() here because none of the terminator tokens
		// can contain UTF-8 sequences.
		pos.Column = pos.Column + len(terminator.Content)

		switch terminator.Type {
		case INVALID:
			// Synthetic EOF after invalid token, since further scanning
			// is likely to just produce more garbage.
			ch <- &Token{
				Type:    EOF,
				Content: "",
				Pos:     pos,
			}
			break All
		case EOF:
			// All done!
			break All
		case BEGIN:
			nesting++
		case CQUOTE:
			// nothing special to do
		default:
			// Should never happen
			panic("invalid string/literal terminator")
		}

		// Now we do the processing of the insides of ${ .. } sequences.
		// This loop terminates when we encounter either a closing } or
		// an opening ", which will cause us to return to literal processing.
	Interpolation:
		for {

			token, size, newPos := scanInterpolationToken(remain, pos)
			ch <- token
			remain = remain[size:]
			pos = newPos

			switch token.Type {
			case INVALID:
				// Synthetic EOF after invalid token, since further scanning
				// is likely to just produce more garbage.
				ch <- &Token{
					Type:    EOF,
					Content: "",
					Pos:     pos,
				}
				break All
			case EOF:
				// All done
				// (though a syntax error that we'll catch in the parser)
				break All
			case END:
				nesting--
				if nesting < 0 {
					// Can happen if there are unbalanced ${ and } sequences
					// in the input, which we'll catch in the parser.
					nesting = 0
				}
				break Interpolation
			case OQUOTE:
				// Beginning of nested quoted string
				break Interpolation
			}
		}
	}

	close(ch)
}

// Returns the token found at the start of the given string, followed by
// the number of bytes that were consumed from the string and the adjusted
// source position.
//
// Note that the number of bytes consumed can be more than the length of
// the returned token contents if the string begins with whitespace, since
// it will be silently consumed before reading the token.
func scanInterpolationToken(s string, startPos ast.Pos) (*Token, int, ast.Pos) {
	pos := startPos
	size := 0

	// Consume whitespace, if any
	for len(s) > 0 && byteIsSpace(s[0]) {
		if s[0] == '\n' {
			// Newlines reset the column and advance the line.
			pos.Column = 1
			pos.Line++
		} else {
			pos.Column++
		}
		size++
		s = s[1:]
	}

	// Unexpected EOF during sequence
	if len(s) == 0 {
		return &Token{
			Type:    EOF,
			Content: "",
			Pos:     pos,
		}, size, pos
	}

	next := s[0]
	var token *Token

	switch next {
	case '(', ')', '[', ']', ',', '.', '+', '-', '*', '/', '%', '?', ':':
		// Easy punctuation symbols that don't have any special meaning
		// during scanning, and that stand for themselves in the
		// TokenType enumeration.
		token = &Token{
			Type:    TokenType(next),
			Content: s[:1],
			Pos:     pos,
		}
	case '}':
		token = &Token{
			Type:    END,
			Content: s[:1],
			Pos:     pos,
		}
	case '"':
		token = &Token{
			Type:    OQUOTE,
			Content: s[:1],
			Pos:     pos,
		}
	case '!':
		// Two-character operator "!=" takes precedence over bare "!".
		if len(s) >= 2 && s[:2] == "!=" {
			token = &Token{
				Type:    NOTEQUAL,
				Content: s[:2],
				Pos:     pos,
			}
		} else {
			token = &Token{
				Type:    BANG,
				Content: s[:1],
				Pos:     pos,
			}
		}
	case '<':
		if len(s) >= 2 && s[:2] == "<=" {
			token = &Token{
				Type:    LTE,
				Content: s[:2],
				Pos:     pos,
			}
		} else {
			token = &Token{
				Type:    LT,
				Content: s[:1],
				Pos:     pos,
			}
		}
	case '>':
		if len(s) >= 2 && s[:2] == ">=" {
			token = &Token{
				Type:    GTE,
				Content: s[:2],
				Pos:     pos,
			}
		} else {
			token = &Token{
				Type:    GT,
				Content: s[:1],
				Pos:     pos,
			}
		}
	case '=':
		if len(s) >= 2 && s[:2] == "==" {
			token = &Token{
				Type:    EQUAL,
				Content: s[:2],
				Pos:     pos,
			}
		} else {
			// A single equals is not a valid operator
			token = &Token{
				Type:    INVALID,
				Content: s[:1],
				Pos:     pos,
			}
		}
	case '&':
		// Only "&&" is valid; a lone '&' is an error.
		if len(s) >= 2 && s[:2] == "&&" {
			token = &Token{
				Type:    AND,
				Content: s[:2],
				Pos:     pos,
			}
		} else {
			token = &Token{
				Type:    INVALID,
				Content: s[:1],
				Pos:     pos,
			}
		}
	case '|':
		// Only "||" is valid; a lone '|' is an error.
		if len(s) >= 2 && s[:2] == "||" {
			token = &Token{
				Type:    OR,
				Content: s[:2],
				Pos:     pos,
			}
		} else {
			token = &Token{
				Type:    INVALID,
				Content: s[:1],
				Pos:     pos,
			}
		}
	default:
		if next >= '0' && next <= '9' {
			num, numType := scanNumber(s)
			token = &Token{
				Type:    numType,
				Content: num,
				Pos:     pos,
			}
		} else if stringStartsWithIdentifier(s) {
			ident, runeLen := scanIdentifier(s)
			tokenType := IDENTIFIER
			if ident == "true" || ident == "false" {
				tokenType = BOOL
			}
			token = &Token{
				Type:    tokenType,
				Content: ident,
				Pos:     pos,
			}
			// Skip usual token handling because it doesn't
			// know how to deal with UTF-8 sequences.
			pos.Column = pos.Column + runeLen
			return token, size + len(ident), pos
		} else {
			_, byteLen := utf8.DecodeRuneInString(s)
			token = &Token{
				Type:    INVALID,
				Content: s[:byteLen],
				Pos:     pos,
			}
			// Skip usual token handling because it doesn't
			// know how to deal with UTF-8 sequences.
			pos.Column = pos.Column + 1
			return token, size + byteLen, pos
		}
	}

	// Here we assume that the token content contains no UTF-8 sequences,
	// because we dealt with UTF-8 characters as a special case where
	// necessary above.
	size = size + len(token.Content)
	pos.Column = pos.Column + len(token.Content)

	return token, size, pos
}

// Returns the (possibly-empty) prefix of the given string that represents
// a literal, followed by the token that marks the end of the literal.
//
// When nested is true the literal is the body of a quoted string inside
// an interpolation, so an unescaped '"' terminates it (CQUOTE) and
// reaching end-of-input without one produces an INVALID token.
func scanLiteral(s string, startPos ast.Pos, nested bool) (string, *Token) {
	litLen := 0
	pos := startPos
	var terminator *Token
	for {

		if litLen >= len(s) {
			if nested {
				// We've ended in the middle of a quoted string,
				// which means this token is actually invalid.
				return "", &Token{
					Type:    INVALID,
					Content: s,
					Pos:     startPos,
				}
			}
			terminator = &Token{
				Type:    EOF,
				Content: "",
				Pos:     pos,
			}
			break
		}

		next := s[litLen]

		if next == '$' && len(s) > litLen+1 {
			follow := s[litLen+1]

			if follow == '{' {
				// "${" begins an interpolation sequence and thus
				// terminates this literal.
				terminator = &Token{
					Type:    BEGIN,
					Content: s[litLen : litLen+2],
					Pos:     pos,
				}
				pos.Column = pos.Column + 2
				break
			} else if follow == '$' {
				// Double-$ escapes the special processing of $,
				// so we will consume both characters here.
				pos.Column = pos.Column + 2
				litLen = litLen + 2
				continue
			}
		}

		// special handling that applies only to quoted strings
		if nested {
			if next == '"' {
				terminator = &Token{
					Type:    CQUOTE,
					Content: s[litLen : litLen+1],
					Pos:     pos,
				}
				pos.Column = pos.Column + 1
				break
			}

			// Escaped quote marks do not terminate the string.
			//
			// All we do here in the scanner is avoid terminating a string
			// due to an escaped quote. The parser is responsible for the
			// full handling of escape sequences, since it's able to produce
			// better error messages than we can produce in here.
			if next == '\\' && len(s) > litLen+1 {
				follow := s[litLen+1]

				if follow == '"' {
					// \" escapes the special processing of ",
					// so we will consume both characters here.
					pos.Column = pos.Column + 2
					litLen = litLen + 2
					continue
				} else if follow == '\\' {
					// \\ escapes \
					// so we will consume both characters here.
					pos.Column = pos.Column + 2
					litLen = litLen + 2
					continue
				}
			}
		}

		if next == '\n' {
			pos.Column = 1
			pos.Line++
			litLen++
		} else {
			pos.Column++

			// "Column" measures runes, so we need to actually consume
			// a valid UTF-8 character here.
			_, size := utf8.DecodeRuneInString(s[litLen:])
			litLen = litLen + size
		}

	}

	return s[:litLen], terminator
}

// scanNumber returns the extent of the prefix of the string that represents
// a valid number, along with what type of number it represents: INT or FLOAT.
//
// scanNumber does only basic character analysis: numbers consist of digits
// and periods, with at least one period signalling a FLOAT. It's the parser's
// responsibility to validate the form and range of the number, such as ensuring
// that a FLOAT actually contains only one period, etc.
func scanNumber(s string) (string, TokenType) {
	period := -1
	byteLen := 0
	numType := INTEGER
	for byteLen < len(s) {
		next := s[byteLen]
		if next != '.' && (next < '0' || next > '9') {
			// Not a number character at all; the number ends here.
			break
		}

		if next == '.' {
			// A second period is not part of this number; it will be
			// scanned as a separate '.' token.
			if period >= 0 {
				break
			}

			period = byteLen
			numType = FLOAT
		}

		byteLen++
	}

	// A trailing period is not part of the number: "12." is the integer
	// 12 followed by a '.' token. This check is done after the loop so
	// that it also applies when the period is the last byte of the input,
	// which previously was mis-reported as a FLOAT ending in '.'.
	// The period >= 0 guard avoids a negative slice bound when no number
	// characters were consumed at all.
	if period >= 0 && period == byteLen-1 {
		byteLen--
		numType = INTEGER
	}

	return s[:byteLen], numType
}

// scanIdentifier returns the extent of the prefix of the string that
// represents a valid identifier, along with the length of that prefix
// in runes.
//
// Identifiers may contain utf8-encoded non-Latin letters, which will
// cause the returned "rune length" to be shorter than the byte length
// of the returned string.
func scanIdentifier(s string) (string, int) {
	byteLen := 0
	runeLen := 0
	for byteLen < len(s) {
		nextRune, size := utf8.DecodeRuneInString(s[byteLen:])
		if !(nextRune == '_' ||
			nextRune == '-' ||
			nextRune == '.' ||
			nextRune == '*' ||
			unicode.IsNumber(nextRune) ||
			unicode.IsLetter(nextRune) ||
			unicode.IsMark(nextRune)) {
			break
		}

		// If we reach a star, it must be preceded by a period to be part
		// of the same identifier. The byteLen > 0 guard prevents indexing
		// before the start of the string if the input begins with a star
		// (which the scanner's callers never produce, but a direct caller
		// might).
		if nextRune == '*' && (byteLen == 0 || s[byteLen-1] != '.') {
			break
		}

		// If our previous character was a star, then the current must
		// be period. Otherwise, undo that and exit. The undone bytes
		// ('*' and possibly '.') are single-byte runes, so runeLen is
		// decremented in step with byteLen to keep the reported rune
		// count consistent with the returned string (the caller uses
		// it to advance the source column).
		if byteLen > 0 && s[byteLen-1] == '*' && nextRune != '.' {
			byteLen--
			runeLen--
			if byteLen > 0 && s[byteLen-1] == '.' {
				byteLen--
				runeLen--
			}

			break
		}

		byteLen = byteLen + size
		runeLen = runeLen + 1
	}

	return s[:byteLen], runeLen
}

// byteIsSpace implements a restrictive interpretation of spaces that includes
// only what's valid inside interpolation sequences: spaces, tabs, carriage
// returns and newlines.
func byteIsSpace(b byte) bool {
	return b == ' ' || b == '\t' || b == '\r' || b == '\n'
}

// stringStartsWithIdentifier returns true if the given string begins with
// a character that is a legal start of an identifier: an underscore or
// any character that Unicode considers to be a letter.
func stringStartsWithIdentifier(s string) bool {
	if len(s) == 0 {
		return false
	}

	// Fast path: ASCII letters and underscore need no decoding.
	switch c := s[0]; {
	case c == '_':
		return true
	case c >= 'a' && c <= 'z':
		return true
	case c >= 'A' && c <= 'Z':
		return true
	}

	// A byte in the middle of a UTF-8 sequence can't begin a rune,
	// so it can't begin an identifier either.
	if !utf8.RuneStart(s[0]) {
		return false
	}

	// Decode the leading rune and ask Unicode whether it's a letter.
	r, _ := utf8.DecodeRuneInString(s)
	return unicode.IsLetter(r)
}