// Lexer contains information about the expression being tokenized.
type Lexer struct {
	expression string       // The expression provided by the user.
	currentPos int          // The current position in the string.
	lastWidth  int          // The width of the current rune; used by back() to step back one rune.
	buf        bytes.Buffer // Internal buffer used for building up values.
}
31 // SyntaxError is the main error used whenever a lexing or parsing error occurs.
32 type SyntaxError struct {
33 msg string // Error message displayed to user
34 Expression string // Expression that generated a SyntaxError
35 Offset int // The location in the string where the error occurred
38 func (e SyntaxError) Error() string {
39 // In the future, it would be good to underline the specific
40 // location where the error occurred.
41 return "SyntaxError: " + e.msg
44 // HighlightLocation will show where the syntax error occurred.
45 // It will place a "^" character on a line below the expression
46 // at the point where the syntax error occurred.
47 func (e SyntaxError) HighlightLocation() string {
48 return e.Expression + "\n" + strings.Repeat(" ", e.Offset) + "^"
// tokType identifies the kind of a lexed token.
type tokType int

// token is a single lexed token: its type, raw value, and the
// position/length of the value within the original expression.
type token struct {
	tokenType tokType
	value     string
	position  int
	length    int
}

// eof is the sentinel rune returned by next() at end of input.
const eof = -1

//go:generate stringer -type=tokType
const (
	tUnknown tokType = iota
	tStar
	tDot
	tFilter
	tFlatten
	tLparen
	tRparen
	tLbracket
	tRbracket
	tLbrace
	tRbrace
	tQuotedIdentifier
	tUnquotedIdentifier
	tComma
	tColon
	tLT
	tLTE
	tGT
	tGTE
	tEQ
	tNE
	tJSONLiteral
	tStringLiteral
	tCurrent
	tExpref
	tAnd
	tNot
	tOr
	tPipe
	tNumber
	tExpression
	tEOF
)
86 var basicTokens = map[rune]tokType{
93 ']': tRbracket, // tLbracket not included because it could be "[]"
// Bit mask for [a-zA-Z_] shifted down 64 bits to fit in a single uint64.
// When using this bitmask just be sure to shift the rune down 64 bits
// before checking against identifierStartBits.
const identifierStartBits uint64 = 576460745995190270
// Bit mask for [a-zA-Z0-9], 128 bits -> 2 uint64s.
var identifierTrailingBits = [2]uint64{287948901175001088, 576460745995190270}
// whiteSpace is the set of runes skipped between tokens.
var whiteSpace = map[rune]bool{
	' ': true, '\t': true, '\n': true, '\r': true,
}
111 func (t token) String() string {
112 return fmt.Sprintf("Token{%+v, %s, %d, %d}",
113 t.tokenType, t.value, t.position, t.length)
116 // NewLexer creates a new JMESPath lexer.
117 func NewLexer() *Lexer {
122 func (lexer *Lexer) next() rune {
123 if lexer.currentPos >= len(lexer.expression) {
127 r, w := utf8.DecodeRuneInString(lexer.expression[lexer.currentPos:])
129 lexer.currentPos += w
133 func (lexer *Lexer) back() {
134 lexer.currentPos -= lexer.lastWidth
137 func (lexer *Lexer) peek() rune {
143 // tokenize takes an expression and returns corresponding tokens.
144 func (lexer *Lexer) tokenize(expression string) ([]token, error) {
146 lexer.expression = expression
152 if identifierStartBits&(1<<(uint64(r)-64)) > 0 {
153 t := lexer.consumeUnquotedIdentifier()
154 tokens = append(tokens, t)
155 } else if val, ok := basicTokens[r]; ok {
156 // Basic single char token.
160 position: lexer.currentPos - lexer.lastWidth,
163 tokens = append(tokens, t)
164 } else if r == '-' || (r >= '0' && r <= '9') {
165 t := lexer.consumeNumber()
166 tokens = append(tokens, t)
168 t := lexer.consumeLBracket()
169 tokens = append(tokens, t)
171 t, err := lexer.consumeQuotedIdentifier()
175 tokens = append(tokens, t)
176 } else if r == '\'' {
177 t, err := lexer.consumeRawStringLiteral()
181 tokens = append(tokens, t)
183 t, err := lexer.consumeLiteral()
187 tokens = append(tokens, t)
189 t := lexer.matchOrElse(r, '|', tOr, tPipe)
190 tokens = append(tokens, t)
192 t := lexer.matchOrElse(r, '=', tLTE, tLT)
193 tokens = append(tokens, t)
195 t := lexer.matchOrElse(r, '=', tGTE, tGT)
196 tokens = append(tokens, t)
198 t := lexer.matchOrElse(r, '=', tNE, tNot)
199 tokens = append(tokens, t)
201 t := lexer.matchOrElse(r, '=', tEQ, tUnknown)
202 tokens = append(tokens, t)
204 t := lexer.matchOrElse(r, '&', tAnd, tExpref)
205 tokens = append(tokens, t)
208 } else if _, ok := whiteSpace[r]; ok {
211 return tokens, lexer.syntaxError(fmt.Sprintf("Unknown char: %s", strconv.QuoteRuneToASCII(r)))
214 tokens = append(tokens, token{tEOF, "", len(lexer.expression), 0})
218 // Consume characters until the ending rune "r" is reached.
219 // If the end of the expression is reached before seeing the
220 // terminating rune "r", then an error is returned.
221 // If no error occurs then the matching substring is returned.
222 // The returned string will not include the ending rune.
223 func (lexer *Lexer) consumeUntil(end rune) (string, error) {
224 start := lexer.currentPos
225 current := lexer.next()
226 for current != end && current != eof {
227 if current == '\\' && lexer.peek() != eof {
230 current = lexer.next()
232 if lexer.lastWidth == 0 {
233 // Then we hit an EOF so we never reached the closing
235 return "", SyntaxError{
236 msg: "Unclosed delimiter: " + string(end),
237 Expression: lexer.expression,
238 Offset: len(lexer.expression),
241 return lexer.expression[start : lexer.currentPos-lexer.lastWidth], nil
244 func (lexer *Lexer) consumeLiteral() (token, error) {
245 start := lexer.currentPos
246 value, err := lexer.consumeUntil('`')
250 value = strings.Replace(value, "\\`", "`", -1)
252 tokenType: tJSONLiteral,
259 func (lexer *Lexer) consumeRawStringLiteral() (token, error) {
260 start := lexer.currentPos
261 currentIndex := start
262 current := lexer.next()
263 for current != '\'' && lexer.peek() != eof {
264 if current == '\\' && lexer.peek() == '\'' {
265 chunk := lexer.expression[currentIndex : lexer.currentPos-1]
266 lexer.buf.WriteString(chunk)
267 lexer.buf.WriteString("'")
269 currentIndex = lexer.currentPos
271 current = lexer.next()
273 if lexer.lastWidth == 0 {
274 // Then we hit an EOF so we never reached the closing
276 return token{}, SyntaxError{
277 msg: "Unclosed delimiter: '",
278 Expression: lexer.expression,
279 Offset: len(lexer.expression),
282 if currentIndex < lexer.currentPos {
283 lexer.buf.WriteString(lexer.expression[currentIndex : lexer.currentPos-1])
285 value := lexer.buf.String()
286 // Reset the buffer so it can reused again.
289 tokenType: tStringLiteral,
296 func (lexer *Lexer) syntaxError(msg string) SyntaxError {
299 Expression: lexer.expression,
300 Offset: lexer.currentPos - 1,
304 // Checks for a two char token, otherwise matches a single character
305 // token. This is used whenever a two char token overlaps a single
306 // char token, e.g. "||" -> tPipe, "|" -> tOr.
307 func (lexer *Lexer) matchOrElse(first rune, second rune, matchedType tokType, singleCharType tokType) token {
308 start := lexer.currentPos - lexer.lastWidth
309 nextRune := lexer.next()
311 if nextRune == second {
313 tokenType: matchedType,
314 value: string(first) + string(second),
321 tokenType: singleCharType,
322 value: string(first),
330 func (lexer *Lexer) consumeLBracket() token {
331 // There's three options here:
332 // 1. A filter expression "[?"
333 // 2. A flatten operator "[]"
334 // 3. A bare rbracket "["
335 start := lexer.currentPos - lexer.lastWidth
336 nextRune := lexer.next()
345 } else if nextRune == ']' {
354 tokenType: tLbracket,
364 func (lexer *Lexer) consumeQuotedIdentifier() (token, error) {
365 start := lexer.currentPos
366 value, err := lexer.consumeUntil('"')
371 asJSON := []byte("\"" + value + "\"")
372 if err := json.Unmarshal([]byte(asJSON), &decoded); err != nil {
376 tokenType: tQuotedIdentifier,
379 length: len(decoded),
383 func (lexer *Lexer) consumeUnquotedIdentifier() token {
384 // Consume runes until we reach the end of an unquoted
386 start := lexer.currentPos - lexer.lastWidth
389 if r < 0 || r > 128 || identifierTrailingBits[uint64(r)/64]&(1<<(uint64(r)%64)) == 0 {
394 value := lexer.expression[start:lexer.currentPos]
396 tokenType: tUnquotedIdentifier,
399 length: lexer.currentPos - start,
403 func (lexer *Lexer) consumeNumber() token {
404 // Consume runes until we reach something that's not a number.
405 start := lexer.currentPos - lexer.lastWidth
408 if r < '0' || r > '9' {
413 value := lexer.expression[start:lexer.currentPos]
418 length: lexer.currentPos - start,