vendor/github.com/hashicorp/hil/scanner/scanner.go

   1 package scanner
   2
   3 import (
   4         "unicode"
   5         "unicode/utf8"
   6
   7         "github.com/hashicorp/hil/ast"
   8 )
   9
  10 // Scan returns a channel that recieves Tokens from the given input string.
  11 //
  12 // The scanner's job is just to partition the string into meaningful parts.
  13 // It doesn't do any transformation of the raw input string, so the caller
  14 // must deal with any further interpretation required, such as parsing INTEGER
  15 // tokens into real ints, or dealing with escape sequences in LITERAL or
  16 // STRING tokens.
  17 //
  18 // Strings in the returned tokens are slices from the original string.
  19 //
  20 // startPos should be set to ast.InitPos unless the caller knows that
  21 // this interpolation string is part of a larger file and knows the position
  22 // of the first character in that larger file.
  23 func Scan(s string, startPos ast.Pos) <-chan *Token {
  24         ch := make(chan *Token)
  25         go scan(s, ch, startPos)
  26         return ch
  27 }
  28
  29 func scan(s string, ch chan<- *Token, pos ast.Pos) {
  30         // 'remain' starts off as the whole string but we gradually
  31         // slice of the front of it as we work our way through.
  32         remain := s
  33
  34         // nesting keeps track of how many ${ .. } sequences we are
  35         // inside, so we can recognize the minor differences in syntax
  36         // between outer string literals (LITERAL tokens) and quoted
  37         // string literals (STRING tokens).
  38         nesting := 0
  39
  40         // We're going to flip back and forth between parsing literals/strings
  41         // and parsing interpolation sequences ${ .. } until we reach EOF or
  42         // some INVALID token.
  43 All:
  44         for {
  45                 startPos := pos
  46                 // Literal string processing first, since the beginning of
  47                 // a string is always outside of an interpolation sequence.
  48                 literalVal, terminator := scanLiteral(remain, pos, nesting > 0)
  49
  50                 if len(literalVal) > 0 {
  51                         litType := LITERAL
  52                         if nesting > 0 {
  53                                 litType = STRING
  54                         }
  55                         ch <- &Token{
  56                                 Type:    litType,
  57                                 Content: literalVal,
  58                                 Pos:     startPos,
  59                         }
  60                         remain = remain[len(literalVal):]
  61                 }
  62
  63                 ch <- terminator
  64                 remain = remain[len(terminator.Content):]
  65                 pos = terminator.Pos
  66                 // Safe to use len() here because none of the terminator tokens
  67                 // can contain UTF-8 sequences.
  68                 pos.Column = pos.Column + len(terminator.Content)
  69
  70                 switch terminator.Type {
  71                 case INVALID:
  72                         // Synthetic EOF after invalid token, since further scanning
  73                         // is likely to just produce more garbage.
  74                         ch <- &Token{
  75                                 Type:    EOF,
  76                                 Content: "",
  77                                 Pos:     pos,
  78                         }
  79                         break All
  80                 case EOF:
  81                         // All done!
  82                         break All
  83                 case BEGIN:
  84                         nesting++
  85                 case CQUOTE:
  86                         // nothing special to do
  87                 default:
  88                         // Should never happen
  89                         panic("invalid string/literal terminator")
  90                 }
  91
  92                 // Now we do the processing of the insides of ${ .. } sequences.
  93                 // This loop terminates when we encounter either a closing } or
  94                 // an opening ", which will cause us to return to literal processing.
  95         Interpolation:
  96                 for {
  97
  98                         token, size, newPos := scanInterpolationToken(remain, pos)
  99                         ch <- token
 100                         remain = remain[size:]
 101                         pos = newPos
 102
 103                         switch token.Type {
 104                         case INVALID:
 105                                 // Synthetic EOF after invalid token, since further scanning
 106                                 // is likely to just produce more garbage.
 107                                 ch <- &Token{
 108                                         Type:    EOF,
 109                                         Content: "",
 110                                         Pos:     pos,
 111                                 }
 112                                 break All
 113                         case EOF:
 114                                 // All done
 115                                 // (though a syntax error that we'll catch in the parser)
 116                                 break All
 117                         case END:
 118                                 nesting--
 119                                 if nesting < 0 {
 120                                         // Can happen if there are unbalanced ${ and } sequences
 121                                         // in the input, which we'll catch in the parser.
 122                                         nesting = 0
 123                                 }
 124                                 break Interpolation
 125                         case OQUOTE:
 126                                 // Beginning of nested quoted string
 127                                 break Interpolation
 128                         }
 129                 }
 130         }
 131
 132         close(ch)
 133 }
 134
 135 // Returns the token found at the start of the given string, followed by
 136 // the number of bytes that were consumed from the string and the adjusted
 137 // source position.
 138 //
 139 // Note that the number of bytes consumed can be more than the length of
 140 // the returned token contents if the string begins with whitespace, since
 141 // it will be silently consumed before reading the token.
 142 func scanInterpolationToken(s string, startPos ast.Pos) (*Token, int, ast.Pos) {
 143         pos := startPos
 144         size := 0
 145
 146         // Consume whitespace, if any
 147         for len(s) > 0 && byteIsSpace(s[0]) {
 148                 if s[0] == '\n' {
 149                         pos.Column = 1
 150                         pos.Line++
 151                 } else {
 152                         pos.Column++
 153                 }
 154                 size++
 155                 s = s[1:]
 156         }
 157
 158         // Unexpected EOF during sequence
 159         if len(s) == 0 {
 160                 return &Token{
 161                         Type:    EOF,
 162                         Content: "",
 163                         Pos:     pos,
 164                 }, size, pos
 165         }
 166
 167         next := s[0]
 168         var token *Token
 169
 170         switch next {
 171         case '(', ')', '[', ']', ',', '.', '+', '-', '*', '/', '%', '?', ':':
 172                 // Easy punctuation symbols that don't have any special meaning
 173                 // during scanning, and that stand for themselves in the
 174                 // TokenType enumeration.
 175                 token = &Token{
 176                         Type:    TokenType(next),
 177                         Content: s[:1],
 178                         Pos:     pos,
 179                 }
 180         case '}':
 181                 token = &Token{
 182                         Type:    END,
 183                         Content: s[:1],
 184                         Pos:     pos,
 185                 }
 186         case '"':
 187                 token = &Token{
 188                         Type:    OQUOTE,
 189                         Content: s[:1],
 190                         Pos:     pos,
 191                 }
 192         case '!':
 193                 if len(s) >= 2 && s[:2] == "!=" {
 194                         token = &Token{
 195                                 Type:    NOTEQUAL,
 196                                 Content: s[:2],
 197                                 Pos:     pos,
 198                         }
 199                 } else {
 200                         token = &Token{
 201                                 Type:    BANG,
 202                                 Content: s[:1],
 203                                 Pos:     pos,
 204                         }
 205                 }
 206         case '<':
 207                 if len(s) >= 2 && s[:2] == "<=" {
 208                         token = &Token{
 209                                 Type:    LTE,
 210                                 Content: s[:2],
 211                                 Pos:     pos,
 212                         }
 213                 } else {
 214                         token = &Token{
 215                                 Type:    LT,
 216                                 Content: s[:1],
 217                                 Pos:     pos,
 218                         }
 219                 }
 220         case '>':
 221                 if len(s) >= 2 && s[:2] == ">=" {
 222                         token = &Token{
 223                                 Type:    GTE,
 224                                 Content: s[:2],
 225                                 Pos:     pos,
 226                         }
 227                 } else {
 228                         token = &Token{
 229                                 Type:    GT,
 230                                 Content: s[:1],
 231                                 Pos:     pos,
 232                         }
 233                 }
 234         case '=':
 235                 if len(s) >= 2 && s[:2] == "==" {
 236                         token = &Token{
 237                                 Type:    EQUAL,
 238                                 Content: s[:2],
 239                                 Pos:     pos,
 240                         }
 241                 } else {
 242                         // A single equals is not a valid operator
 243                         token = &Token{
 244                                 Type:    INVALID,
 245                                 Content: s[:1],
 246                                 Pos:     pos,
 247                         }
 248                 }
 249         case '&':
 250                 if len(s) >= 2 && s[:2] == "&&" {
 251                         token = &Token{
 252                                 Type:    AND,
 253                                 Content: s[:2],
 254                                 Pos:     pos,
 255                         }
 256                 } else {
 257                         token = &Token{
 258                                 Type:    INVALID,
 259                                 Content: s[:1],
 260                                 Pos:     pos,
 261                         }
 262                 }
 263         case '|':
 264                 if len(s) >= 2 && s[:2] == "||" {
 265                         token = &Token{
 266                                 Type:    OR,
 267                                 Content: s[:2],
 268                                 Pos:     pos,
 269                         }
 270                 } else {
 271                         token = &Token{
 272                                 Type:    INVALID,
 273                                 Content: s[:1],
 274                                 Pos:     pos,
 275                         }
 276                 }
 277         default:
 278                 if next >= '0' && next <= '9' {
 279                         num, numType := scanNumber(s)
 280                         token = &Token{
 281                                 Type:    numType,
 282                                 Content: num,
 283                                 Pos:     pos,
 284                         }
 285                 } else if stringStartsWithIdentifier(s) {
 286                         ident, runeLen := scanIdentifier(s)
 287                         tokenType := IDENTIFIER
 288                         if ident == "true" || ident == "false" {
 289                                 tokenType = BOOL
 290                         }
 291                         token = &Token{
 292                                 Type:    tokenType,
 293                                 Content: ident,
 294                                 Pos:     pos,
 295                         }
 296                         // Skip usual token handling because it doesn't
 297                         // know how to deal with UTF-8 sequences.
 298                         pos.Column = pos.Column + runeLen
 299                         return token, size + len(ident), pos
 300                 } else {
 301                         _, byteLen := utf8.DecodeRuneInString(s)
 302                         token = &Token{
 303                                 Type:    INVALID,
 304                                 Content: s[:byteLen],
 305                                 Pos:     pos,
 306                         }
 307                         // Skip usual token handling because it doesn't
 308                         // know how to deal with UTF-8 sequences.
 309                         pos.Column = pos.Column + 1
 310                         return token, size + byteLen, pos
 311                 }
 312         }
 313
 314         // Here we assume that the token content contains no UTF-8 sequences,
 315         // because we dealt with UTF-8 characters as a special case where
 316         // necessary above.
 317         size = size + len(token.Content)
 318         pos.Column = pos.Column + len(token.Content)
 319
 320         return token, size, pos
 321 }
 322
 323 // Returns the (possibly-empty) prefix of the given string that represents
 324 // a literal, followed by the token that marks the end of the literal.
 325 func scanLiteral(s string, startPos ast.Pos, nested bool) (string, *Token) {
 326         litLen := 0
 327         pos := startPos
 328         var terminator *Token
 329         for {
 330
 331                 if litLen >= len(s) {
 332                         if nested {
 333                                 // We've ended in the middle of a quoted string,
 334                                 // which means this token is actually invalid.
 335                                 return "", &Token{
 336                                         Type:    INVALID,
 337                                         Content: s,
 338                                         Pos:     startPos,
 339                                 }
 340                         }
 341                         terminator = &Token{
 342                                 Type:    EOF,
 343                                 Content: "",
 344                                 Pos:     pos,
 345                         }
 346                         break
 347                 }
 348
 349                 next := s[litLen]
 350
 351                 if next == '$' && len(s) > litLen+1 {
 352                         follow := s[litLen+1]
 353
 354                         if follow == '{' {
 355                                 terminator = &Token{
 356                                         Type:    BEGIN,
 357                                         Content: s[litLen : litLen+2],
 358                                         Pos:     pos,
 359                                 }
 360                                 pos.Column = pos.Column + 2
 361                                 break
 362                         } else if follow == '$' {
 363                                 // Double-$ escapes the special processing of $,
 364                                 // so we will consume both characters here.
 365                                 pos.Column = pos.Column + 2
 366                                 litLen = litLen + 2
 367                                 continue
 368                         }
 369                 }
 370
 371                 // special handling that applies only to quoted strings
 372                 if nested {
 373                         if next == '"' {
 374                                 terminator = &Token{
 375                                         Type:    CQUOTE,
 376                                         Content: s[litLen : litLen+1],
 377                                         Pos:     pos,
 378                                 }
 379                                 pos.Column = pos.Column + 1
 380                                 break
 381                         }
 382
 383                         // Escaped quote marks do not terminate the string.
 384                         //
 385                         // All we do here in the scanner is avoid terminating a string
 386                         // due to an escaped quote. The parser is responsible for the
 387                         // full handling of escape sequences, since it's able to produce
 388                         // better error messages than we can produce in here.
 389                         if next == '\\' && len(s) > litLen+1 {
 390                                 follow := s[litLen+1]
 391
 392                                 if follow == '"' {
 393                                         // \" escapes the special processing of ",
 394                                         // so we will consume both characters here.
 395                                         pos.Column = pos.Column + 2
 396                                         litLen = litLen + 2
 397                                         continue
 398                                 }
 399                         }
 400                 }
 401
 402                 if next == '\n' {
 403                         pos.Column = 1
 404                         pos.Line++
 405                         litLen++
 406                 } else {
 407                         pos.Column++
 408
 409                         // "Column" measures runes, so we need to actually consume
 410                         // a valid UTF-8 character here.
 411                         _, size := utf8.DecodeRuneInString(s[litLen:])
 412                         litLen = litLen + size
 413                 }
 414
 415         }
 416
 417         return s[:litLen], terminator
 418 }
 419
 420 // scanNumber returns the extent of the prefix of the string that represents
 421 // a valid number, along with what type of number it represents: INT or FLOAT.
 422 //
 423 // scanNumber does only basic character analysis: numbers consist of digits
 424 // and periods, with at least one period signalling a FLOAT. It's the parser's
 425 // responsibility to validate the form and range of the number, such as ensuring
 426 // that a FLOAT actually contains only one period, etc.
 427 func scanNumber(s string) (string, TokenType) {
 428         period := -1
 429         byteLen := 0
 430         numType := INTEGER
 431         for {
 432                 if byteLen >= len(s) {
 433                         break
 434                 }
 435
 436                 next := s[byteLen]
 437                 if next != '.' && (next < '0' || next > '9') {
 438                         // If our last value was a period, then we're not a float,
 439                         // we're just an integer that ends in a period.
 440                         if period == byteLen-1 {
 441                                 byteLen--
 442                                 numType = INTEGER
 443                         }
 444
 445                         break
 446                 }
 447
 448                 if next == '.' {
 449                         // If we've already seen a period, break out
 450                         if period >= 0 {
 451                                 break
 452                         }
 453
 454                         period = byteLen
 455                         numType = FLOAT
 456                 }
 457
 458                 byteLen++
 459         }
 460
 461         return s[:byteLen], numType
 462 }
 463
 464 // scanIdentifier returns the extent of the prefix of the string that
 465 // represents a valid identifier, along with the length of that prefix
 466 // in runes.
 467 //
 468 // Identifiers may contain utf8-encoded non-Latin letters, which will
 469 // cause the returned "rune length" to be shorter than the byte length
 470 // of the returned string.
 471 func scanIdentifier(s string) (string, int) {
 472         byteLen := 0
 473         runeLen := 0
 474         for {
 475                 if byteLen >= len(s) {
 476                         break
 477                 }
 478
 479                 nextRune, size := utf8.DecodeRuneInString(s[byteLen:])
 480                 if !(nextRune == '_' ||
 481                         nextRune == '-' ||
 482                         nextRune == '.' ||
 483                         nextRune == '*' ||
 484                         unicode.IsNumber(nextRune) ||
 485                         unicode.IsLetter(nextRune) ||
 486                         unicode.IsMark(nextRune)) {
 487                         break
 488                 }
 489
 490                 // If we reach a star, it must be between periods to be part
 491                 // of the same identifier.
 492                 if nextRune == '*' && s[byteLen-1] != '.' {
 493                         break
 494                 }
 495
 496                 // If our previous character was a star, then the current must
 497                 // be period. Otherwise, undo that and exit.
 498                 if byteLen > 0 && s[byteLen-1] == '*' && nextRune != '.' {
 499                         byteLen--
 500                         if s[byteLen-1] == '.' {
 501                                 byteLen--
 502                         }
 503
 504                         break
 505                 }
 506
 507                 byteLen = byteLen + size
 508                 runeLen = runeLen + 1
 509         }
 510
 511         return s[:byteLen], runeLen
 512 }
 513
 514 // byteIsSpace implements a restrictive interpretation of spaces that includes
 515 // only what's valid inside interpolation sequences: spaces, tabs, newlines.
 516 func byteIsSpace(b byte) bool {
 517         switch b {
 518         case ' ', '\t', '\r', '\n':
 519                 return true
 520         default:
 521                 return false
 522         }
 523 }
 524
 525 // stringStartsWithIdentifier returns true if the given string begins with
 526 // a character that is a legal start of an identifier: an underscore or
 527 // any character that Unicode considers to be a letter.
 528 func stringStartsWithIdentifier(s string) bool {
 529         if len(s) == 0 {
 530                 return false
 531         }
 532
 533         first := s[0]
 534
 535         // Easy ASCII cases first
 536         if (first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z') || first == '_' {
 537                 return true
 538         }
 539
 540         // If our first byte begins a UTF-8 sequence then the sequence might
 541         // be a unicode letter.
 542         if utf8.RuneStart(first) {
 543                 firstRune, _ := utf8.DecodeRuneInString(s)
 544                 if unicode.IsLetter(firstRune) {
 545                         return true
 546                 }
 547         }
 548
 549         return false
 550 }