vendor/github.com/hashicorp/hcl/json/scanner/scanner.go

   1 package scanner
   2
   3 import (
   4         "bytes"
   5         "fmt"
   6         "os"
   7         "unicode"
   8         "unicode/utf8"
   9
  10         "github.com/hashicorp/hcl/json/token"
  11 )
  12
  13 // eof represents a marker rune for the end of the reader.
  14 const eof = rune(0)
  15
  16 // Scanner defines a lexical scanner
  17 type Scanner struct {
  18         buf *bytes.Buffer // Source buffer for advancing and scanning
  19         src []byte        // Source buffer for immutable access
  20
  21         // Source Position
  22         srcPos  token.Pos // current position
  23         prevPos token.Pos // previous position, used for peek() method
  24
  25         lastCharLen int // length of last character in bytes
  26         lastLineLen int // length of last line in characters (for correct column reporting)
  27
  28         tokStart int // token text start position
  29         tokEnd   int // token text end  position
  30
  31         // Error is called for each error encountered. If no Error
  32         // function is set, the error is reported to os.Stderr.
  33         Error func(pos token.Pos, msg string)
  34
  35         // ErrorCount is incremented by one for each error encountered.
  36         ErrorCount int
  37
  38         // tokPos is the start position of most recently scanned token; set by
  39         // Scan. The Filename field is always left untouched by the Scanner.  If
  40         // an error is reported (via Error) and Position is invalid, the scanner is
  41         // not inside a token.
  42         tokPos token.Pos
  43 }
  44
  45 // New creates and initializes a new instance of Scanner using src as
  46 // its source content.
  47 func New(src []byte) *Scanner {
  48         // even though we accept a src, we read from a io.Reader compatible type
  49         // (*bytes.Buffer). So in the future we might easily change it to streaming
  50         // read.
  51         b := bytes.NewBuffer(src)
  52         s := &Scanner{
  53                 buf: b,
  54                 src: src,
  55         }
  56
  57         // srcPosition always starts with 1
  58         s.srcPos.Line = 1
  59         return s
  60 }
  61
  62 // next reads the next rune from the bufferred reader. Returns the rune(0) if
  63 // an error occurs (or io.EOF is returned).
  64 func (s *Scanner) next() rune {
  65         ch, size, err := s.buf.ReadRune()
  66         if err != nil {
  67                 // advance for error reporting
  68                 s.srcPos.Column++
  69                 s.srcPos.Offset += size
  70                 s.lastCharLen = size
  71                 return eof
  72         }
  73
  74         if ch == utf8.RuneError && size == 1 {
  75                 s.srcPos.Column++
  76                 s.srcPos.Offset += size
  77                 s.lastCharLen = size
  78                 s.err("illegal UTF-8 encoding")
  79                 return ch
  80         }
  81
  82         // remember last position
  83         s.prevPos = s.srcPos
  84
  85         s.srcPos.Column++
  86         s.lastCharLen = size
  87         s.srcPos.Offset += size
  88
  89         if ch == '\n' {
  90                 s.srcPos.Line++
  91                 s.lastLineLen = s.srcPos.Column
  92                 s.srcPos.Column = 0
  93         }
  94
  95         // debug
  96         // fmt.Printf("ch: %q, offset:column: %d:%d\n", ch, s.srcPos.Offset, s.srcPos.Column)
  97         return ch
  98 }
  99
 100 // unread unreads the previous read Rune and updates the source position
 101 func (s *Scanner) unread() {
 102         if err := s.buf.UnreadRune(); err != nil {
 103                 panic(err) // this is user fault, we should catch it
 104         }
 105         s.srcPos = s.prevPos // put back last position
 106 }
 107
 108 // peek returns the next rune without advancing the reader.
 109 func (s *Scanner) peek() rune {
 110         peek, _, err := s.buf.ReadRune()
 111         if err != nil {
 112                 return eof
 113         }
 114
 115         s.buf.UnreadRune()
 116         return peek
 117 }
 118
 119 // Scan scans the next token and returns the token.
 120 func (s *Scanner) Scan() token.Token {
 121         ch := s.next()
 122
 123         // skip white space
 124         for isWhitespace(ch) {
 125                 ch = s.next()
 126         }
 127
 128         var tok token.Type
 129
 130         // token text markings
 131         s.tokStart = s.srcPos.Offset - s.lastCharLen
 132
 133         // token position, initial next() is moving the offset by one(size of rune
 134         // actually), though we are interested with the starting point
 135         s.tokPos.Offset = s.srcPos.Offset - s.lastCharLen
 136         if s.srcPos.Column > 0 {
 137                 // common case: last character was not a '\n'
 138                 s.tokPos.Line = s.srcPos.Line
 139                 s.tokPos.Column = s.srcPos.Column
 140         } else {
 141                 // last character was a '\n'
 142                 // (we cannot be at the beginning of the source
 143                 // since we have called next() at least once)
 144                 s.tokPos.Line = s.srcPos.Line - 1
 145                 s.tokPos.Column = s.lastLineLen
 146         }
 147
 148         switch {
 149         case isLetter(ch):
 150                 lit := s.scanIdentifier()
 151                 if lit == "true" || lit == "false" {
 152                         tok = token.BOOL
 153                 } else if lit == "null" {
 154                         tok = token.NULL
 155                 } else {
 156                         s.err("illegal char")
 157                 }
 158         case isDecimal(ch):
 159                 tok = s.scanNumber(ch)
 160         default:
 161                 switch ch {
 162                 case eof:
 163                         tok = token.EOF
 164                 case '"':
 165                         tok = token.STRING
 166                         s.scanString()
 167                 case '.':
 168                         tok = token.PERIOD
 169                         ch = s.peek()
 170                         if isDecimal(ch) {
 171                                 tok = token.FLOAT
 172                                 ch = s.scanMantissa(ch)
 173                                 ch = s.scanExponent(ch)
 174                         }
 175                 case '[':
 176                         tok = token.LBRACK
 177                 case ']':
 178                         tok = token.RBRACK
 179                 case '{':
 180                         tok = token.LBRACE
 181                 case '}':
 182                         tok = token.RBRACE
 183                 case ',':
 184                         tok = token.COMMA
 185                 case ':':
 186                         tok = token.COLON
 187                 case '-':
 188                         if isDecimal(s.peek()) {
 189                                 ch := s.next()
 190                                 tok = s.scanNumber(ch)
 191                         } else {
 192                                 s.err("illegal char")
 193                         }
 194                 default:
 195                         s.err("illegal char: " + string(ch))
 196                 }
 197         }
 198
 199         // finish token ending
 200         s.tokEnd = s.srcPos.Offset
 201
 202         // create token literal
 203         var tokenText string
 204         if s.tokStart >= 0 {
 205                 tokenText = string(s.src[s.tokStart:s.tokEnd])
 206         }
 207         s.tokStart = s.tokEnd // ensure idempotency of tokenText() call
 208
 209         return token.Token{
 210                 Type: tok,
 211                 Pos:  s.tokPos,
 212                 Text: tokenText,
 213         }
 214 }
 215
 216 // scanNumber scans a HCL number definition starting with the given rune
 217 func (s *Scanner) scanNumber(ch rune) token.Type {
 218         zero := ch == '0'
 219         pos := s.srcPos
 220
 221         s.scanMantissa(ch)
 222         ch = s.next() // seek forward
 223         if ch == 'e' || ch == 'E' {
 224                 ch = s.scanExponent(ch)
 225                 return token.FLOAT
 226         }
 227
 228         if ch == '.' {
 229                 ch = s.scanFraction(ch)
 230                 if ch == 'e' || ch == 'E' {
 231                         ch = s.next()
 232                         ch = s.scanExponent(ch)
 233                 }
 234                 return token.FLOAT
 235         }
 236
 237         if ch != eof {
 238                 s.unread()
 239         }
 240
 241         // If we have a larger number and this is zero, error
 242         if zero && pos != s.srcPos {
 243                 s.err("numbers cannot start with 0")
 244         }
 245
 246         return token.NUMBER
 247 }
 248
 249 // scanMantissa scans the mantissa begining from the rune. It returns the next
 250 // non decimal rune. It's used to determine wheter it's a fraction or exponent.
 251 func (s *Scanner) scanMantissa(ch rune) rune {
 252         scanned := false
 253         for isDecimal(ch) {
 254                 ch = s.next()
 255                 scanned = true
 256         }
 257
 258         if scanned && ch != eof {
 259                 s.unread()
 260         }
 261         return ch
 262 }
 263
 264 // scanFraction scans the fraction after the '.' rune
 265 func (s *Scanner) scanFraction(ch rune) rune {
 266         if ch == '.' {
 267                 ch = s.peek() // we peek just to see if we can move forward
 268                 ch = s.scanMantissa(ch)
 269         }
 270         return ch
 271 }
 272
 273 // scanExponent scans the remaining parts of an exponent after the 'e' or 'E'
 274 // rune.
 275 func (s *Scanner) scanExponent(ch rune) rune {
 276         if ch == 'e' || ch == 'E' {
 277                 ch = s.next()
 278                 if ch == '-' || ch == '+' {
 279                         ch = s.next()
 280                 }
 281                 ch = s.scanMantissa(ch)
 282         }
 283         return ch
 284 }
 285
 286 // scanString scans a quoted string
 287 func (s *Scanner) scanString() {
 288         braces := 0
 289         for {
 290                 // '"' opening already consumed
 291                 // read character after quote
 292                 ch := s.next()
 293
 294                 if ch == '\n' || ch < 0 || ch == eof {
 295                         s.err("literal not terminated")
 296                         return
 297                 }
 298
 299                 if ch == '"' {
 300                         break
 301                 }
 302
 303                 // If we're going into a ${} then we can ignore quotes for awhile
 304                 if braces == 0 && ch == '$' && s.peek() == '{' {
 305                         braces++
 306                         s.next()
 307                 } else if braces > 0 && ch == '{' {
 308                         braces++
 309                 }
 310                 if braces > 0 && ch == '}' {
 311                         braces--
 312                 }
 313
 314                 if ch == '\\' {
 315                         s.scanEscape()
 316                 }
 317         }
 318
 319         return
 320 }
 321
 322 // scanEscape scans an escape sequence
 323 func (s *Scanner) scanEscape() rune {
 324         // http://en.cppreference.com/w/cpp/language/escape
 325         ch := s.next() // read character after '/'
 326         switch ch {
 327         case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
 328                 // nothing to do
 329         case '0', '1', '2', '3', '4', '5', '6', '7':
 330                 // octal notation
 331                 ch = s.scanDigits(ch, 8, 3)
 332         case 'x':
 333                 // hexademical notation
 334                 ch = s.scanDigits(s.next(), 16, 2)
 335         case 'u':
 336                 // universal character name
 337                 ch = s.scanDigits(s.next(), 16, 4)
 338         case 'U':
 339                 // universal character name
 340                 ch = s.scanDigits(s.next(), 16, 8)
 341         default:
 342                 s.err("illegal char escape")
 343         }
 344         return ch
 345 }
 346
 347 // scanDigits scans a rune with the given base for n times. For example an
 348 // octal notation \184 would yield in scanDigits(ch, 8, 3)
 349 func (s *Scanner) scanDigits(ch rune, base, n int) rune {
 350         for n > 0 && digitVal(ch) < base {
 351                 ch = s.next()
 352                 n--
 353         }
 354         if n > 0 {
 355                 s.err("illegal char escape")
 356         }
 357
 358         // we scanned all digits, put the last non digit char back
 359         s.unread()
 360         return ch
 361 }
 362
 363 // scanIdentifier scans an identifier and returns the literal string
 364 func (s *Scanner) scanIdentifier() string {
 365         offs := s.srcPos.Offset - s.lastCharLen
 366         ch := s.next()
 367         for isLetter(ch) || isDigit(ch) || ch == '-' {
 368                 ch = s.next()
 369         }
 370
 371         if ch != eof {
 372                 s.unread() // we got identifier, put back latest char
 373         }
 374
 375         return string(s.src[offs:s.srcPos.Offset])
 376 }
 377
 378 // recentPosition returns the position of the character immediately after the
 379 // character or token returned by the last call to Scan.
 380 func (s *Scanner) recentPosition() (pos token.Pos) {
 381         pos.Offset = s.srcPos.Offset - s.lastCharLen
 382         switch {
 383         case s.srcPos.Column > 0:
 384                 // common case: last character was not a '\n'
 385                 pos.Line = s.srcPos.Line
 386                 pos.Column = s.srcPos.Column
 387         case s.lastLineLen > 0:
 388                 // last character was a '\n'
 389                 // (we cannot be at the beginning of the source
 390                 // since we have called next() at least once)
 391                 pos.Line = s.srcPos.Line - 1
 392                 pos.Column = s.lastLineLen
 393         default:
 394                 // at the beginning of the source
 395                 pos.Line = 1
 396                 pos.Column = 1
 397         }
 398         return
 399 }
 400
 401 // err prints the error of any scanning to s.Error function. If the function is
 402 // not defined, by default it prints them to os.Stderr
 403 func (s *Scanner) err(msg string) {
 404         s.ErrorCount++
 405         pos := s.recentPosition()
 406
 407         if s.Error != nil {
 408                 s.Error(pos, msg)
 409                 return
 410         }
 411
 412         fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
 413 }
 414
 415 // isHexadecimal returns true if the given rune is a letter
 416 func isLetter(ch rune) bool {
 417         return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
 418 }
 419
 420 // isHexadecimal returns true if the given rune is a decimal digit
 421 func isDigit(ch rune) bool {
 422         return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
 423 }
 424
 425 // isHexadecimal returns true if the given rune is a decimal number
 426 func isDecimal(ch rune) bool {
 427         return '0' <= ch && ch <= '9'
 428 }
 429
 430 // isHexadecimal returns true if the given rune is an hexadecimal number
 431 func isHexadecimal(ch rune) bool {
 432         return '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F'
 433 }
 434
 435 // isWhitespace returns true if the rune is a space, tab, newline or carriage return
 436 func isWhitespace(ch rune) bool {
 437         return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
 438 }
 439
 440 // digitVal returns the integer value of a given octal,decimal or hexadecimal rune
 441 func digitVal(ch rune) int {
 442         switch {
 443         case '0' <= ch && ch <= '9':
 444                 return int(ch - '0')
 445         case 'a' <= ch && ch <= 'f':
 446                 return int(ch - 'a' + 10)
 447         case 'A' <= ch && ch <= 'F':
 448                 return int(ch - 'A' + 10)
 449         }
 450         return 16 // larger than any legal digit val
 451 }