vendor/github.com/hashicorp/hcl2/hcl/json/scanner.go

   1 package json
   2
   3 import (
   4         "fmt"
   5
   6         "github.com/apparentlymart/go-textseg/textseg"
   7         "github.com/hashicorp/hcl2/hcl"
   8 )
   9
// tokenType identifies the kind of a token produced by scan.
//
// The values for single-character punctuation tokens are the punctuation
// characters themselves, which lets scan convert the byte it has just read
// directly into the corresponding tokenType.
//
//go:generate stringer -type tokenType scanner.go
type tokenType rune
  12
// Token types emitted by scan.
//
// The brace, bracket, comma, colon and equals types share their value with
// the byte that introduces them. Keyword, string and number tokens use an
// arbitrary mnemonic letter instead, because they span multiple bytes.
// tokenEOF marks the end of the buffer, and tokenInvalid (the zero value) is
// emitted for a byte that cannot begin any of the other token types.
const (
	tokenBraceO  tokenType = '{'
	tokenBraceC  tokenType = '}'
	tokenBrackO  tokenType = '['
	tokenBrackC  tokenType = ']'
	tokenComma   tokenType = ','
	tokenColon   tokenType = ':'
	tokenKeyword tokenType = 'K'
	tokenString  tokenType = 'S'
	tokenNumber  tokenType = 'N'
	tokenEOF     tokenType = '␄'
	tokenInvalid tokenType = 0
	tokenEquals  tokenType = '=' // used only for reminding the user of JSON syntax
)
  27
// token is a single lexical element found by scan.
type token struct {
	// Type is the kind of token.
	Type tokenType
	// Bytes is the sub-slice of the scanned buffer covered by the token; it
	// is not a copy. For string tokens it includes the opening quote. It is
	// nil for the EOF token.
	Bytes []byte
	// Range is the source location of the token.
	Range hcl.Range
}
  33
  34 // scan returns the primary tokens for the given JSON buffer in sequence.
  35 //
  36 // The responsibility of this pass is to just mark the slices of the buffer
  37 // as being of various types. It is lax in how it interprets the multi-byte
  38 // token types keyword, string and number, preferring to capture erroneous
  39 // extra bytes that we presume the user intended to be part of the token
  40 // so that we can generate more helpful diagnostics in the parser.
  41 func scan(buf []byte, start pos) []token {
  42         var tokens []token
  43         p := start
  44         for {
  45                 if len(buf) == 0 {
  46                         tokens = append(tokens, token{
  47                                 Type:  tokenEOF,
  48                                 Bytes: nil,
  49                                 Range: posRange(p, p),
  50                         })
  51                         return tokens
  52                 }
  53
  54                 buf, p = skipWhitespace(buf, p)
  55
  56                 if len(buf) == 0 {
  57                         tokens = append(tokens, token{
  58                                 Type:  tokenEOF,
  59                                 Bytes: nil,
  60                                 Range: posRange(p, p),
  61                         })
  62                         return tokens
  63                 }
  64
  65                 start = p
  66
  67                 first := buf[0]
  68                 switch {
  69                 case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=':
  70                         p.Pos.Column++
  71                         p.Pos.Byte++
  72                         tokens = append(tokens, token{
  73                                 Type:  tokenType(first),
  74                                 Bytes: buf[0:1],
  75                                 Range: posRange(start, p),
  76                         })
  77                         buf = buf[1:]
  78                 case first == '"':
  79                         var tokBuf []byte
  80                         tokBuf, buf, p = scanString(buf, p)
  81                         tokens = append(tokens, token{
  82                                 Type:  tokenString,
  83                                 Bytes: tokBuf,
  84                                 Range: posRange(start, p),
  85                         })
  86                 case byteCanStartNumber(first):
  87                         var tokBuf []byte
  88                         tokBuf, buf, p = scanNumber(buf, p)
  89                         tokens = append(tokens, token{
  90                                 Type:  tokenNumber,
  91                                 Bytes: tokBuf,
  92                                 Range: posRange(start, p),
  93                         })
  94                 case byteCanStartKeyword(first):
  95                         var tokBuf []byte
  96                         tokBuf, buf, p = scanKeyword(buf, p)
  97                         tokens = append(tokens, token{
  98                                 Type:  tokenKeyword,
  99                                 Bytes: tokBuf,
 100                                 Range: posRange(start, p),
 101                         })
 102                 default:
 103                         tokens = append(tokens, token{
 104                                 Type:  tokenInvalid,
 105                                 Bytes: buf[:1],
 106                                 Range: start.Range(1, 1),
 107                         })
 108                         // If we've encountered an invalid then we might as well stop
 109                         // scanning since the parser won't proceed beyond this point.
 110                         return tokens
 111                 }
 112         }
 113 }
 114
 115 func byteCanStartNumber(b byte) bool {
 116         switch b {
 117         // We are slightly more tolerant than JSON requires here since we
 118         // expect the parser will make a stricter interpretation of the
 119         // number bytes, but we specifically don't allow 'e' or 'E' here
 120         // since we want the scanner to treat that as the start of an
 121         // invalid keyword instead, to produce more intelligible error messages.
 122         case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 123                 return true
 124         default:
 125                 return false
 126         }
 127 }
 128
 129 func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
 130         // The scanner doesn't check that the sequence of digit-ish bytes is
 131         // in a valid order. The parser must do this when decoding a number
 132         // token.
 133         var i int
 134         p := start
 135 Byte:
 136         for i = 0; i < len(buf); i++ {
 137                 switch buf[i] {
 138                 case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 139                         p.Pos.Byte++
 140                         p.Pos.Column++
 141                 default:
 142                         break Byte
 143                 }
 144         }
 145         return buf[:i], buf[i:], p
 146 }
 147
 148 func byteCanStartKeyword(b byte) bool {
 149         switch {
 150         // We allow any sequence of alphabetical characters here, even though
 151         // JSON is more constrained, so that we can collect what we presume
 152         // the user intended to be a single keyword and then check its validity
 153         // in the parser, where we can generate better diagnostics.
 154         // So e.g. we want to be able to say:
 155         //   unrecognized keyword "True". Did you mean "true"?
 156         case isAlphabetical(b):
 157                 return true
 158         default:
 159                 return false
 160         }
 161 }
 162
 163 func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
 164         var i int
 165         p := start
 166 Byte:
 167         for i = 0; i < len(buf); i++ {
 168                 b := buf[i]
 169                 switch {
 170                 case isAlphabetical(b) || b == '_':
 171                         p.Pos.Byte++
 172                         p.Pos.Column++
 173                 default:
 174                         break Byte
 175                 }
 176         }
 177         return buf[:i], buf[i:], p
 178 }
 179
 180 func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
 181         // The scanner doesn't validate correct use of escapes, etc. It pays
 182         // attention to escapes only for the purpose of identifying the closing
 183         // quote character. It's the parser's responsibility to do proper
 184         // validation.
 185         //
 186         // The scanner also doesn't specifically detect unterminated string
 187         // literals, though they can be identified in the parser by checking if
 188         // the final byte in a string token is the double-quote character.
 189
 190         // Skip the opening quote symbol
 191         i := 1
 192         p := start
 193         p.Pos.Byte++
 194         p.Pos.Column++
 195         escaping := false
 196 Byte:
 197         for i < len(buf) {
 198                 b := buf[i]
 199
 200                 switch {
 201                 case b == '\\':
 202                         escaping = !escaping
 203                         p.Pos.Byte++
 204                         p.Pos.Column++
 205                         i++
 206                 case b == '"':
 207                         p.Pos.Byte++
 208                         p.Pos.Column++
 209                         i++
 210                         if !escaping {
 211                                 break Byte
 212                         }
 213                         escaping = false
 214                 case b < 32:
 215                         break Byte
 216                 default:
 217                         // Advance by one grapheme cluster, so that we consider each
 218                         // grapheme to be a "column".
 219                         // Ignoring error because this scanner cannot produce errors.
 220                         advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)
 221
 222                         p.Pos.Byte += advance
 223                         p.Pos.Column++
 224                         i += advance
 225
 226                         escaping = false
 227                 }
 228         }
 229         return buf[:i], buf[i:], p
 230 }
 231
 232 func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
 233         var i int
 234         p := start
 235 Byte:
 236         for i = 0; i < len(buf); i++ {
 237                 switch buf[i] {
 238                 case ' ':
 239                         p.Pos.Byte++
 240                         p.Pos.Column++
 241                 case '\n':
 242                         p.Pos.Byte++
 243                         p.Pos.Column = 1
 244                         p.Pos.Line++
 245                 case '\r':
 246                         // For the purpose of line/column counting we consider a
 247                         // carriage return to take up no space, assuming that it will
 248                         // be paired up with a newline (on Windows, for example) that
 249                         // will account for both of them.
 250                         p.Pos.Byte++
 251                 case '\t':
 252                         // We arbitrarily count a tab as if it were two spaces, because
 253                         // we need to choose _some_ number here. This means any system
 254                         // that renders code on-screen with markers must itself treat
 255                         // tabs as a pair of spaces for rendering purposes, or instead
 256                         // use the byte offset and back into its own column position.
 257                         p.Pos.Byte++
 258                         p.Pos.Column += 2
 259                 default:
 260                         break Byte
 261                 }
 262         }
 263         return buf[i:], p
 264 }
 265
// pos is a scanner position: an hcl.Pos together with the name of the file
// it refers to, which is what is needed to build hcl.Range values.
type pos struct {
	// Filename is copied into the Filename field of ranges built from this
	// position.
	Filename string
	// Pos holds the line, column and byte offset of the position.
	Pos hcl.Pos
}
 270
 271 func (p *pos) Range(byteLen, charLen int) hcl.Range {
 272         start := p.Pos
 273         end := p.Pos
 274         end.Byte += byteLen
 275         end.Column += charLen
 276         return hcl.Range{
 277                 Filename: p.Filename,
 278                 Start:    start,
 279                 End:      end,
 280         }
 281 }
 282
 283 func posRange(start, end pos) hcl.Range {
 284         return hcl.Range{
 285                 Filename: start.Filename,
 286                 Start:    start.Pos,
 287                 End:      end.Pos,
 288         }
 289 }
 290
 291 func (t token) GoString() string {
 292         return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
 293 }
 294
 295 func isAlphabetical(b byte) bool {
 296         return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
 297 }