]>
Commit | Line | Data |
---|---|---|
bae9f6d2 JC |
1 | package scanner |
2 | ||
3 | import ( | |
4 | "bytes" | |
5 | "fmt" | |
6 | "os" | |
7 | "unicode" | |
8 | "unicode/utf8" | |
9 | ||
10 | "github.com/hashicorp/hcl/json/token" | |
11 | ) | |
12 | ||
// eof is the sentinel rune that next and peek hand back when the underlying
// buffer cannot supply another rune.
const eof rune = 0
15 | ||
16 | // Scanner defines a lexical scanner | |
17 | type Scanner struct { | |
18 | buf *bytes.Buffer // Source buffer for advancing and scanning | |
19 | src []byte // Source buffer for immutable access | |
20 | ||
21 | // Source Position | |
22 | srcPos token.Pos // current position | |
23 | prevPos token.Pos // previous position, used for peek() method | |
24 | ||
25 | lastCharLen int // length of last character in bytes | |
26 | lastLineLen int // length of last line in characters (for correct column reporting) | |
27 | ||
28 | tokStart int // token text start position | |
29 | tokEnd int // token text end position | |
30 | ||
31 | // Error is called for each error encountered. If no Error | |
32 | // function is set, the error is reported to os.Stderr. | |
33 | Error func(pos token.Pos, msg string) | |
34 | ||
35 | // ErrorCount is incremented by one for each error encountered. | |
36 | ErrorCount int | |
37 | ||
38 | // tokPos is the start position of most recently scanned token; set by | |
39 | // Scan. The Filename field is always left untouched by the Scanner. If | |
40 | // an error is reported (via Error) and Position is invalid, the scanner is | |
41 | // not inside a token. | |
42 | tokPos token.Pos | |
43 | } | |
44 | ||
45 | // New creates and initializes a new instance of Scanner using src as | |
46 | // its source content. | |
47 | func New(src []byte) *Scanner { | |
48 | // even though we accept a src, we read from a io.Reader compatible type | |
49 | // (*bytes.Buffer). So in the future we might easily change it to streaming | |
50 | // read. | |
51 | b := bytes.NewBuffer(src) | |
52 | s := &Scanner{ | |
53 | buf: b, | |
54 | src: src, | |
55 | } | |
56 | ||
57 | // srcPosition always starts with 1 | |
58 | s.srcPos.Line = 1 | |
59 | return s | |
60 | } | |
61 | ||
62 | // next reads the next rune from the bufferred reader. Returns the rune(0) if | |
63 | // an error occurs (or io.EOF is returned). | |
64 | func (s *Scanner) next() rune { | |
65 | ch, size, err := s.buf.ReadRune() | |
66 | if err != nil { | |
67 | // advance for error reporting | |
68 | s.srcPos.Column++ | |
69 | s.srcPos.Offset += size | |
70 | s.lastCharLen = size | |
71 | return eof | |
72 | } | |
73 | ||
74 | if ch == utf8.RuneError && size == 1 { | |
75 | s.srcPos.Column++ | |
76 | s.srcPos.Offset += size | |
77 | s.lastCharLen = size | |
78 | s.err("illegal UTF-8 encoding") | |
79 | return ch | |
80 | } | |
81 | ||
82 | // remember last position | |
83 | s.prevPos = s.srcPos | |
84 | ||
85 | s.srcPos.Column++ | |
86 | s.lastCharLen = size | |
87 | s.srcPos.Offset += size | |
88 | ||
89 | if ch == '\n' { | |
90 | s.srcPos.Line++ | |
91 | s.lastLineLen = s.srcPos.Column | |
92 | s.srcPos.Column = 0 | |
93 | } | |
94 | ||
95 | // debug | |
96 | // fmt.Printf("ch: %q, offset:column: %d:%d\n", ch, s.srcPos.Offset, s.srcPos.Column) | |
97 | return ch | |
98 | } | |
99 | ||
100 | // unread unreads the previous read Rune and updates the source position | |
101 | func (s *Scanner) unread() { | |
102 | if err := s.buf.UnreadRune(); err != nil { | |
103 | panic(err) // this is user fault, we should catch it | |
104 | } | |
105 | s.srcPos = s.prevPos // put back last position | |
106 | } | |
107 | ||
108 | // peek returns the next rune without advancing the reader. | |
109 | func (s *Scanner) peek() rune { | |
110 | peek, _, err := s.buf.ReadRune() | |
111 | if err != nil { | |
112 | return eof | |
113 | } | |
114 | ||
115 | s.buf.UnreadRune() | |
116 | return peek | |
117 | } | |
118 | ||
119 | // Scan scans the next token and returns the token. | |
120 | func (s *Scanner) Scan() token.Token { | |
121 | ch := s.next() | |
122 | ||
123 | // skip white space | |
124 | for isWhitespace(ch) { | |
125 | ch = s.next() | |
126 | } | |
127 | ||
128 | var tok token.Type | |
129 | ||
130 | // token text markings | |
131 | s.tokStart = s.srcPos.Offset - s.lastCharLen | |
132 | ||
133 | // token position, initial next() is moving the offset by one(size of rune | |
134 | // actually), though we are interested with the starting point | |
135 | s.tokPos.Offset = s.srcPos.Offset - s.lastCharLen | |
136 | if s.srcPos.Column > 0 { | |
137 | // common case: last character was not a '\n' | |
138 | s.tokPos.Line = s.srcPos.Line | |
139 | s.tokPos.Column = s.srcPos.Column | |
140 | } else { | |
141 | // last character was a '\n' | |
142 | // (we cannot be at the beginning of the source | |
143 | // since we have called next() at least once) | |
144 | s.tokPos.Line = s.srcPos.Line - 1 | |
145 | s.tokPos.Column = s.lastLineLen | |
146 | } | |
147 | ||
148 | switch { | |
149 | case isLetter(ch): | |
150 | lit := s.scanIdentifier() | |
151 | if lit == "true" || lit == "false" { | |
152 | tok = token.BOOL | |
153 | } else if lit == "null" { | |
154 | tok = token.NULL | |
155 | } else { | |
156 | s.err("illegal char") | |
157 | } | |
158 | case isDecimal(ch): | |
159 | tok = s.scanNumber(ch) | |
160 | default: | |
161 | switch ch { | |
162 | case eof: | |
163 | tok = token.EOF | |
164 | case '"': | |
165 | tok = token.STRING | |
166 | s.scanString() | |
167 | case '.': | |
168 | tok = token.PERIOD | |
169 | ch = s.peek() | |
170 | if isDecimal(ch) { | |
171 | tok = token.FLOAT | |
172 | ch = s.scanMantissa(ch) | |
173 | ch = s.scanExponent(ch) | |
174 | } | |
175 | case '[': | |
176 | tok = token.LBRACK | |
177 | case ']': | |
178 | tok = token.RBRACK | |
179 | case '{': | |
180 | tok = token.LBRACE | |
181 | case '}': | |
182 | tok = token.RBRACE | |
183 | case ',': | |
184 | tok = token.COMMA | |
185 | case ':': | |
186 | tok = token.COLON | |
187 | case '-': | |
188 | if isDecimal(s.peek()) { | |
189 | ch := s.next() | |
190 | tok = s.scanNumber(ch) | |
191 | } else { | |
192 | s.err("illegal char") | |
193 | } | |
194 | default: | |
195 | s.err("illegal char: " + string(ch)) | |
196 | } | |
197 | } | |
198 | ||
199 | // finish token ending | |
200 | s.tokEnd = s.srcPos.Offset | |
201 | ||
202 | // create token literal | |
203 | var tokenText string | |
204 | if s.tokStart >= 0 { | |
205 | tokenText = string(s.src[s.tokStart:s.tokEnd]) | |
206 | } | |
207 | s.tokStart = s.tokEnd // ensure idempotency of tokenText() call | |
208 | ||
209 | return token.Token{ | |
210 | Type: tok, | |
211 | Pos: s.tokPos, | |
212 | Text: tokenText, | |
213 | } | |
214 | } | |
215 | ||
216 | // scanNumber scans a HCL number definition starting with the given rune | |
217 | func (s *Scanner) scanNumber(ch rune) token.Type { | |
218 | zero := ch == '0' | |
219 | pos := s.srcPos | |
220 | ||
221 | s.scanMantissa(ch) | |
222 | ch = s.next() // seek forward | |
223 | if ch == 'e' || ch == 'E' { | |
224 | ch = s.scanExponent(ch) | |
225 | return token.FLOAT | |
226 | } | |
227 | ||
228 | if ch == '.' { | |
229 | ch = s.scanFraction(ch) | |
230 | if ch == 'e' || ch == 'E' { | |
231 | ch = s.next() | |
232 | ch = s.scanExponent(ch) | |
233 | } | |
234 | return token.FLOAT | |
235 | } | |
236 | ||
237 | if ch != eof { | |
238 | s.unread() | |
239 | } | |
240 | ||
241 | // If we have a larger number and this is zero, error | |
242 | if zero && pos != s.srcPos { | |
243 | s.err("numbers cannot start with 0") | |
244 | } | |
245 | ||
246 | return token.NUMBER | |
247 | } | |
248 | ||
249 | // scanMantissa scans the mantissa begining from the rune. It returns the next | |
250 | // non decimal rune. It's used to determine wheter it's a fraction or exponent. | |
251 | func (s *Scanner) scanMantissa(ch rune) rune { | |
252 | scanned := false | |
253 | for isDecimal(ch) { | |
254 | ch = s.next() | |
255 | scanned = true | |
256 | } | |
257 | ||
258 | if scanned && ch != eof { | |
259 | s.unread() | |
260 | } | |
261 | return ch | |
262 | } | |
263 | ||
264 | // scanFraction scans the fraction after the '.' rune | |
265 | func (s *Scanner) scanFraction(ch rune) rune { | |
266 | if ch == '.' { | |
267 | ch = s.peek() // we peek just to see if we can move forward | |
268 | ch = s.scanMantissa(ch) | |
269 | } | |
270 | return ch | |
271 | } | |
272 | ||
273 | // scanExponent scans the remaining parts of an exponent after the 'e' or 'E' | |
274 | // rune. | |
275 | func (s *Scanner) scanExponent(ch rune) rune { | |
276 | if ch == 'e' || ch == 'E' { | |
277 | ch = s.next() | |
278 | if ch == '-' || ch == '+' { | |
279 | ch = s.next() | |
280 | } | |
281 | ch = s.scanMantissa(ch) | |
282 | } | |
283 | return ch | |
284 | } | |
285 | ||
286 | // scanString scans a quoted string | |
287 | func (s *Scanner) scanString() { | |
288 | braces := 0 | |
289 | for { | |
290 | // '"' opening already consumed | |
291 | // read character after quote | |
292 | ch := s.next() | |
293 | ||
294 | if ch == '\n' || ch < 0 || ch == eof { | |
295 | s.err("literal not terminated") | |
296 | return | |
297 | } | |
298 | ||
299 | if ch == '"' { | |
300 | break | |
301 | } | |
302 | ||
303 | // If we're going into a ${} then we can ignore quotes for awhile | |
304 | if braces == 0 && ch == '$' && s.peek() == '{' { | |
305 | braces++ | |
306 | s.next() | |
307 | } else if braces > 0 && ch == '{' { | |
308 | braces++ | |
309 | } | |
310 | if braces > 0 && ch == '}' { | |
311 | braces-- | |
312 | } | |
313 | ||
314 | if ch == '\\' { | |
315 | s.scanEscape() | |
316 | } | |
317 | } | |
318 | ||
319 | return | |
320 | } | |
321 | ||
322 | // scanEscape scans an escape sequence | |
323 | func (s *Scanner) scanEscape() rune { | |
324 | // http://en.cppreference.com/w/cpp/language/escape | |
325 | ch := s.next() // read character after '/' | |
326 | switch ch { | |
327 | case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"': | |
328 | // nothing to do | |
329 | case '0', '1', '2', '3', '4', '5', '6', '7': | |
330 | // octal notation | |
331 | ch = s.scanDigits(ch, 8, 3) | |
332 | case 'x': | |
333 | // hexademical notation | |
334 | ch = s.scanDigits(s.next(), 16, 2) | |
335 | case 'u': | |
336 | // universal character name | |
337 | ch = s.scanDigits(s.next(), 16, 4) | |
338 | case 'U': | |
339 | // universal character name | |
340 | ch = s.scanDigits(s.next(), 16, 8) | |
341 | default: | |
342 | s.err("illegal char escape") | |
343 | } | |
344 | return ch | |
345 | } | |
346 | ||
347 | // scanDigits scans a rune with the given base for n times. For example an | |
348 | // octal notation \184 would yield in scanDigits(ch, 8, 3) | |
349 | func (s *Scanner) scanDigits(ch rune, base, n int) rune { | |
350 | for n > 0 && digitVal(ch) < base { | |
351 | ch = s.next() | |
352 | n-- | |
353 | } | |
354 | if n > 0 { | |
355 | s.err("illegal char escape") | |
356 | } | |
357 | ||
358 | // we scanned all digits, put the last non digit char back | |
359 | s.unread() | |
360 | return ch | |
361 | } | |
362 | ||
363 | // scanIdentifier scans an identifier and returns the literal string | |
364 | func (s *Scanner) scanIdentifier() string { | |
365 | offs := s.srcPos.Offset - s.lastCharLen | |
366 | ch := s.next() | |
367 | for isLetter(ch) || isDigit(ch) || ch == '-' { | |
368 | ch = s.next() | |
369 | } | |
370 | ||
371 | if ch != eof { | |
372 | s.unread() // we got identifier, put back latest char | |
373 | } | |
374 | ||
375 | return string(s.src[offs:s.srcPos.Offset]) | |
376 | } | |
377 | ||
378 | // recentPosition returns the position of the character immediately after the | |
379 | // character or token returned by the last call to Scan. | |
380 | func (s *Scanner) recentPosition() (pos token.Pos) { | |
381 | pos.Offset = s.srcPos.Offset - s.lastCharLen | |
382 | switch { | |
383 | case s.srcPos.Column > 0: | |
384 | // common case: last character was not a '\n' | |
385 | pos.Line = s.srcPos.Line | |
386 | pos.Column = s.srcPos.Column | |
387 | case s.lastLineLen > 0: | |
388 | // last character was a '\n' | |
389 | // (we cannot be at the beginning of the source | |
390 | // since we have called next() at least once) | |
391 | pos.Line = s.srcPos.Line - 1 | |
392 | pos.Column = s.lastLineLen | |
393 | default: | |
394 | // at the beginning of the source | |
395 | pos.Line = 1 | |
396 | pos.Column = 1 | |
397 | } | |
398 | return | |
399 | } | |
400 | ||
401 | // err prints the error of any scanning to s.Error function. If the function is | |
402 | // not defined, by default it prints them to os.Stderr | |
403 | func (s *Scanner) err(msg string) { | |
404 | s.ErrorCount++ | |
405 | pos := s.recentPosition() | |
406 | ||
407 | if s.Error != nil { | |
408 | s.Error(pos, msg) | |
409 | return | |
410 | } | |
411 | ||
412 | fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) | |
413 | } | |
414 | ||
// isLetter reports whether ch is an ASCII letter, an underscore, or a
// non-ASCII rune that unicode.IsLetter classifies as a letter.
func isLetter(ch rune) bool {
	switch {
	case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z', ch == '_':
		return true
	default:
		// Only consult the Unicode tables outside the ASCII range.
		return ch >= 0x80 && unicode.IsLetter(ch)
	}
}
419 | ||
// isDigit reports whether ch is an ASCII decimal digit or a non-ASCII rune
// that unicode.IsDigit classifies as a digit.
func isDigit(ch rune) bool {
	if '0' <= ch && ch <= '9' {
		return true
	}
	// Only consult the Unicode tables outside the ASCII range.
	return ch >= 0x80 && unicode.IsDigit(ch)
}
424 | ||
// isDecimal reports whether ch is one of the ASCII decimal digits '0'
// through '9'.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}
429 | ||
// isHexadecimal reports whether ch is an ASCII hexadecimal digit: '0'-'9',
// 'a'-'f' or 'A'-'F'.
func isHexadecimal(ch rune) bool {
	switch {
	case '0' <= ch && ch <= '9':
		return true
	case 'a' <= ch && ch <= 'f':
		return true
	case 'A' <= ch && ch <= 'F':
		return true
	}
	return false
}
434 | ||
// isWhitespace reports whether ch is a space, a tab, a newline or a carriage
// return.
func isWhitespace(ch rune) bool {
	switch ch {
	case ' ', '\t', '\n', '\r':
		return true
	}
	return false
}
439 | ||
// digitVal returns the numeric value of ch read as an octal, decimal or
// hexadecimal digit. Any rune that is not such a digit yields 16, which is
// larger than every legal digit value.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return 16
}