package json

import (
	"fmt"

	"github.com/apparentlymart/go-textseg/textseg"
	"github.com/hashicorp/hcl2/hcl"
)

//go:generate stringer -type tokenType scanner.go
type tokenType rune

const (
	tokenBraceO  tokenType = '{'
	tokenBraceC  tokenType = '}'
	tokenBrackO  tokenType = '['
	tokenBrackC  tokenType = ']'
	tokenComma   tokenType = ','
	tokenColon   tokenType = ':'
	tokenKeyword tokenType = 'K'
	tokenString  tokenType = 'S'
	tokenNumber  tokenType = 'N'
	tokenEOF     tokenType = '␄'
	tokenInvalid tokenType = 0
	tokenEquals  tokenType = '=' // used only for reminding the user of JSON syntax
)

type token struct {
	Type  tokenType
	Bytes []byte
	Range hcl.Range
}

// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is to just mark the slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
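//
// For example, scanning the buffer {"a":1} produces tokens of types
// tokenBraceO, tokenString, tokenColon, tokenNumber and tokenBraceC,
// followed by a zero-length tokenEOF.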
func scan(buf []byte, start pos) []token {
	var tokens []token
	p := start
	for {
		if len(buf) == 0 {
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}

		buf, p = skipWhitespace(buf, p)

		if len(buf) == 0 {
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}

		start = p

		first := buf[0]
		switch {
		case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=':
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenType(first),
				Bytes: buf[0:1],
				Range: posRange(start, p),
			})
			buf = buf[1:]
		case first == '"':
			var tokBuf []byte
			tokBuf, buf, p = scanString(buf, p)
			tokens = append(tokens, token{
				Type:  tokenString,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartNumber(first):
			var tokBuf []byte
			tokBuf, buf, p = scanNumber(buf, p)
			tokens = append(tokens, token{
				Type:  tokenNumber,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartKeyword(first):
			var tokBuf []byte
			tokBuf, buf, p = scanKeyword(buf, p)
			tokens = append(tokens, token{
				Type:  tokenKeyword,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		default:
			tokens = append(tokens, token{
				Type:  tokenInvalid,
				Bytes: buf[:1],
				Range: start.Range(1, 1),
			})
			// If we've encountered an invalid token then we might as well
			// stop scanning, since the parser won't proceed beyond this point.
			return tokens
		}
	}
}

func byteCanStartNumber(b byte) bool {
	switch b {
	// We are slightly more tolerant than JSON requires here since we
	// expect the parser will make a stricter interpretation of the
	// number bytes, but we specifically don't allow 'e' or 'E' here
	// since we want the scanner to treat that as the start of an
	// invalid keyword instead, to produce more intelligible error messages.
	case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return true
	default:
		return false
	}
}

func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't check that the sequence of digit-ish bytes is
	// in a valid order. The parser must do this when decoding a number
	// token.
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		switch buf[i] {
		case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}

func byteCanStartKeyword(b byte) bool {
	switch {
	// We allow any sequence of alphabetical characters here, even though
	// JSON is more constrained, so that we can collect what we presume
	// the user intended to be a single keyword and then check its validity
	// in the parser, where we can generate better diagnostics.
	// So e.g. we want to be able to say:
	//   unrecognized keyword "True". Did you mean "true"?
	case isAlphabetical(b):
		return true
	default:
		return false
	}
}

func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		b := buf[i]
		switch {
		case isAlphabetical(b) || b == '_':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}

func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't validate correct use of escapes, etc. It pays
	// attention to escapes only for the purpose of identifying the closing
	// quote character. It's the parser's responsibility to do proper
	// validation.
	//
	// The scanner also doesn't specifically detect unterminated string
	// literals, though they can be identified in the parser by checking if
	// the final byte in a string token is the double-quote character.

	// Skip the opening quote symbol
	i := 1
	p := start
	p.Pos.Byte++
	p.Pos.Column++
	escaping := false
Byte:
	for i < len(buf) {
		b := buf[i]

		switch {
		case b == '\\':
			escaping = !escaping
			p.Pos.Byte++
			p.Pos.Column++
			i++
		case b == '"':
			p.Pos.Byte++
			p.Pos.Column++
			i++
			if !escaping {
				break Byte
			}
			escaping = false
		case b < 32:
			break Byte
		default:
			// Advance by one grapheme cluster, so that we consider each
			// grapheme to be a "column".
			// Ignoring error because this scanner cannot produce errors.
			advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)

			p.Pos.Byte += advance
			p.Pos.Column++
			i += advance

			escaping = false
		}
	}
	return buf[:i], buf[i:], p
}

func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		switch buf[i] {
		case ' ':
			p.Pos.Byte++
			p.Pos.Column++
		case '\n':
			p.Pos.Byte++
			p.Pos.Column = 1
			p.Pos.Line++
		case '\r':
			// For the purpose of line/column counting we consider a
			// carriage return to take up no space, assuming that it will
			// be paired up with a newline (on Windows, for example) that
			// will account for both of them.
			p.Pos.Byte++
		case '\t':
			// We arbitrarily count a tab as if it were two spaces, because
			// we need to choose _some_ number here. This means any system
			// that renders code on-screen with markers must itself treat
			// tabs as a pair of spaces for rendering purposes, or instead
			// use the byte offset and back into its own column position.
			p.Pos.Byte++
			p.Pos.Column += 2
		default:
			break Byte
		}
	}
	return buf[i:], p
}

type pos struct {
	Filename string
	Pos      hcl.Pos
}

func (p *pos) Range(byteLen, charLen int) hcl.Range {
	start := p.Pos
	end := p.Pos
	end.Byte += byteLen
	end.Column += charLen
	return hcl.Range{
		Filename: p.Filename,
		Start:    start,
		End:      end,
	}
}

func posRange(start, end pos) hcl.Range {
	return hcl.Range{
		Filename: start.Filename,
		Start:    start.Pos,
		End:      end.Pos,
	}
}

func (t token) GoString() string {
	return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}

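// isAlphabetical returns true if the given byte is an ASCII letter, in
// either case.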
func isAlphabetical(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}