]>
Commit | Line | Data |
---|---|---|
15c0b25d AP |
1 | package hcl |
2 | ||
3 | import ( | |
4 | "bufio" | |
5 | "bytes" | |
6 | ||
7 | "github.com/apparentlymart/go-textseg/textseg" | |
8 | ) | |
9 | ||
// RangeScanner is a helper that will scan over a buffer using a bufio.SplitFunc
// and visit a source range for each token matched.
//
// For example, this can be used with bufio.ScanLines to find the source range
// for each line in the file, skipping over the actual newline characters, which
// may be useful when printing source code snippets as part of diagnostic
// messages.
//
// The line and column information in the returned ranges is produced by
// counting newline characters and grapheme clusters respectively, which
// mimics the behavior we expect from a parser when producing ranges.
type RangeScanner struct {
	filename string          // filename recorded in every produced Range
	b        []byte          // the full buffer being scanned
	cb       bufio.SplitFunc // tokenizer callback, e.g. bufio.ScanLines

	pos Pos   // position of next byte to process in b
	cur Range // latest range
	tok []byte // slice of b that is covered by cur
	err error // error from last scan, if any
}
31 | ||
107c1cdb ND |
32 | // NewRangeScanner creates a new RangeScanner for the given buffer, producing |
33 | // ranges for the given filename. | |
15c0b25d AP |
34 | // |
35 | // Since ranges have grapheme-cluster granularity rather than byte granularity, | |
36 | // the scanner will produce incorrect results if the given SplitFunc creates | |
37 | // tokens between grapheme cluster boundaries. In particular, it is incorrect | |
38 | // to use RangeScanner with bufio.ScanRunes because it will produce tokens | |
39 | // around individual UTF-8 sequences, which will split any multi-sequence | |
40 | // grapheme clusters. | |
41 | func NewRangeScanner(b []byte, filename string, cb bufio.SplitFunc) *RangeScanner { | |
107c1cdb ND |
42 | return NewRangeScannerFragment(b, filename, InitialPos, cb) |
43 | } | |
44 | ||
45 | // NewRangeScannerFragment is like NewRangeScanner but the ranges it produces | |
46 | // will be offset by the given starting position, which is appropriate for | |
47 | // sub-slices of a file, whereas NewRangeScanner assumes it is scanning an | |
48 | // entire file. | |
49 | func NewRangeScannerFragment(b []byte, filename string, start Pos, cb bufio.SplitFunc) *RangeScanner { | |
15c0b25d AP |
50 | return &RangeScanner{ |
51 | filename: filename, | |
52 | b: b, | |
53 | cb: cb, | |
107c1cdb | 54 | pos: start, |
15c0b25d AP |
55 | } |
56 | } | |
57 | ||
58 | func (sc *RangeScanner) Scan() bool { | |
59 | if sc.pos.Byte >= len(sc.b) || sc.err != nil { | |
60 | // All done | |
61 | return false | |
62 | } | |
63 | ||
64 | // Since we're operating on an in-memory buffer, we always pass the whole | |
65 | // remainder of the buffer to our SplitFunc and set isEOF to let it know | |
66 | // that it has the whole thing. | |
67 | advance, token, err := sc.cb(sc.b[sc.pos.Byte:], true) | |
68 | ||
69 | // Since we are setting isEOF to true this should never happen, but | |
70 | // if it does we will just abort and assume the SplitFunc is misbehaving. | |
71 | if advance == 0 && token == nil && err == nil { | |
72 | return false | |
73 | } | |
74 | ||
75 | if err != nil { | |
76 | sc.err = err | |
77 | sc.cur = Range{ | |
78 | Filename: sc.filename, | |
79 | Start: sc.pos, | |
80 | End: sc.pos, | |
81 | } | |
82 | sc.tok = nil | |
83 | return false | |
84 | } | |
85 | ||
86 | sc.tok = token | |
87 | start := sc.pos | |
88 | end := sc.pos | |
89 | new := sc.pos | |
90 | ||
91 | // adv is similar to token but it also includes any subsequent characters | |
92 | // we're being asked to skip over by the SplitFunc. | |
93 | // adv is a slice covering any additional bytes we are skipping over, based | |
94 | // on what the SplitFunc told us to do with advance. | |
95 | adv := sc.b[sc.pos.Byte : sc.pos.Byte+advance] | |
96 | ||
97 | // We now need to scan over our token to count the grapheme clusters | |
98 | // so we can correctly advance Column, and count the newlines so we | |
99 | // can correctly advance Line. | |
100 | advR := bytes.NewReader(adv) | |
101 | gsc := bufio.NewScanner(advR) | |
102 | advanced := 0 | |
103 | gsc.Split(textseg.ScanGraphemeClusters) | |
104 | for gsc.Scan() { | |
105 | gr := gsc.Bytes() | |
106 | new.Byte += len(gr) | |
107 | new.Column++ | |
108 | ||
109 | // We rely here on the fact that \r\n is considered a grapheme cluster | |
110 | // and so we don't need to worry about miscounting additional lines | |
111 | // on files with Windows-style line endings. | |
112 | if len(gr) != 0 && (gr[0] == '\r' || gr[0] == '\n') { | |
113 | new.Column = 1 | |
114 | new.Line++ | |
115 | } | |
116 | ||
117 | if advanced < len(token) { | |
118 | // If we've not yet found the end of our token then we'll | |
119 | // also push our "end" marker along. | |
120 | // (if advance > len(token) then we'll stop moving "end" early | |
121 | // so that the caller only sees the range covered by token.) | |
122 | end = new | |
123 | } | |
124 | advanced += len(gr) | |
125 | } | |
126 | ||
127 | sc.cur = Range{ | |
128 | Filename: sc.filename, | |
129 | Start: start, | |
130 | End: end, | |
131 | } | |
132 | sc.pos = new | |
133 | return true | |
134 | } | |
135 | ||
136 | // Range returns a range that covers the latest token obtained after a call | |
137 | // to Scan returns true. | |
138 | func (sc *RangeScanner) Range() Range { | |
139 | return sc.cur | |
140 | } | |
141 | ||
142 | // Bytes returns the slice of the input buffer that is covered by the range | |
143 | // that would be returned by Range. | |
144 | func (sc *RangeScanner) Bytes() []byte { | |
145 | return sc.tok | |
146 | } | |
147 | ||
148 | // Err can be called after Scan returns false to determine if the latest read | |
149 | // resulted in an error, and obtain that error if so. | |
150 | func (sc *RangeScanner) Err() error { | |
151 | return sc.err | |
152 | } |