diff options
Diffstat (limited to 'vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb')
-rw-r--r-- | vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb | 335 |
1 files changed, 335 insertions, 0 deletions
diff --git a/vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb b/vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb new file mode 100644 index 0000000..422e4e5 --- /dev/null +++ b/vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb | |||
@@ -0,0 +1,335 @@ | |||
1 | #!/usr/bin/env ruby | ||
2 | # | ||
3 | # This script has been updated to accept more command-line arguments: | |
4 | # | ||
5 | # -u, --url URL to process | ||
6 | # -m, --machine Machine name | ||
7 | # -p, --properties Properties to add to the machine | ||
8 | # -o, --output Write output to file | ||
9 | # | ||
10 | # Updated by: Marty Schoch <marty.schoch@gmail.com> | ||
11 | # | ||
12 | # This script uses the unicode spec to generate a Ragel state machine | ||
13 | # that recognizes unicode alphanumeric characters. It generates 5 | ||
14 | # character classes: uupper, ulower, ualpha, udigit, and ualnum. | ||
15 | # Currently supported encodings are UTF-8 [default] and UCS-4. | ||
16 | # | ||
17 | # Usage: unicode2ragel.rb [options] | ||
18 | # -e, --encoding [ucs4 | utf8] Data encoding | ||
19 | # -h, --help Show this message | ||
20 | # | ||
21 | # This script was originally written as part of the Ferret search | ||
22 | # engine library. | ||
23 | # | ||
24 | # Author: Rakan El-Khalil <rakan@well.com> | ||
25 | |||
26 | require 'optparse' | ||
27 | require 'open-uri' | ||
28 | |||
# Supported output encodings, and the Ragel alphtype that the generated
# file header tells the user to select for each of them.  The constants
# are frozen so later code cannot mutate these shared defaults by accident.
ENCODINGS = [ :utf8, :ucs4 ].freeze
ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }.freeze
DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt".freeze
DEFAULT_MACHINE_NAME = "WChar".freeze

###
# Display vars & default option

# Layout budget used by generate_machine: overall line width and the
# minimum width of the column that holds the encoded range.
TOTAL_WIDTH = 80
RANGE_WIDTH = 23

# Defaults; each one can be overridden from the command line.
@encoding = :utf8
@chart_url = DEFAULT_CHART_URL
machine_name = DEFAULT_MACHINE_NAME
properties = []
@output = $stdout
44 | |||
###
# Option parsing
#
# Each handler overwrites one of the defaults set up earlier in the
# script (@encoding, @chart_url, machine_name, properties, @output).

cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The argument is declared optional, so a bare -e yields nil.  to_s
    # turns that into an empty name, which is rejected by the encoding
    # check below instead of raising NoMethodError here.
    @encoding = o.to_s.downcase.to_sym
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
  opts.on("-u", "--url URL", "URL to process") do |o|
    @chart_url = o
  end
  opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
    machine_name = o
  end
  opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
    properties = o
  end
  opts.on("-o", "--output FILE", "output file") do |o|
    @output = File.new(o, "w+")
  end
end

cli_opts.parse(ARGV)

# Reject unknown encodings.  The message goes to stderr and the exit status
# is non-zero so that calling tools can detect the failure.
unless ENCODINGS.member?(@encoding)
  warn "Invalid encoding: #{@encoding}"
  warn cli_opts.to_s
  exit 1
end
76 | |||
##
# Opens the document at +url+ and yields, for every data line that
# belongs to +property+, the codepoint range (as a Range of Integers)
# and the free-text description that follows the '#' on that line.
#
# url      - location of the chart; anything URI.open accepts (a string
#            without a scheme is handed on to Kernel#open, i.e. a path).
# property - property name as written in the chart, e.g. "Alphabetic".
#
# Comment lines (starting with '#') and lines for other properties are
# skipped.

def each_alpha( url, property )
  # Build the matcher once.  The name is escaped so that characters with a
  # regex meaning in it are matched literally.
  wanted = /; #{Regexp.escape(property.to_s)} #/

  # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is the
  # open-uri entry point and closes the stream when the block returns.
  URI.open( url ) do |file|
    file.each_line do |line|
      next if line =~ /^#/
      next if line !~ wanted

      range, description = line.split(/;/)
      range = range.strip
      # Keep only the text after the last '#' on the line.
      description = description.gsub(/.*#/, '').strip

      if range =~ /\.\./
        start, stop = range.split '..'
      else
        start = stop = range
      end

      yield start.hex .. stop.hex, description
    end
  end
end
100 | |||
###
# Renders +n+ as uppercase hexadecimal and, when the digit count is odd,
# prefixes a single "0" so the result always has an even number of digits.

def to_hex( n )
  digits = format("%0X", n)
  digits.length.even? ? digits : "0#{digits}"
end
109 | |||
###
# UCS4 needs no byte-level encoding: the codepoints are written out in
# hex as they are.  Returns a one-element array holding either "0xAA"
# (single codepoint) or "0xAA..0xBB" (proper range).

def to_ucs4( range )
  first = range.begin
  last = range.end
  return [ "0x" + to_hex(first) ] if first == last

  [ "0x" + to_hex(first) + "..0x" + to_hex(last) ]
end
118 | |||
##
# UTF-8 byte layout per codepoint interval:
#
# 0x00 - 0x7f -> 0zzzzzzz[7]
# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]

# Largest codepoint of each encoded length (1 to 4 bytes).
UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]

##
# Encodes codepoint +n+ as UTF-8 and returns the bytes as one hex string
# (two digits per byte, no "0x").  Values above 0x10ffff encode as "00".

def to_utf8_enc( n )
  bytes =
    if n <= 0x7f
      [ n ]
    elsif n <= 0x7ff
      [ 0xc0 | (n >> 6),
        0x80 | (n & 0x3f) ]
    elsif n <= 0xffff
      [ 0xe0 | (n >> 12),
        0x80 | ((n >> 6) & 0x3f),
        0x80 | (n & 0x3f) ]
    elsif n <= 0x10ffff
      [ 0xf0 | (n >> 18),
        0x80 | ((n >> 12) & 0x3f),
        0x80 | ((n >> 6) & 0x3f),
        0x80 | (n & 0x3f) ]
    else
      [ 0 ]
    end

  # Pack the bytes big-endian into one integer before formatting.
  packed = bytes.inject(0) { |acc, byte| (acc << 8) | byte }
  to_hex(packed)
end
150 | |||
##
# Inverse of to_utf8_enc: takes the hex string of a UTF-8 byte sequence
# (e.g. "E282AC") and returns the payload bits as an Integer (0x20AC).
#
# The branch is chosen from the numeric value of the whole sequence, so a
# lone byte above 0x7f (such as "BF") is treated like a two-byte sequence
# with a zero lead byte and yields only its low six bits.  count_codepoints
# relies on this when it measures per-byte ranges.  Values above
# 0xf7ffffff decode to 0.

def from_utf8_enc( n )
  n = n.hex
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0xdfff
    y = (n >> 8) & 0x1f
    z = n & 0x3f
    r = y << 6 | z
  elsif n <= 0xefffff
    x = (n >> 16) & 0x0f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    # Two six-bit continuation groups follow the lead bits, so the lead
    # bits move up by 12 (the previous shift of 10 overlapped them with y).
    r = x << 12 | y << 6 | z
  elsif n <= 0xf7ffffff
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    r = w << 18 | x << 12 | y << 6 | z
  end
  r
end
174 | |||
###
# Cuts +range+ at the UTF8_BOUNDARIES so that every returned piece is
# encoded with a single byte length.  Eg: 0x00 .. 0xff =>
# [0x00..0x7f, 0x80..0xff].  The unicode standard this was written
# against [5.1] has no ranges that cross those boundaries, so the split
# is a safeguard rather than a necessity.  Anything above the last
# boundary is dropped.

def utf8_ranges( range )
  remaining = range
  UTF8_BOUNDARIES.each_with_object([]) do |limit, pieces|
    # This encoded length lies entirely below what is left.
    next if remaining.begin > limit

    # What is left fits under this boundary: it is the final piece.
    return pieces << remaining if remaining.end <= limit

    pieces << (remaining.begin .. limit)
    remaining = (limit + 1) .. remaining.end
  end
end
197 | |||
198 | def build_range( start, stop ) | ||
199 | size = start.size/2 | ||
200 | left = size - 1 | ||
201 | return [""] if size < 1 | ||
202 | |||
203 | a = start[0..1] | ||
204 | b = stop[0..1] | ||
205 | |||
206 | ### | ||
207 | # Shared prefix | ||
208 | |||
209 | if a == b | ||
210 | return build_range(start[2..-1], stop[2..-1]).map do |elt| | ||
211 | "0x#{a} " + elt | ||
212 | end | ||
213 | end | ||
214 | |||
215 | ### | ||
216 | # Unshared prefix, end of run | ||
217 | |||
218 | return ["0x#{a}..0x#{b} "] if left.zero? | ||
219 | |||
220 | ### | ||
221 | # Unshared prefix, not end of run | ||
222 | # Range can be 0x123456..0x56789A | ||
223 | # Which is equivalent to: | ||
224 | # 0x123456 .. 0x12FFFF | ||
225 | # 0x130000 .. 0x55FFFF | ||
226 | # 0x560000 .. 0x56789A | ||
227 | |||
228 | ret = [] | ||
229 | ret << build_range(start, a + "FF" * left) | ||
230 | |||
231 | ### | ||
232 | # Only generate middle range if need be. | ||
233 | |||
234 | if a.hex+1 != b.hex | ||
235 | max = to_hex(b.hex - 1) | ||
236 | max = "FF" if b == "FF" | ||
237 | ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left | ||
238 | end | ||
239 | |||
240 | ### | ||
241 | # Don't generate last range if it is covered by first range | ||
242 | |||
243 | ret << build_range(b + "00" * left, stop) unless b == "FF" | ||
244 | ret.flatten! | ||
245 | end | ||
246 | |||
##
# Converts a codepoint range into the list of Ragel byte-sequence
# expressions that match its UTF-8 encoding: the range is split with
# utf8_ranges, both ends of every piece are encoded with to_utf8_enc, and
# build_range expands each encoded pair.
#
# Always returns an Array.  flat_map is used because flatten! returns nil
# when there is nothing to flatten, e.g. when utf8_ranges yields no pieces.

def to_utf8( range )
  utf8_ranges( range ).flat_map do |r|
    build_range(to_utf8_enc(r.begin), to_utf8_enc(r.end))
  end
end
254 | |||
##
# Validation helpers.  is_valid? performs a 3-way comparison of the number
# of codepoints advertised by the unicode spec for the given range, the
# originally parsed range, and the resulting encoded range.
#
# count_codepoints supplies the last of those numbers for one generated
# expression: it multiplies together the span of every "0xAA..0xBB" term
# in +code+.  Terms without ".." leave the product unchanged, so an
# expression without any range counts as 1.

def count_codepoints( code )
  total = 1
  code.split(' ').each do |term|
    bounds = /0x(.+)\.\.0x(.+)/.match(term)
    next if bounds.nil?

    low, high = bounds[1], bounds[2]
    span =
      if @encoding == :utf8
        from_utf8_enc(high) - from_utf8_enc(low) + 1
      else
        high.hex - low.hex + 1
      end
    total *= span
  end
  total
end
273 | |||
##
# True when three counts agree: the "[N]" figure in +desc+ (taken as 1
# when the description carries no such figure), the size of +range+, and
# the total that count_codepoints reports over all of +codes+.

def is_valid?( range, desc, codes )
  advertised = /\[(\d+)\]/.match(desc)
  spec_count = advertised ? advertised[1].to_i : 1
  range_count = range.end - range.begin + 1

  encoded_count = 0
  codes.each { |code| encoded_count += count_codepoints(code) }

  encoded_count == spec_count && encoded_count == range_count
end
282 | |||
##
# Writes the definition of one state machine to @output: a "name =" line,
# one alternative per encoded range read from @chart_url for +property+,
# and a closing ";" followed by an empty line.  The first alternative is
# introduced by a blank, all later ones by "|".  Only the first line of
# each range carries the range's description as a trailing comment.

def generate_machine( name, property )
  separator = " "
  @output.puts " #{name} = "

  each_alpha( @chart_url, property ) do |range, desc|
    codes = if @encoding == :ucs4
              to_ucs4(range)
            else
              to_utf8(range)
            end

    # Validation is currently switched off:
    #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
    # is_valid? range, desc, codes

    # The code column is at least RANGE_WIDTH wide and grows to fit the
    # longest expression; the description column shrinks by the same
    # amount.
    code_width = codes.map { |code| code.size }.max
    code_width = RANGE_WIDTH if code_width < RANGE_WIDTH
    desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11 - (code_width - RANGE_WIDTH)

    label = desc
    label = desc[0..desc_width - 4] + "..." if desc.size > desc_width

    codes.each_with_index do |code, idx|
      comment = idx.zero? ? label : ""
      @output.puts " #{separator} #{code.ljust(code_width)} ##{comment}"
      separator = "|"
    end
  end

  @output.puts " ;"
  @output.puts ""
end
316 | |||
# Emit the generated Ragel file: a header describing how it was produced,
# the opening of the machine section, one machine per requested property,
# and the closing marker.
@output.puts <<EOF
# The following Ragel file was autogenerated with #{$0}
# from: #{@chart_url}
#
# It defines #{properties}.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
machine #{machine_name};

EOF

# Each property is used both as the machine name and as the chart filter.
properties.each { |x| generate_machine( x, x ) }

@output.puts <<EOF
}%%
EOF

# Close the file opened for -o so its contents are flushed and the handle
# is released; $stdout is left open.
@output.close unless @output.equal?($stdout)