]> git.immae.eu Git - github/fretlink/terraform-provider-statuscake.git/blob - vendor/github.com/apparentlymart/go-textseg/textseg/unicode2ragel.rb
deps: github.com/hashicorp/terraform@sdk-v0.11-with-go-modules
[github/fretlink/terraform-provider-statuscake.git] / vendor / github.com / apparentlymart / go-textseg / textseg / unicode2ragel.rb
1 #!/usr/bin/env ruby
2 #
3 # This script has been updated to accept more command-line arguments:
4 #
5 # -u, --url URL to process
6 # -m, --machine Machine name
7 # -p, --properties Properties to add to the machine
8 # -o, --output Write output to file
9 #
10 # Updated by: Marty Schoch <marty.schoch@gmail.com>
11 #
12 # This script uses the unicode spec to generate a Ragel state machine
13 # that recognizes unicode alphanumeric characters. It generates 5
14 # character classes: uupper, ulower, ualpha, udigit, and ualnum.
15 # Currently supported encodings are UTF-8 [default] and UCS-4.
16 #
17 # Usage: unicode2ragel.rb [options]
18 # -e, --encoding [ucs4 | utf8] Data encoding
19 # -h, --help Show this message
20 #
21 # This script was originally written as part of the Ferret search
22 # engine library.
23 #
24 # Author: Rakan El-Khalil <rakan@well.com>
25
26 require 'optparse'
27 require 'open-uri'
28
# Encodings the generator can emit, and the Ragel alphtype each one needs.
ENCODINGS = [ :utf8, :ucs4 ].freeze
ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }.freeze
DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
DEFAULT_MACHINE_NAME = "WChar"

###
# Display vars & default option

TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8
@chart_url = DEFAULT_CHART_URL
machine_name = DEFAULT_MACHINE_NAME
properties = []
@output = $stdout

###
# Option parsing

cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The bracketed placeholder makes the argument optional, so a bare
    # "-e" yields nil here; keep the default encoding in that case
    # instead of crashing on nil.downcase.
    @encoding = o.downcase.to_sym if o
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
  opts.on("-u", "--url URL", "URL to process") do |o|
    @chart_url = o
  end
  opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
    machine_name = o
  end
  opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
    properties = o
  end
  opts.on("-o", "--output FILE", "output file") do |o|
    @output = File.new(o, "w+")
  end
end

cli_opts.parse(ARGV)
unless ENCODINGS.member? @encoding
  # Report on stderr with a failing status so the error neither pollutes
  # generated output on stdout nor looks like success to the caller.
  $stderr.puts "Invalid encoding: #{@encoding}"
  $stderr.puts cli_opts
  exit 1
end
76
##
# Reads the document at +url+ and yields, for every data line that assigns
# +property+, the codepoint range (a Range of Integers) and the text that
# follows the last '#' on that line.
#
# Lines starting with '#' and lines that do not contain "; <property> #"
# are skipped. A line whose first field has no ".." yields a one-element
# range.

def each_alpha( url, property )
  # Build the matcher once; an interpolated regexp literal inside the loop
  # would be rebuilt for every line.
  wanted = /; #{property} #/

  # Kernel#open no longer fetches URLs on Ruby 3.0+. URI.open (from
  # open-uri) does, and still falls back to Kernel#open for plain paths.
  URI.open( url ) do |file|
    file.each_line do |line|
      next if line =~ /^#/
      next if line !~ wanted

      range, description = line.split(/;/)
      range = range.strip
      description = description.gsub(/.*#/, '').strip

      start, stop = range.include?('..') ? range.split('..') : [range, range]

      yield start.hex .. stop.hex, description
    end
  end
end
100
###
# Renders +n+ as upper-case hex, left-padded with a single "0" when needed
# so the result always has an even number of digits (whole bytes).

def to_hex( n )
  digits = format("%0X", n)
  digits.length.odd? ? "0#{digits}" : digits
end
109
###
# UCS4 needs no byte-level encoding: the range is written as the hex form
# of its endpoints. Returns a one-element array, either "0xAA" for a single
# codepoint or "0xAA..0xBB" for a span.

def to_ucs4( range )
  first = range.begin
  last = range.end
  low = "0x" + to_hex(first)
  return [ low ] if first == last

  [ low + "..0x" + to_hex(last) ]
end
118
##
# UTF-8 byte layout by codepoint size:
#
#   0x00     - 0x7f     -> 0zzzzzzz[7]
#   0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
#   0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
#   0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]

# Largest codepoint representable with 1, 2, 3 and 4 UTF-8 bytes.
UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]

# Encodes codepoint +n+ as UTF-8 and returns the bytes as one hex string
# (via to_hex), e.g. two hex digits per byte for multi-byte sequences.
# Values above 0x10ffff fall through every branch and encode as "00".
def to_utf8_enc( n )
  bytes =
    if n <= 0x7f
      [n]
    elsif n <= 0x7ff
      [0xc0 | (n >> 6),
       0x80 | (n & 0x3f)]
    elsif n <= 0xffff
      [0xe0 | (n >> 12),
       0x80 | ((n >> 6) & 0x3f),
       0x80 | (n & 0x3f)]
    elsif n <= 0x10ffff
      [0xf0 | (n >> 18),
       0x80 | ((n >> 12) & 0x3f),
       0x80 | ((n >> 6) & 0x3f),
       0x80 | (n & 0x3f)]
    else
      [0]
    end

  # Pack the bytes big-endian into a single integer before formatting.
  packed = bytes.inject(0) { |acc, byte| (acc << 8) | byte }
  to_hex(packed)
end
150
# Decodes +n+, a hex string holding a packed UTF-8 byte sequence (the form
# to_utf8_enc produces), back into its codepoint.
#
# The branch is chosen from the magnitude of the packed value, so any
# value above 0x7f and up to 0xdfff is masked as a two-byte sequence; a
# lone byte such as "BF" therefore yields its low six bits.
# Values above 0xf7ffffff return 0.
def from_utf8_enc( n )
  n = n.hex
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0xdfff
    y = (n >> 8) & 0x1f
    z = n & 0x3f
    r = y << 6 | z
  elsif n <= 0xefffff
    x = (n >> 16) & 0x0f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    # The lead byte's 4 payload bits sit above two 6-bit groups, so they
    # shift by 12 (a shift of 10 would overlap the middle group).
    r = x << 12 | y << 6 | z
  elsif n <= 0xf7ffffff
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    r = w << 18 | x << 12 | y << 6 | z
  end
  r
end
174
###
# Splits +range+ at the UTF8_BOUNDARIES so that every returned piece is
# encoded with a single UTF-8 byte length.
# Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# This is not strictly needed since the current [5.1] unicode standard
# doesn't have ranges that straddle utf8 boundaries. It is included for
# completeness as there is no telling if that will ever change.
# Any part of the range above the last boundary is not returned.

def utf8_ranges( range )
  pieces = []
  remaining = range
  UTF8_BOUNDARIES.each do |limit|
    # Nothing of the remaining range lives in this byte-length class.
    next if remaining.begin > limit

    if remaining.end <= limit
      pieces << remaining
      return pieces
    end

    # Take the part up to the boundary and carry the rest forward.
    pieces << (remaining.begin .. limit)
    remaining = (limit + 1) .. remaining.end
  end
  pieces
end
197
# Expands the span between two hex-encoded byte strings into Ragel
# alternatives, one string per alternative, each made of space-terminated
# "0xNN" / "0xNN..0xMM" byte terms.
#
# +start+ and +stop+ are hex strings read two digits (one byte) at a time.
# NOTE(review): the code sizes everything from +start+, so both strings
# are presumably expected to have the same even length -- confirm.
# Trailing bytes are opened up to the full 0x00..0xFF span.
def build_range( start, stop )
  byte_count = start.size / 2
  return [""] if byte_count < 1

  trailing = byte_count - 1
  lo = start[0, 2]
  hi = stop[0, 2]

  ###
  # Shared prefix: emit the common byte and recurse on the remainder.

  if lo == hi
    tails = build_range(start[2..-1], stop[2..-1])
    return tails.map { |tail| "0x#{lo} " + tail }
  end

  ###
  # Unshared prefix, end of run

  return ["0x#{lo}..0x#{hi} "] if trailing.zero?

  ###
  # Unshared prefix, not end of run
  # Range can be 0x123456..0x56789A
  # Which is equivalent to:
  #     0x123456 .. 0x12FFFF
  #     0x130000 .. 0x55FFFF
  #     0x560000 .. 0x56789A

  pieces = [build_range(start, lo + "FF" * trailing)]

  ###
  # The middle range is only needed when the lead bytes are not adjacent.

  unless lo.hex + 1 == hi.hex
    upper = hi == "FF" ? "FF" : to_hex(hi.hex - 1)
    pieces << "0x#{to_hex(lo.hex + 1)}..0x#{upper} " + "0x00..0xFF " * trailing
  end

  ###
  # A lead byte of FF is already covered by the middle range above.

  pieces << build_range(hi + "00" * trailing, stop) if hi != "FF"
  pieces.flatten
end
246
# Converts a codepoint +range+ into the list of Ragel byte-sequence
# alternatives matching its UTF-8 encoding: the range is split per UTF-8
# byte length, both ends of each piece are encoded, and build_range
# expands the span between them.
#
# Always returns an Array. It is empty when utf8_ranges yields no pieces,
# i.e. when the range lies entirely above the last UTF-8 boundary.
def to_utf8( range )
  alternatives = utf8_ranges( range ).map do |r|
    build_range(to_utf8_enc(r.begin), to_utf8_enc(r.end))
  end
  # Non-destructive flatten: flatten! returns nil when there is nothing to
  # flatten (an empty list), which would hand nil to the caller.
  alternatives.flatten
end
254
##
# Counts how many codepoints one generated alternative (+code+, a string
# of space-separated byte terms) covers: the product of the sizes of all
# its "0xAA..0xBB" terms; single-value terms count as 1.
#
# In utf8 mode the term bounds are decoded with from_utf8_enc first;
# otherwise they are read as plain hex. is_valid? uses this for its 3-way
# comparison of the count advertised by the unicode spec, the originally
# parsed range, and the resulting encoded range.

def count_codepoints( code )
  total = 1
  code.split(' ').each do |term|
    bounds = /0x(.+)\.\.0x(.+)/.match(term)
    next unless bounds

    low = bounds[1]
    high = bounds[2]
    width =
      if @encoding == :utf8
        from_utf8_enc(high) - from_utf8_enc(low) + 1
      else
        high.hex - low.hex + 1
      end
    total *= width
  end
  total
end
273
# Three-way consistency check for one chart entry. True only when the
# codepoint total of the generated +codes+ equals both the "[N]" count in
# +desc+ (taken as 1 when the description has none) and the size of the
# parsed +range+.
def is_valid?( range, desc, codes )
  advertised = desc =~ /\[(\d+)\]/ ? $1.to_i : 1
  span = range.end - range.begin + 1

  encoded = 0
  codes.each { |code| encoded += count_codepoints(code) }

  encoded == advertised && encoded == span
end
282
##
# Writes one Ragel definition called +name+ to @output: one alternative per
# encoded range that each_alpha reports for +property+ in @chart_url, each
# followed by a '#' comment that carries the chart description on the
# first alternative of every range.

def generate_machine( name, property )
  # The first alternative is introduced by a blank, every later one by "|".
  separator = " "
  @output.puts " #{name} = "
  each_alpha( @chart_url, property ) do |range, desc|

    codes = case @encoding
            when :ucs4 then to_ucs4(range)
            else to_utf8(range)
            end

    #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
    #  is_valid? range, desc, codes

    # Column for the codes: at least RANGE_WIDTH, wider if a code needs it.
    range_width = codes.map { |c| c.size }.max
    range_width = RANGE_WIDTH if range_width < RANGE_WIDTH

    # Whatever the code column takes beyond RANGE_WIDTH is taken away from
    # the description column.
    desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11
    desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH

    desc = desc[0..desc_width - 4] + "..." if desc.size > desc_width

    codes.each_with_index do |r, idx|
      label = idx.zero? ? desc : ""
      code = r.ljust(range_width)
      @output.puts " #{separator} #{code} ##{label}"
      separator = "|"
    end
  end
  @output.puts " ;"
  @output.puts ""
end
316
# Emit the generated Ragel file: a header comment, the opening of the
# machine block, one definition per requested property, then the closing
# marker.
#
# The property list is joined explicitly: interpolating the Array itself
# renders its inspect form (brackets and quotes) on Ruby 1.9 and later.
@output.puts <<EOF
# The following Ragel file was autogenerated with #{$0}
# from: #{@chart_url}
#
# It defines #{properties.join(', ')}.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
machine #{machine_name};

EOF

# Each property becomes a definition named after the property itself.
properties.each { |x| generate_machine( x, x ) }

@output.puts <<EOF
}%%
EOF
335 EOF