]>
git.immae.eu Git - github/fretlink/terraform-provider-statuscake.git/blob - vendor/github.com/apparentlymart/go-textseg/textseg/unicode2ragel.rb
3 # This script has been updated to accept more command-line arguments:
5 # -u, --url URL to process
6 # -m, --machine Machine name
7 # -p, --properties Properties to add to the machine
8 # -o, --output Write output to file
10 # Updated by: Marty Schoch <marty.schoch@gmail.com>
12 # This script uses the unicode spec to generate a Ragel state machine
13 # that recognizes unicode alphanumeric characters. It generates 5
14 # character classes: uupper, ulower, ualpha, udigit, and ualnum.
15 # Currently supported encodings are UTF-8 [default] and UCS-4.
17 # Usage: unicode2ragel.rb [options]
18 # -e, --encoding [ucs4 | utf8] Data encoding
19 # -h, --help Show this message
21 # This script was originally written as part of the Ferret search
24 # Author: Rakan El-Khalil <rakan@well.com>
29 ENCODINGS
= [ :utf8, :ucs4 ]
30 ALPHTYPES
= { :utf8 => "byte", :ucs4 => "rune" }
31 DEFAULT_CHART_URL
= "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
32 DEFAULT_MACHINE_NAME
= "WChar"
35 # Display vars & default option
40 @chart_url = DEFAULT_CHART_URL
41 machine_name
= DEFAULT_MACHINE_NAME
48 cli_opts
= OptionParser
.new
do |opts
|
49 opts
.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o
|
50 @encoding = o
.downcase
.to_sym
52 opts
.on("-h", "--help", "Show this message") do
56 opts
.on("-u", "--url URL", "URL to process") do |o
|
59 opts
.on("-m", "--machine MACHINE_NAME", "Machine name") do |o
|
62 opts
.on("-p", "--properties x,y,z", Array
, "Properties to add to machine") do |o
|
65 opts
.on("-o", "--output FILE", "output file") do |o
|
66 @output = File
.new(o
, "w+")
71 unless ENCODINGS.member? @encoding
72 puts "Invalid encoding
: #{@encoding}"
78 # Downloads the document at url and yields the hex range and
79 # description of every line that matches the given property.
81 def each_alpha( url, property )
83 file.each_line do |line|
85 next if line !~
/; #{property} #/;
87 range
, description
= line
.split(/;/)
89 description
.gsub!
(/.*#/, '').strip!
92 start
, stop
= range
.split
'..'
93 else start
= stop
= range
96 yield start
.hex
.. stop
.hex
, description
102 # Formats to hex at minimum width
106 r
= "0#{r}" unless (r
.length
% 2).zero
?
111 # UCS4 is just a straight hex conversion of the unicode codepoint.
114 rangestr
= "0x" +
to_hex(range
.begin)
115 rangestr
<< "..0x" +
to_hex(range
.end) if range
.begin !
= range
.end
120 # 0x00 - 0x7f -> 0zzzzzzz[7]
121 # 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
122 # 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
123 # 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
125 UTF8_BOUNDARIES
= [0x7f, 0x7ff, 0xffff, 0x10ffff]
133 z
= 0x80 | (n
& 0x3f)
137 y
= 0x80 | (n
>> 6) & 0x3f
139 r
= x
<< 16 | y
<< 8 | z
142 x
= 0x80 | (n
>> 12) & 0x3f
143 y
= 0x80 | (n
>> 6) & 0x3f
145 r
= w
<< 24 | x
<< 16 | y
<< 8 | z
151 def from_utf8_enc( n
)
164 r
= x
<< 10 | y
<< 6 | z
165 elsif n
<= 0xf7ffffff
170 r
= w
<< 18 | x
<< 12 | y
<< 6 | z
176 # Given a range, splits it up into ranges that can be continuously
177 # encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
178 # This is not strictly needed since the current [5.1] unicode standard
179 # doesn't have ranges that straddle utf8 boundaries. This is included
180 # for completeness as there is no telling if that will ever change.
182 def utf8_ranges( range
)
184 UTF8_BOUNDARIES
.each
do |max
|
185 if range
.begin <= max
191 ranges
<< (range
.begin .. max
)
192 range
= (max +
1) .. range
.end
198 def build_range( start
, stop
)
201 return [""] if size
< 1
210 return build_range(start
[2..-1], stop
[2..-1]).map
do |elt
|
216 # Unshared prefix, end of run
218 return ["0x#{a}..0x#{b} "] if left
.zero
?
221 # Unshared prefix, not end of run
222 # Range can be 0x123456..0x56789A
223 # Which is equivalent to:
224 # 0x123456 .. 0x12FFFF
225 # 0x130000 .. 0x55FFFF
226 # 0x560000 .. 0x56789A
229 ret
<< build_range(start
, a +
"FF" * left
)
232 # Only generate middle range if need be.
235 max
= to_hex(b
.hex
- 1)
236 max
= "FF" if b
== "FF"
237 ret
<< "0x#{to_hex(a.hex+1)}..0x#{max} " +
"0x00..0xFF " * left
241 # Don't generate last range if it is covered by first range
243 ret
<< build_range(b +
"00" * left
, stop
) unless b
== "FF"
248 utf8_ranges( range
).map
do |r
|
249 begin_enc
= to_utf8_enc(r
.begin)
250 end_enc
= to_utf8_enc(r
.end)
251 build_range begin_enc
, end_enc
256 # Perform a 3-way comparison of the number of codepoints advertised by
257 # the unicode spec for the given range, the originally parsed range,
258 # and the resulting utf8 encoded range.
260 def count_codepoints( code
)
261 code
.split(' ').inject(1) do |acc
, elt
|
262 if elt
=~
/0x(.+)\.\.0x(.+)/
263 if @encoding == :utf8
264 acc
* (from_utf8_enc($2) - from_utf8_enc($1) +
1)
266 acc
* ($2.hex
- $1.hex +
1)
274 def is_valid
?( range
, desc
, codes
)
276 spec_count
= $1.to_i
if desc
=~
/\[(\d+)\]/
277 range_count
= range
.end - range
.begin +
1
279 sum
= codes
.inject(0) { |acc
, elt
| acc +
count_codepoints(elt
) }
280 sum
== spec_count
and sum
== range_count
284 # Generate the state machine and write it to @output
286 def generate_machine( name
, property
)
288 @output.puts
" #{name} = "
289 each_alpha( @chart_url, property
) do |range
, desc
|
291 codes
= (@encoding == :ucs4) ? to_ucs4(range
) : to_utf8(range
)
293 #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
294 # is_valid? range, desc, codes
296 range_width
= codes
.map
{ |a
| a
.size
}.max
297 range_width
= RANGE_WIDTH
if range_width
< RANGE_WIDTH
299 desc_width
= TOTAL_WIDTH
- RANGE_WIDTH
- 11
300 desc_width
-= (range_width
- RANGE_WIDTH
) if range_width
> RANGE_WIDTH
302 if desc
.size
> desc_width
303 desc
= desc
[0..desc_width
- 4] +
"..."
306 codes
.each_with_index
do |r
, idx
|
307 desc
= "" unless idx
.zero
?
308 code
= "%-#{range_width}s" % r
309 @output.puts
" #{pipe} #{code} ##{desc}"
318 # The following Ragel file was autogenerated with #{$0}
319 # from: #{@chart_url}
321 # It defines #{properties}.
323 # To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
324 # and that your input is in #{@encoding}.
327 machine #{machine_name};
331 properties
.each
{ |x
| generate_machine( x
, x
) }