1 files changed, 335 insertions, 0 deletions
diff --git a/vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb b/vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb
new file mode 100644
index 0000000..422e4e5
--- /dev/null
+++ b/vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb
@@ -0,0 +1,335 @@
+#!/usr/bin/env ruby
+#
+# This script has been updated to accept more command-line arguments:
+#
+#    -u, --url                        URL to process
+#    -m, --machine                    Machine name
+#    -p, --properties                 Properties to add to the machine
+#    -o, --output                     Write output to file
+#
+# Updated by: Marty Schoch <marty.schoch@gmail.com>
+# 
+# This script uses the unicode spec to generate a Ragel state machine
+# that recognizes unicode alphanumeric characters.  It generates 5
+# character classes: uupper, ulower, ualpha, udigit, and ualnum.
+# Currently supported encodings are UTF-8 [default] and UCS-4.
+#
+# Usage: unicode2ragel.rb [options]
+#    -e, --encoding [ucs4 | utf8]     Data encoding
+#    -h, --help                       Show this message
+#
+# This script was originally written as part of the Ferret search
+# engine library.
+#
+# Author: Rakan El-Khalil <rakan@well.com>
+require 'optparse'
+require 'open-uri'
###
# Supported encodings, and the Ragel alphtype the generated header tells the
# user to select for each of them.
ENCODINGS = [ :utf8, :ucs4 ]
ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }
DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
DEFAULT_MACHINE_NAME= "WChar"

###
# Display vars & default option
TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8
@chart_url = DEFAULT_CHART_URL
machine_name = DEFAULT_MACHINE_NAME
properties = []
@output = $stdout

###
# Option parsing
cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The argument is declared optional, so a bare "-e" hands this block nil.
    # Keep the current encoding in that case instead of calling nil.downcase.
    @encoding = o.downcase.to_sym if o
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
  opts.on("-u", "--url URL", "URL to process") do |o|
    @chart_url = o
  end
  opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
    machine_name = o
  end
  opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
    properties = o
  end
  opts.on("-o", "--output FILE", "output file") do |o|
    @output = File.new(o, "w+")
  end
end

# Report malformed command lines as a usage error rather than a backtrace.
begin
  cli_opts.parse(ARGV)
rescue OptionParser::ParseError => e
  warn e.message
  warn cli_opts.to_s
  exit 1
end

# Diagnostics go to stderr (stdout is the default destination of the
# generated machine) and the process exits non-zero so callers see the failure.
unless ENCODINGS.member? @encoding
  warn "Invalid encoding: #{@encoding}"
  warn cli_opts.to_s
  exit 1
end
##
# Reads the property chart at +url+ and, for every data line tagged with
# +property+, yields the codepoint range and the description that follows
# the last '#' on that line.
#
# url      - location of the chart (anything open-uri can open).
# property - property name to select, e.g. "Alphabetic"; it is interpolated
#            into a regexp unescaped.
#
# Yields a Range of Integer codepoints and the description String.
def each_alpha( url, property )
  # Kernel#open no longer opens URLs on Ruby 3.0+; open-uri provides
  # URI.open for that.  Older interpreters without a public URI.open keep
  # using the open-uri patched Kernel.open.
  opener = URI.respond_to?(:open) ? URI : Kernel
  opener.open( url ) do |file|
    file.each_line do |line|
      next if line =~ /^#/
      next if line !~ /; #{property} #/

      range, description = line.split(/;/)
      range = range.strip
      # Non-bang calls: gsub!/strip! return nil when they change nothing, so
      # chaining them is fragile.
      description = description.gsub(/.*#/, '').strip

      if range =~ /\.\./
        start, stop = range.split '..'
      else
        start = stop = range
      end

      codepoints = start.hex .. stop.hex
      yield codepoints, description
    end
  end
end
###
# Renders +n+ as upper-case hex, left-padded with a single "0" when needed so
# the result always has an even number of digits (whole bytes).
def to_hex( n )
  digits = "%0X" % n
  digits.length.odd? ? "0#{digits}" : digits
end
###
# UCS-4 needs no byte-level encoding: the codepoints are written out in hex.
# Returns a one-element array holding either "0xAA" (single codepoint) or
# "0xAA..0xBB" (span).
def to_ucs4( range )
  first = range.begin
  last  = range.end
  return [ "0x" + to_hex(first) ] if first == last

  [ "0x" + to_hex(first) + "..0x" + to_hex(last) ]
end
##
# UTF-8 bit layout per codepoint interval:
#
# 0x00     - 0x7f     -> 0zzzzzzz[7]
# 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
# 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]

##
# Encodes codepoint +n+ as UTF-8 and returns the bytes as one hex string
# (e.g. 0x20AC -> "E282AC").  Values above 0x10ffff yield "00".
def to_utf8_enc( n )
  bytes =
    if n <= 0x7f
      [ n ]
    elsif n <= 0x7ff
      [ 0xc0 | (n >> 6),
        0x80 | (n & 0x3f) ]
    elsif n <= 0xffff
      [ 0xe0 | (n >> 12),
        0x80 | ((n >> 6) & 0x3f),
        0x80 | (n & 0x3f) ]
    elsif n <= 0x10ffff
      [ 0xf0 | (n >> 18),
        0x80 | ((n >> 12) & 0x3f),
        0x80 | ((n >> 6) & 0x3f),
        0x80 | (n & 0x3f) ]
    else
      [ 0 ]
    end

  # Pack the bytes big-endian into one integer before formatting.
  packed = bytes.inject(0) { |acc, byte| (acc << 8) | byte }
  to_hex(packed)
end
##
# Inverse of to_utf8_enc: takes the UTF-8 bytes of one character as a hex
# string (e.g. "E282AC") and returns the codepoint as an Integer.
# The sequence length is inferred from the magnitude of the packed value;
# anything above 0xf7ffffff decodes to 0.
def from_utf8_enc( n )
  n = n.hex
  if n <= 0x7f
    n
  elsif n <= 0xdfff
    y = (n >> 8) & 0x1f
    z =  n       & 0x3f
    y << 6 | z
  elsif n <= 0xefffff
    x = (n >> 16) & 0x0f
    y = (n >>  8) & 0x3f
    z =  n        & 0x3f
    # The lead nibble sits above two 6-bit groups, so it shifts by 12
    # (the previous shift of 10 overlapped the y bits).
    x << 12 | y << 6 | z
  elsif n <= 0xf7ffffff
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >>  8) & 0x3f
    z =  n        & 0x3f
    w << 18 | x << 12 | y << 6 | z
  else
    0
  end
end
###
# Cuts +range+ at the UTF-8 length boundaries so that every returned piece
# encodes to a fixed number of bytes, e.g.
#   0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# Any part of the range above the last boundary is not returned.
def utf8_ranges( range )
  pieces    = []
  remaining = range
  UTF8_BOUNDARIES.each do |limit|
    next unless remaining.begin <= limit

    # The rest fits below this boundary: it is the final piece.
    return pieces << remaining if remaining.end <= limit

    pieces << (remaining.begin .. limit)
    remaining = (limit + 1) .. remaining.end
  end
  pieces
end
##
# Turns a pair of equal-length hex byte strings (+start+ .. +stop+) into a
# list of Ragel byte-sequence patterns such as "0xC3 0x80..0xBF ".
# Works byte by byte from the left, recursing on the remaining bytes.
def build_range( start, stop )
  byte_count = start.size / 2
  return [""] if byte_count < 1

  trailing = byte_count - 1
  head = start[0..1]
  tail = stop[0..1]

  ###
  # Same leading byte: emit it once and recurse on the rest.
  if head == tail
    suffixes = build_range(start[2..-1], stop[2..-1])
    return suffixes.map { |suffix| "0x#{head} " + suffix }
  end

  ###
  # Different leading bytes and nothing follows: a plain byte range.
  return ["0x#{head}..0x#{tail} "] if trailing.zero?

  ###
  # Different leading bytes with bytes still to come.
  # 0x123456..0x56789A is split into:
  #     0x123456 .. 0x12FFFF
  #     0x130000 .. 0x55FFFF
  #     0x560000 .. 0x56789A
  tail_is_ff = (tail == "FF")

  pieces = []
  pieces.concat build_range(start, head + "FF" * trailing)

  ###
  # The middle piece exists only when the leading bytes are not adjacent.
  unless head.hex + 1 == tail.hex
    upper = tail_is_ff ? "FF" : to_hex(tail.hex - 1)
    pieces << "0x#{to_hex(head.hex+1)}..0x#{upper} " + "0x00..0xFF " * trailing
  end

  ###
  # A leading byte of FF gets no separate last piece.
  pieces.concat build_range(tail + "00" * trailing, stop) unless tail_is_ff
  pieces
end
##
# Converts a codepoint Range into the list of Ragel byte-sequence patterns
# matching its UTF-8 encoding.  Always returns an Array; it is empty when
# utf8_ranges yields no pieces.
def to_utf8( range )
  # flat_map instead of map + flatten!: flatten! returns nil when there is
  # nothing to flatten, which would hand callers nil for an empty result.
  utf8_ranges( range ).flat_map do |r|
    build_range to_utf8_enc(r.begin), to_utf8_enc(r.end)
  end
end
##
# Multiplies together the sizes of every "0xAA..0xBB" element in the
# space-separated pattern +code+.  Elements without a range count as 1.
# With @encoding == :utf8 the bounds are passed through from_utf8_enc
# first; otherwise they are read as plain hex.
def count_codepoints( code )
  product = 1
  code.split(' ').each do |elt|
    next unless elt =~ /0x(.+)\.\.0x(.+)/

    low, high = $1, $2
    span =
      if @encoding == :utf8
        from_utf8_enc(high) - from_utf8_enc(low) + 1
      else
        high.hex - low.hex + 1
      end
    product *= span
  end
  product
end
##
# Three-way consistency check for one chart entry.  True only when the
# count advertised in +desc+ as "[N]" (1 when absent), the size of +range+
# and the total from count_codepoints over +codes+ all agree.
def is_valid?( range, desc, codes )
  advertised = (desc =~ /\[(\d+)\]/) ? $1.to_i : 1
  spanned    = range.end - range.begin + 1
  encoded    = codes.inject(0) { |total, code| total + count_codepoints(code) }

  encoded == advertised && encoded == spanned
end
##
# Writes one Ragel machine definition to @output.
#
# name     - identifier on the left of the "=" in the generated definition.
# property - chart property whose ranges become the alternatives.
#
# Every pattern is one alternative; all but the first are prefixed with "|".
# The chart description is printed as a trailing comment on the first
# pattern of each range, shortened with "..." to fit TOTAL_WIDTH.
def generate_machine( name, property )
  separator = " "
  @output.puts "    #{name} = "
  each_alpha( @chart_url, property ) do |range, desc|
    codes = @encoding == :ucs4 ? to_ucs4(range) : to_utf8(range)

    # The is_valid? cross-check is deliberately not run here:
    #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
    #  is_valid? range, desc, codes

    # Pad the pattern column to the widest pattern, but at least RANGE_WIDTH;
    # the description column shrinks by whatever the patterns take.
    widest      = codes.map { |code| code.size }.max
    range_width = widest < RANGE_WIDTH ? RANGE_WIDTH : widest
    desc_width  = TOTAL_WIDTH - 11 - range_width

    desc = desc[0..desc_width - 4] + "..." if desc.size > desc_width

    codes.each_with_index do |code, idx|
      label  = idx.zero? ? desc : ""
      padded = "%-#{range_width}s" % code
      @output.puts "      #{separator} #{padded} ##{label}"
      separator = "|"
    end
  end
  @output.puts "      ;"
  @output.puts ""
end
###
# Emit the Ragel file: a header recording how it was generated, one machine
# per requested property (the property name doubles as the machine's
# identifier), then the closing delimiter.
@output.puts <<EOF
# The following Ragel file was autogenerated with #{$0} 
# from: #{@chart_url}
#
# It defines #{properties}.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
    machine #{machine_name};
    
EOF

properties.each { |x| generate_machine( x, x ) }

@output.puts <<EOF
}%%
EOF

# Close the file opened for -o so it is flushed and released explicitly;
# $stdout is left alone.
@output.close unless @output.equal?($stdout)

diff --git a/vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb b/vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb new file mode 100644 index 0000000..422e4e5 --- /dev/null +++ b/vendor/github.com/hashicorp/hcl2/hcl/hclsyntax/unicode2ragel.rb
@@ -0,0 +1,335 @@
	1	#!/usr/bin/env ruby
	2	#
	3	# This script has been updated to accept more command-line arguments:
	4	#
	5	# -u, --url URL to process
	6	# -m, --machine Machine name
	7	# -p, --properties Properties to add to the machine
	8	# -o, --output Write output to file
	9	#
	10	# Updated by: Marty Schoch <marty.schoch@gmail.com>
	11	#
	12	# This script uses the unicode spec to generate a Ragel state machine
	13	# that recognizes unicode alphanumeric characters. It generates 5
	14	# character classes: uupper, ulower, ualpha, udigit, and ualnum.
	15	# Currently supported encodings are UTF-8 [default] and UCS-4.
	16	#
	17	# Usage: unicode2ragel.rb [options]
	18	# -e, --encoding [ucs4 \| utf8] Data encoding
	19	# -h, --help Show this message
	20	#
	21	# This script was originally written as part of the Ferret search
	22	# engine library.
	23	#
	24	# Author: Rakan El-Khalil <rakan@well.com>
	25
	26	require 'optparse'
	27	require 'open-uri'
	28
	29	ENCODINGS = [ :utf8, :ucs4 ]
	30	ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }
	31	DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
	32	DEFAULT_MACHINE_NAME= "WChar"
	33
	34	###
	35	# Display vars & default option
	36
	37	TOTAL_WIDTH = 80
	38	RANGE_WIDTH = 23
	39	@encoding = :utf8
	40	@chart_url = DEFAULT_CHART_URL
	41	machine_name = DEFAULT_MACHINE_NAME
	42	properties = []
	43	@output = $stdout
	44
	45	###
	46	# Option parsing
	47
	48	cli_opts = OptionParser.new do \|opts\|
	49	opts.on("-e", "--encoding [ucs4 \| utf8]", "Data encoding") do \|o\|
	50	@encoding = o.downcase.to_sym
	51	end
	52	opts.on("-h", "--help", "Show this message") do
	53	puts opts
	54	exit
	55	end
	56	opts.on("-u", "--url URL", "URL to process") do \|o\|
	57	@chart_url = o
	58	end
	59	opts.on("-m", "--machine MACHINE_NAME", "Machine name") do \|o\|
	60	machine_name = o
	61	end
	62	opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do \|o\|
	63	properties = o
	64	end
	65	opts.on("-o", "--output FILE", "output file") do \|o\|
	66	@output = File.new(o, "w+")
	67	end
	68	end
	69
	70	cli_opts.parse(ARGV)
	71	unless ENCODINGS.member? @encoding
	72	puts "Invalid encoding: #{@encoding}"
	73	puts cli_opts
	74	exit
	75	end
	76
	77	##
	78	# Downloads the document at url and yields every alpha line's hex
	79	# range and description.
	80
	81	def each_alpha( url, property )
	82	open( url ) do \|file\|
	83	file.each_line do \|line\|
	84	next if line =~ /^#/;
	85	next if line !~ /; #{property} #/;
	86
	87	range, description = line.split(/;/)
	88	range.strip!
	89	description.gsub!(/.*#/, '').strip!
	90
	91	if range =~ /\.\./
	92	start, stop = range.split '..'
	93	else start = stop = range
	94	end
	95
	96	yield start.hex .. stop.hex, description
	97	end
	98	end
	99	end
	100
	101	###
	102	# Formats to hex at minimum width
	103
	104	def to_hex( n )
	105	r = "%0X" % n
	106	r = "0#{r}" unless (r.length % 2).zero?
	107	r
	108	end
	109
	110	###
	111	# UCS4 is just a straight hex conversion of the unicode codepoint.
	112
	113	def to_ucs4( range )
	114	rangestr = "0x" + to_hex(range.begin)
	115	rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
	116	[ rangestr ]
	117	end
	118
	119	##
	120	# 0x00 - 0x7f -> 0zzzzzzz[7]
	121	# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
	122	# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
	123	# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
	124
	125	UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
	126
	127	def to_utf8_enc( n )
	128	r = 0
	129	if n <= 0x7f
	130	r = n
	131	elsif n <= 0x7ff
	132	y = 0xc0 \| (n >> 6)
	133	z = 0x80 \| (n & 0x3f)
	134	r = y << 8 \| z
	135	elsif n <= 0xffff
	136	x = 0xe0 \| (n >> 12)
	137	y = 0x80 \| (n >> 6) & 0x3f
	138	z = 0x80 \| n & 0x3f
	139	r = x << 16 \| y << 8 \| z
	140	elsif n <= 0x10ffff
	141	w = 0xf0 \| (n >> 18)
	142	x = 0x80 \| (n >> 12) & 0x3f
	143	y = 0x80 \| (n >> 6) & 0x3f
	144	z = 0x80 \| n & 0x3f
	145	r = w << 24 \| x << 16 \| y << 8 \| z
	146	end
	147
	148	to_hex(r)
	149	end
	150
	151	def from_utf8_enc( n )
	152	n = n.hex
	153	r = 0
	154	if n <= 0x7f
	155	r = n
	156	elsif n <= 0xdfff
	157	y = (n >> 8) & 0x1f
	158	z = n & 0x3f
	159	r = y << 6 \| z
	160	elsif n <= 0xefffff
	161	x = (n >> 16) & 0x0f
	162	y = (n >> 8) & 0x3f
	163	z = n & 0x3f
	164	r = x << 10 \| y << 6 \| z
	165	elsif n <= 0xf7ffffff
	166	w = (n >> 24) & 0x07
	167	x = (n >> 16) & 0x3f
	168	y = (n >> 8) & 0x3f
	169	z = n & 0x3f
	170	r = w << 18 \| x << 12 \| y << 6 \| z
	171	end
	172	r
	173	end
	174
	175	###
	176	# Given a range, splits it up into ranges that can be continuously
	177	# encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
	178	# This is not strictly needed since the current [5.1] unicode standard
	179	# doesn't have ranges that straddle utf8 boundaries. This is included
	180	# for completeness as there is no telling if that will ever change.
	181
	182	def utf8_ranges( range )
	183	ranges = []
	184	UTF8_BOUNDARIES.each do \|max\|
	185	if range.begin <= max
	186	if range.end <= max
	187	ranges << range
	188	return ranges
	189	end
	190
	191	ranges << (range.begin .. max)
	192	range = (max + 1) .. range.end
	193	end
	194	end
	195	ranges
	196	end
	197
	198	def build_range( start, stop )
	199	size = start.size/2
	200	left = size - 1
	201	return [""] if size < 1
	202
	203	a = start[0..1]
	204	b = stop[0..1]
	205
	206	###
	207	# Shared prefix
	208
	209	if a == b
	210	return build_range(start[2..-1], stop[2..-1]).map do \|elt\|
	211	"0x#{a} " + elt
	212	end
	213	end
	214
	215	###
	216	# Unshared prefix, end of run
	217
	218	return ["0x#{a}..0x#{b} "] if left.zero?
	219
	220	###
	221	# Unshared prefix, not end of run
	222	# Range can be 0x123456..0x56789A
	223	# Which is equivalent to:
	224	# 0x123456 .. 0x12FFFF
	225	# 0x130000 .. 0x55FFFF
	226	# 0x560000 .. 0x56789A
	227
	228	ret = []
	229	ret << build_range(start, a + "FF" * left)
	230
	231	###
	232	# Only generate middle range if need be.
	233
	234	if a.hex+1 != b.hex
	235	max = to_hex(b.hex - 1)
	236	max = "FF" if b == "FF"
	237	ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
	238	end
	239
	240	###
	241	# Don't generate last range if it is covered by first range
	242
	243	ret << build_range(b + "00" * left, stop) unless b == "FF"
	244	ret.flatten!
	245	end
	246
	247	def to_utf8( range )
	248	utf8_ranges( range ).map do \|r\|
	249	begin_enc = to_utf8_enc(r.begin)
	250	end_enc = to_utf8_enc(r.end)
	251	build_range begin_enc, end_enc
	252	end.flatten!
	253	end
	254
	255	##
	256	# Perform a 3-way comparison of the number of codepoints advertised by
	257	# the unicode spec for the given range, the originally parsed range,
	258	# and the resulting utf8 encoded range.
	259
	260	def count_codepoints( code )
	261	code.split(' ').inject(1) do \|acc, elt\|
	262	if elt =~ /0x(.+)\.\.0x(.+)/
	263	if @encoding == :utf8
	264	acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1)
	265	else
	266	acc * ($2.hex - $1.hex + 1)
	267	end
	268	else
	269	acc
	270	end
	271	end
	272	end
	273
	274	def is_valid?( range, desc, codes )
	275	spec_count = 1
	276	spec_count = $1.to_i if desc =~ /\[(\d+)\]/
	277	range_count = range.end - range.begin + 1
	278
	279	sum = codes.inject(0) { \|acc, elt\| acc + count_codepoints(elt) }
	280	sum == spec_count and sum == range_count
	281	end
	282
	283	##
	284	# Generate the state maching to stdout
	285
	286	def generate_machine( name, property )
	287	pipe = " "
	288	@output.puts " #{name} = "
	289	each_alpha( @chart_url, property ) do \|range, desc\|
	290
	291	codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)
	292
	293	#raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
	294	# is_valid? range, desc, codes
	295
	296	range_width = codes.map { \|a\| a.size }.max
	297	range_width = RANGE_WIDTH if range_width < RANGE_WIDTH
	298
	299	desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11
	300	desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH
	301
	302	if desc.size > desc_width
	303	desc = desc[0..desc_width - 4] + "..."
	304	end
	305
	306	codes.each_with_index do \|r, idx\|
	307	desc = "" unless idx.zero?
	308	code = "%-#{range_width}s" % r
	309	@output.puts " #{pipe} #{code} ##{desc}"
	310	pipe = "\|"
	311	end
	312	end
	313	@output.puts " ;"
	314	@output.puts ""
	315	end
	316
	317	@output.puts <<EOF
	318	# The following Ragel file was autogenerated with #{$0}
	319	# from: #{@chart_url}
	320	#
	321	# It defines #{properties}.
	322	#
	323	# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
	324	# and that your input is in #{@encoding}.
	325
	326	%%{
	327	machine #{machine_name};
	328
	329	EOF
	330
	331	properties.each { \|x\| generate_machine( x, x ) }
	332
	333	@output.puts <<EOF
	334	}%%
	335	EOF