3 # This script uses the unicode spec to generate a Ragel state machine
4 # that recognizes unicode alphanumeric characters. It generates 5
5 # character classes: uupper, ulower, ualpha, udigit, and ualnum.
6 # Currently supported encodings are UTF-8 [default] and UCS-4.
8 # Usage: unicode2ragel.rb [options]
9 # -e, --encoding [ucs4 | utf8] Data encoding
10 # -h, --help Show this message
12 # This script was originally written as part of the Ferret search
15 # Author: Rakan El-Khalil <rakan@well.com>
20 ENCODINGS = [ :utf8, :ucs4 ]
21 ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" }
22 CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
25 # Display vars & default option
34 cli_opts = OptionParser.new do |opts|
35 opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
36 @encoding = o.downcase.to_sym
38 opts.on("-h", "--help", "Show this message") do
45 unless ENCODINGS.member? @encoding
46 puts "Invalid encoding: #{@encoding}"
52 # Downloads the document at url and yields every alpha line's hex
53 # range and description.
55 def each_alpha( url, property )
57 file.each_line do |line|
59 next if line !~ /; #{property} #/;
61 range, description = line.split(/;/)
63 description.gsub!(/.*#/, '').strip!
66 start, stop = range.split '..'
67 else start = stop = range
70 yield start.hex .. stop.hex, description
76 # Formats to hex at minimum width
80 r = "0#{r}" unless (r.length % 2).zero?
85 # UCS4 is just a straight hex conversion of the unicode codepoint.
88 rangestr = "0x" + to_hex(range.begin)
89 rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
94 # 0x00 - 0x7f -> 0zzzzzzz[7]
95 # 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
96 # 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
97 # 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
99 UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
107 z = 0x80 | (n & 0x3f)
111 y = 0x80 | (n >> 6) & 0x3f
113 r = x << 16 | y << 8 | z
116 x = 0x80 | (n >> 12) & 0x3f
117 y = 0x80 | (n >> 6) & 0x3f
119 r = w << 24 | x << 16 | y << 8 | z
125 def from_utf8_enc( n )
138 r = x << 10 | y << 6 | z
139 elsif n <= 0xf7ffffff
144 r = w << 18 | x << 12 | y << 6 | z
150 # Given a range, splits it up into ranges that can be continuously
151 # encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
152 # This is not strictly needed since the current [5.1] unicode standard
153 # doesn't have ranges that straddle utf8 boundaries. This is included
154 # for completeness as there is no telling if that will ever change.
# Splits +range+ (a Range of Integer codepoints) at the UTF8_BOUNDARIES
# limits, so that each returned sub-range lies entirely between two
# consecutive boundaries.
#
# range - Range of Integer codepoints (inclusive).
#
# Returns an Array of Ranges in ascending order. Any part of +range+ that
# lies above the last entry of UTF8_BOUNDARIES is not included.
def utf8_ranges( range )
  ranges = []
  UTF8_BOUNDARIES.each do |max|
    # Boundaries entirely below the (remaining) range are irrelevant.
    next if range.begin > max

    # The remainder fits under this boundary: it is the last piece.
    return ranges << range if range.end <= max

    # The parentheses are required: `ranges << range.begin .. max` parses as
    # `(ranges << range.begin) .. max` because << binds tighter than ..,
    # which appends a bare Integer and then raises on the bogus Range.
    ranges << (range.begin .. max)
    range = (max + 1) .. range.end
  end
  ranges
end
169 def build_range( start, stop )
172 return [""] if size < 1
181 return build_range(start[2..-1], stop[2..-1]).map do |elt|
187 # Unshared prefix, end of run
189 return ["0x#{a}..0x#{b} "] if left.zero?
192 # Unshared prefix, not end of run
193 # Range can be 0x123456..0x56789A
194 # Which is equivalent to:
195 # 0x123456 .. 0x12FFFF
196 # 0x130000 .. 0x55FFFF
197 # 0x560000 .. 0x56789A
200 ret << build_range(start, a + "FF" * left)
203 # Only generate middle range if need be.
206 max = to_hex(b.hex - 1)
207 max = "FF" if b == "FF"
208 ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
212 # Don't generate last range if it is covered by first range
214 ret << build_range(b + "00" * left, stop) unless b == "FF"
219 utf8_ranges( range ).map do |r|
220 build_range to_utf8_enc(r.begin), to_utf8_enc(r.end)
225 # Perform a 3-way comparison of the number of codepoints advertised by
226 # the unicode spec for the given range, the originally parsed range,
227 # and the resulting utf8 encoded range.
229 def count_codepoints( code )
230 code.split(' ').inject(1) do |acc, elt|
231 if elt =~ /0x(.+)\.\.0x(.+)/
232 if @encoding == :utf8
233 acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1)
235 acc * ($2.hex - $1.hex + 1)
243 def is_valid?( range, desc, codes )
245 spec_count = $1.to_i if desc =~ /\[(\d+)\]/
246 range_count = range.end - range.begin + 1
248 sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
249 sum == spec_count and sum == range_count
253 # Generate the state machine to stdout
255 def generate_machine( name, property )
258 each_alpha( CHART_URL, property ) do |range, desc|
260 codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)
262 raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
263 is_valid? range, desc, codes
265 range_width = codes.map { |a| a.size }.max
266 range_width = RANGE_WIDTH if range_width < RANGE_WIDTH
268 desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11
269 desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH
271 if desc.size > desc_width
272 desc = desc[0..desc_width - 4] + "..."
275 codes.each_with_index do |r, idx|
276 desc = "" unless idx.zero?
277 code = "%-#{range_width}s" % r
278 puts " #{pipe} #{code} ##{desc}"
287 # The following Ragel file was autogenerated with #{$0}
290 # It defines ualpha, udigit, ualnum.
292 # To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
293 # and that your input is in #{@encoding}.
298 generate_machine( :ualpha, "Alphabetic" )
299 generate_machine( :ulower, "Lowercase" )
300 generate_machine( :uupper, "Uppercase" )
303 ualnum = ualpha | udigit;