contrib/unicode2ragel.rb

   1 #!/usr/bin/env ruby
   2 #
   3 # This script uses the unicode spec to generate a Ragel state machine
   4 # that recognizes unicode alphanumeric characters.  It generates 5
   5 # character classes: uupper, ulower, ualpha, udigit, and ualnum.
   6 # Currently supported encodings are UTF-8 [default] and UCS-4.
   7 #
   8 # Usage: unicode2ragel.rb [options]
   9 #    -e, --encoding [ucs4 | utf8]     Data encoding
  10 #    -h, --help                       Show this message
  11 #
  12 # This script was originally written as part of the Ferret search
  13 # engine library.
  14 #
  15 # Author: Rakan El-Khalil <rakan@well.com>
  16
  17 require 'optparse'
  18 require 'open-uri'
  19
  20 ENCODINGS = [ :utf8, :ucs4 ]
  21 ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" }
  22 CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
  23
  24 ###
  25 # Display vars & default option
  26
  27 TOTAL_WIDTH = 80
  28 RANGE_WIDTH = 23
  29 @encoding = :utf8
  30
  31 ###
  32 # Option parsing
  33
  34 cli_opts = OptionParser.new do |opts|
  35   opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
  36     @encoding = o.downcase.to_sym
  37   end
  38   opts.on("-h", "--help", "Show this message") do
  39     puts opts
  40     exit
  41   end
  42 end
  43
  44 cli_opts.parse(ARGV)
  45 unless ENCODINGS.member? @encoding
  46   puts "Invalid encoding: #{@encoding}"
  47   puts cli_opts
  48   exit
  49 end
  50
  51 ##
  52 # Downloads the document at url and yields every alpha line's hex
  53 # range and description.
  54
  55 def each_alpha( url, property )
  56   open( url ) do |file|
  57     file.each_line do |line|
  58       next if line =~ /^#/;
  59       next if line !~ /; #{property} #/;
  60
  61       range, description = line.split(/;/)
  62       range.strip!
  63       description.gsub!(/.*#/, '').strip!
  64
  65       if range =~ /\.\./
  66            start, stop = range.split '..'
  67       else start = stop = range
  68       end
  69
  70       yield start.hex .. stop.hex, description
  71     end
  72   end
  73 end
  74
  75 ###
  76 # Formats to hex at minimum width
  77
  78 def to_hex( n )
  79   r = "%0X" % n
  80   r = "0#{r}" unless (r.length % 2).zero?
  81   r
  82 end
  83
  84 ###
  85 # UCS4 is just a straight hex conversion of the unicode codepoint.
  86
  87 def to_ucs4( range )
  88   rangestr  =   "0x" + to_hex(range.begin)
  89   rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
  90   [ rangestr ]
  91 end
  92
  93 ##
  94 # 0x00     - 0x7f     -> 0zzzzzzz[7]
  95 # 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
  96 # 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
  97 # 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
  98
  99 UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
 100
 101 def to_utf8_enc( n )
 102   r = 0
 103   if n <= 0x7f
 104     r = n
 105   elsif n <= 0x7ff
 106     y = 0xc0 | (n >> 6)
 107     z = 0x80 | (n & 0x3f)
 108     r = y << 8 | z
 109   elsif n <= 0xffff
 110     x = 0xe0 | (n >> 12)
 111     y = 0x80 | (n >>  6) & 0x3f
 112     z = 0x80 |  n        & 0x3f
 113     r = x << 16 | y << 8 | z
 114   elsif n <= 0x10ffff
 115     w = 0xf0 | (n >> 18)
 116     x = 0x80 | (n >> 12) & 0x3f
 117     y = 0x80 | (n >>  6) & 0x3f
 118     z = 0x80 |  n        & 0x3f
 119     r = w << 24 | x << 16 | y << 8 | z
 120   end
 121
 122   to_hex(r)
 123 end
 124
 125 def from_utf8_enc( n )
 126   n = n.hex
 127   r = 0
 128   if n <= 0x7f
 129     r = n
 130   elsif n <= 0xdfff
 131     y = (n >> 8) & 0x1f
 132     z =  n       & 0x3f
 133     r = y << 6 | z
 134   elsif n <= 0xefffff
 135     x = (n >> 16) & 0x0f
 136     y = (n >>  8) & 0x3f
 137     z =  n        & 0x3f
 138     r = x << 10 | y << 6 | z
 139   elsif n <= 0xf7ffffff
 140     w = (n >> 24) & 0x07
 141     x = (n >> 16) & 0x3f
 142     y = (n >>  8) & 0x3f
 143     z =  n        & 0x3f
 144     r = w << 18 | x << 12 | y << 6 | z
 145   end
 146   r
 147 end
 148
 149 ###
 150 # Given a range, splits it up into ranges that can be continuously
 151 # encoded into utf8.  Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
 152 # This is not strictly needed since the current [5.1] unicode standard
 153 # doesn't have ranges that straddle utf8 boundaries.  This is included
 154 # for completeness as there is no telling if that will ever change.
 155
 156 def utf8_ranges( range )
 157   ranges = []
 158   UTF8_BOUNDARIES.each do |max|
 159     if range.begin <= max
 160       return ranges << range if range.end <= max
 161
 162       ranges << range.begin .. max
 163       range = (max + 1) .. range.end
 164     end
 165   end
 166   ranges
 167 end
 168
 169 def build_range( start, stop )
 170   size = start.size/2
 171   left = size - 1
 172   return [""] if size < 1
 173
 174   a = start[0..1]
 175   b = stop[0..1]
 176
 177   ###
 178   # Shared prefix
 179
 180   if a == b
 181     return build_range(start[2..-1], stop[2..-1]).map do |elt|
 182       "0x#{a} " + elt
 183     end
 184   end
 185
 186   ###
 187   # Unshared prefix, end of run
 188
 189   return ["0x#{a}..0x#{b} "] if left.zero?
 190
 191   ###
 192   # Unshared prefix, not end of run
 193   # Range can be 0x123456..0x56789A
 194   # Which is equivalent to:
 195   #     0x123456 .. 0x12FFFF
 196   #     0x130000 .. 0x55FFFF
 197   #     0x560000 .. 0x56789A
 198
 199   ret = []
 200   ret << build_range(start, a + "FF" * left)
 201
 202   ###
 203   # Only generate middle range if need be.
 204
 205   if a.hex+1 != b.hex
 206     max = to_hex(b.hex - 1)
 207     max = "FF" if b == "FF"
 208     ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
 209   end
 210
 211   ###
 212   # Don't generate last range if it is covered by first range
 213
 214   ret << build_range(b + "00" * left, stop) unless b == "FF"
 215   ret.flatten!
 216 end
 217
 218 def to_utf8( range )
 219   utf8_ranges( range ).map do |r|
 220     build_range to_utf8_enc(r.begin), to_utf8_enc(r.end)
 221   end.flatten!
 222 end
 223
 224 ##
 225 # Perform a 3-way comparison of the number of codepoints advertised by
 226 # the unicode spec for the given range, the originally parsed range,
 227 # and the resulting utf8 encoded range.
 228
 229 def count_codepoints( code )
 230   code.split(' ').inject(1) do |acc, elt|
 231     if elt =~ /0x(.+)\.\.0x(.+)/
 232       if @encoding == :utf8
 233         acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1)
 234       else
 235         acc * ($2.hex - $1.hex + 1)
 236       end
 237     else
 238       acc
 239     end
 240   end
 241 end
 242
 243 def is_valid?( range, desc, codes )
 244   spec_count  = 1
 245   spec_count  = $1.to_i if desc =~ /\[(\d+)\]/
 246   range_count = range.end - range.begin + 1
 247
 248   sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
 249   sum == spec_count and sum == range_count
 250 end
 251
 252 ##
 253 # Generate the state maching to stdout
 254
 255 def generate_machine( name, property )
 256   pipe = " "
 257   puts "    #{name} = "
 258   each_alpha( CHART_URL, property ) do |range, desc|
 259
 260     codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)
 261
 262     raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
 263       is_valid? range, desc, codes
 264
 265     range_width = codes.map { |a| a.size }.max
 266     range_width = RANGE_WIDTH if range_width < RANGE_WIDTH
 267
 268     desc_width  = TOTAL_WIDTH - RANGE_WIDTH - 11
 269     desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH
 270
 271     if desc.size > desc_width
 272       desc = desc[0..desc_width - 4] + "..."
 273     end
 274
 275     codes.each_with_index do |r, idx|
 276       desc = "" unless idx.zero?
 277       code = "%-#{range_width}s" % r
 278       puts "      #{pipe} #{code} ##{desc}"
 279       pipe = "|"
 280     end
 281   end
 282   puts "      ;"
 283   puts ""
 284 end
 285
 286 puts <<EOF
 287 # The following Ragel file was autogenerated with #{$0}
 288 # from: #{CHART_URL}
 289 #
 290 # It defines ualpha, udigit, ualnum.
 291 #
 292 # To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
 293 # and that your input is in #{@encoding}.
 294
 295 %%{
 296     machine WChar;
 297 EOF
 298 generate_machine( :ualpha, "Alphabetic" )
 299 generate_machine( :ulower, "Lowercase" )
 300 generate_machine( :uupper, "Uppercase" )
 301 puts <<EOF
 302     udigit = '0'..'9';
 303     ualnum = ualpha | udigit;
 304 }%%
 305 EOF