From: thurston Date: Thu, 9 Oct 2008 23:29:27 +0000 (+0000) Subject: Script for generating unicode machines. From Rakan El-Khalil. X-Git-Tag: 2.0_alpha~92 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=25c99017043761efbff41b4048a99022ea73903e;p=external%2Fragel.git Script for generating unicode machines. From Rakan El-Khalil. git-svn-id: http://svn.complang.org/ragel/trunk@494 052ea7fc-9027-0410-9066-f65837a77df0 --- diff --git a/examples/unicode2ragel.rb b/examples/unicode2ragel.rb new file mode 100644 index 0000000..d64e601 --- /dev/null +++ b/examples/unicode2ragel.rb @@ -0,0 +1,305 @@ +#!/usr/bin/env ruby +# +# This script uses the unicode spec to generate a Ragel state machine +# that recognizes unicode alphanumeric characters. It generates 5 +# character classes: uupper, ulower, ualpha, udigit, and ualnum. +# Currently supported encodings are UTF-8 [default] and UCS-4. +# +# Usage: unicode2ragel.rb [options] +# -e, --encoding [ucs4 | utf8] Data encoding +# -h, --help Show this message +# +# This script was originally written as part of the Ferret search +# engine library. +# +# Author: Rakan El-Khalil + +require 'optparse' +require 'open-uri' + +ENCODINGS = [ :utf8, :ucs4 ] +ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" } +CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt" + +### +# Display vars & default option + +TOTAL_WIDTH = 80 +RANGE_WIDTH = 23 +@encoding = :utf8 + +### +# Option parsing + +cli_opts = OptionParser.new do |opts| + opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o| + @encoding = o.downcase.to_sym + end + opts.on("-h", "--help", "Show this message") do + puts opts + exit + end +end + +cli_opts.parse(ARGV) +unless ENCODINGS.member? @encoding + puts "Invalid encoding: #{@encoding}" + puts cli_opts + exit +end + +## +# Downloads the document at url and yields every alpha line's hex +# range and description. + +def each_alpha( url, property ) + open( url ) do |file| + file.each_line do |line| + next if line =~ /^#/; + next if line !~ /; #{property} #/; + + range, description = line.split(/;/) + range.strip! + description.gsub!(/.*#/, '').strip! + + if range =~ /\.\./ + start, stop = range.split '..' + else start = stop = range + end + + yield start.hex .. stop.hex, description + end + end +end + +### +# Formats to hex at minimum width + +def to_hex( n ) + r = "%0X" % n + r = "0#{r}" unless (r.length % 2).zero? + r +end + +### +# UCS4 is just a straight hex conversion of the unicode codepoint. + +def to_ucs4( range ) + rangestr = "0x" + to_hex(range.begin) + rangestr << "..0x" + to_hex(range.end) if range.begin != range.end + [ rangestr ] +end + +## +# 0x00 - 0x7f -> 0zzzzzzz[7] +# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6] +# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6] +# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6] + +UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff] + +def to_utf8_enc( n ) + r = 0 + if n <= 0x7f + r = n + elsif n <= 0x7ff + y = 0xc0 | (n >> 6) + z = 0x80 | (n & 0x3f) + r = y << 8 | z + elsif n <= 0xffff + x = 0xe0 | (n >> 12) + y = 0x80 | (n >> 6) & 0x3f + z = 0x80 | n & 0x3f + r = x << 16 | y << 8 | z + elsif n <= 0x10ffff + w = 0xf0 | (n >> 18) + x = 0x80 | (n >> 12) & 0x3f + y = 0x80 | (n >> 6) & 0x3f + z = 0x80 | n & 0x3f + r = w << 24 | x << 16 | y << 8 | z + end + + to_hex(r) +end + +def from_utf8_enc( n ) + n = n.hex + r = 0 + if n <= 0x7f + r = n + elsif n <= 0xdfff + y = (n >> 8) & 0x1f + z = n & 0x3f + r = y << 6 | z + elsif n <= 0xefffff + x = (n >> 16) & 0x0f + y = (n >> 8) & 0x3f + z = n & 0x3f + r = x << 10 | y << 6 | z + elsif n <= 0xf7ffffff + w = (n >> 24) & 0x07 + x = (n >> 16) & 0x3f + y = (n >> 8) & 0x3f + z = n & 0x3f + r = w << 18 | x << 12 | y << 6 | z + end + r +end + +### +# Given a range, splits it up into ranges that can be continuously +# encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff] +# This is not strictly needed since the current [5.1] unicode standard +# doesn't have ranges that straddle utf8 boundaries. This is included +# for completeness as there is no telling if that will ever change. + +def utf8_ranges( range ) + ranges = [] + UTF8_BOUNDARIES.each do |max| + if range.begin <= max + return ranges << range if range.end <= max + + ranges << range.begin .. max + range = (max + 1) .. range.end + end + end + ranges +end + +def build_range( start, stop ) + size = start.size/2 + left = size - 1 + return [""] if size < 1 + + a = start[0..1] + b = stop[0..1] + + ### + # Shared prefix + + if a == b + return build_range(start[2..-1], stop[2..-1]).map do |elt| + "0x#{a} " + elt + end + end + + ### + # Unshared prefix, end of run + + return ["0x#{a}..0x#{b} "] if left.zero? + + ### + # Unshared prefix, not end of run + # Range can be 0x123456..0x56789A + # Which is equivalent to: + # 0x123456 .. 0x12FFFF + # 0x130000 .. 0x55FFFF + # 0x560000 .. 0x56789A + + ret = [] + ret << build_range(start, a + "FF" * left) + + ### + # Only generate middle range if need be. + + if a.hex+1 != b.hex + max = to_hex(b.hex - 1) + max = "FF" if b == "FF" + ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left + end + + ### + # Don't generate last range if it is covered by first range + + ret << build_range(b + "00" * left, stop) unless b == "FF" + ret.flatten! +end + +def to_utf8( range ) + utf8_ranges( range ).map do |r| + build_range to_utf8_enc(r.begin), to_utf8_enc(r.end) + end.flatten! +end + +## +# Perform a 3-way comparison of the number of codepoints advertised by +# the unicode spec for the given range, the originally parsed range, +# and the resulting utf8 encoded range. + +def count_codepoints( code ) + code.split(' ').inject(1) do |acc, elt| + if elt =~ /0x(.+)\.\.0x(.+)/ + if @encoding == :utf8 + acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1) + else + acc * ($2.hex - $1.hex + 1) + end + else + acc + end + end +end + +def is_valid?( range, desc, codes ) + spec_count = 1 + spec_count = $1.to_i if desc =~ /\[(\d+)\]/ + range_count = range.end - range.begin + 1 + + sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) } + sum == spec_count and sum == range_count +end + +## +# Generate the state maching to stdout + +def generate_machine( name, property ) + pipe = " " + puts " #{name} = " + each_alpha( CHART_URL, property ) do |range, desc| + + codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range) + + raise "Invalid encoding of range #{range}: #{codes.inspect}" unless + is_valid? range, desc, codes + + range_width = codes.map { |a| a.size }.max + range_width = RANGE_WIDTH if range_width < RANGE_WIDTH + + desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11 + desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH + + if desc.size > desc_width + desc = desc[0..desc_width - 4] + "..." + end + + codes.each_with_index do |r, idx| + desc = "" unless idx.zero? + code = "%-#{range_width}s" % r + puts " #{pipe} #{code} ##{desc}" + pipe = "|" + end + end + puts " ;" + puts "" +end + +puts <