X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=glib%2Fgen-unicode-tables.pl;h=ebcb4a44c6211c51adb5596d8627c86938a77e40;hb=35eaf037bdfca985abf5d349e7355f1d2ed9c77b;hp=6ac82098f85e16fc4644f79d2a1b3ccead409669;hpb=05f99527eb9391723bac93089f691aa522e3640e;p=platform%2Fupstream%2Fglib.git diff --git a/glib/gen-unicode-tables.pl b/glib/gen-unicode-tables.pl index 6ac8209..ebcb4a4 100755 --- a/glib/gen-unicode-tables.pl +++ b/glib/gen-unicode-tables.pl @@ -14,16 +14,13 @@ # GNU General Public License for more details. # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA -# 02111-1307, USA. +# along with this program; if not, see . # Contributer(s): # Andrew Taylor # gen-unicode-tables.pl - Generate tables for libunicode from Unicode data. # See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html -# Usage: gen-unicode-tables.pl [-decomp | -both] UNICODE-VERSION UnicodeData.txt LineBreak.txt SpecialCasing.txt CaseFolding.txt # I consider the output of this program to be unrestricted. Use it as # you will. @@ -34,6 +31,8 @@ # we use some perl unicode features require 5.006; +use bytes; + use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION); @@ -78,7 +77,7 @@ $FOLDING_MAPPING = 2; 'Ll' => "G_UNICODE_LOWERCASE_LETTER", 'Lt' => "G_UNICODE_TITLECASE_LETTER", 'Mn' => "G_UNICODE_NON_SPACING_MARK", - 'Mc' => "G_UNICODE_COMBINING_MARK", + 'Mc' => "G_UNICODE_SPACING_MARK", 'Me' => "G_UNICODE_ENCLOSING_MARK", 'Nd' => "G_UNICODE_DECIMAL_NUMBER", 'Nl' => "G_UNICODE_LETTER_NUMBER", @@ -110,37 +109,46 @@ $FOLDING_MAPPING = 2; %break_mappings = ( - 'BK' => "G_UNICODE_BREAK_MANDATORY", - 'CR' => "G_UNICODE_BREAK_CARRIAGE_RETURN", - 'LF' => "G_UNICODE_BREAK_LINE_FEED", - 'CM' => "G_UNICODE_BREAK_COMBINING_MARK", - 'SG' => "G_UNICODE_BREAK_SURROGATE", - 'ZW' => "G_UNICODE_BREAK_ZERO_WIDTH_SPACE", - 'IN' => "G_UNICODE_BREAK_INSEPARABLE", - 'GL' => "G_UNICODE_BREAK_NON_BREAKING_GLUE", - 'CB' => "G_UNICODE_BREAK_CONTINGENT", - 'SP' => "G_UNICODE_BREAK_SPACE", + 'AI' => "G_UNICODE_BREAK_AMBIGUOUS", + 'AL' => "G_UNICODE_BREAK_ALPHABETIC", + 'B2' => "G_UNICODE_BREAK_BEFORE_AND_AFTER", 'BA' => "G_UNICODE_BREAK_AFTER", 'BB' => "G_UNICODE_BREAK_BEFORE", - 'B2' => "G_UNICODE_BREAK_BEFORE_AND_AFTER", - 'HY' => "G_UNICODE_BREAK_HYPHEN", - 'NS' => "G_UNICODE_BREAK_NON_STARTER", - 'OP' => "G_UNICODE_BREAK_OPEN_PUNCTUATION", + 'BK' => "G_UNICODE_BREAK_MANDATORY", + 'CB' => "G_UNICODE_BREAK_CONTINGENT", + 'CJ' => "G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER", 'CL' => "G_UNICODE_BREAK_CLOSE_PUNCTUATION", - 'QU' => "G_UNICODE_BREAK_QUOTATION", + 'CM' => "G_UNICODE_BREAK_COMBINING_MARK", + 'CP' => "G_UNICODE_BREAK_CLOSE_PARANTHESIS", + 'CR' => "G_UNICODE_BREAK_CARRIAGE_RETURN", 'EX' => "G_UNICODE_BREAK_EXCLAMATION", + 'GL' => "G_UNICODE_BREAK_NON_BREAKING_GLUE", + 'H2' => "G_UNICODE_BREAK_HANGUL_LV_SYLLABLE", + 'H3' => "G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE", + 'HL' => "G_UNICODE_BREAK_HEBREW_LETTER", + 'HY' => "G_UNICODE_BREAK_HYPHEN", 'ID' => "G_UNICODE_BREAK_IDEOGRAPHIC", - 'NU' => "G_UNICODE_BREAK_NUMERIC", + 'IN' => "G_UNICODE_BREAK_INSEPARABLE", 'IS' => "G_UNICODE_BREAK_INFIX_SEPARATOR", - 'SY' => "G_UNICODE_BREAK_SYMBOL", - 'AL' => "G_UNICODE_BREAK_ALPHABETIC", - 'PR' => "G_UNICODE_BREAK_PREFIX", + 'JL' => "G_UNICODE_BREAK_HANGUL_L_JAMO", + 'JT' => "G_UNICODE_BREAK_HANGUL_T_JAMO", + 'JV' => "G_UNICODE_BREAK_HANGUL_V_JAMO", + 'LF' => "G_UNICODE_BREAK_LINE_FEED", + 'NL' => "G_UNICODE_BREAK_NEXT_LINE", + 'NS' => "G_UNICODE_BREAK_NON_STARTER", + 'NU' => "G_UNICODE_BREAK_NUMERIC", + 'OP' => "G_UNICODE_BREAK_OPEN_PUNCTUATION", 'PO' => "G_UNICODE_BREAK_POSTFIX", + 'PR' => "G_UNICODE_BREAK_PREFIX", + 'QU' => "G_UNICODE_BREAK_QUOTATION", + 'RI' => "G_UNICODE_BREAK_REGIONAL_INDICATOR", 'SA' => "G_UNICODE_BREAK_COMPLEX_CONTEXT", - 'AI' => "G_UNICODE_BREAK_AMBIGUOUS", - 'NL' => "G_UNICODE_BREAK_NEXT_LINE", + 'SG' => "G_UNICODE_BREAK_SURROGATE", + 'SP' => "G_UNICODE_BREAK_SPACE", + 'SY' => "G_UNICODE_BREAK_SYMBOL", 'WJ' => "G_UNICODE_BREAK_WORD_JOINER", - 'XX' => "G_UNICODE_BREAK_UNKNOWN" + 'XX' => "G_UNICODE_BREAK_UNKNOWN", + 'ZW' => "G_UNICODE_BREAK_ZERO_WIDTH_SPACE" ); # Title case mappings. @@ -153,8 +161,17 @@ my @special_cases; my @special_case_offsets; my $special_case_offset = 0; +# Scripts + +my @scripts; + +# East asian widths + +my @eawidths; + $do_decomp = 0; $do_props = 1; +$do_scripts = 1; if (@ARGV && $ARGV[0] eq '-decomp') { $do_decomp = 1; @@ -167,17 +184,47 @@ elsif (@ARGV && $ARGV[0] eq '-both') shift @ARGV; } -if (@ARGV != 6) { +if (@ARGV != 2) { $0 =~ s@.*/@@; - die "Usage: $0 [-decomp | -both] UNICODE-VERSION UnicodeData.txt LineBreak.txt SpecialCasing.txt CaseFolding.txt CompositionExclusions.txt\n"; + die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt Scripts.txt extracted/DerivedEastAsianWidth.txt \n\n"; } - + +my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt, + $scriptstxt, $derivedeastasianwidth); + +my $d = $ARGV[1]; +opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n"; +for my $f (readdir ($dir)) +{ + $unicodedatatxt = "$d/$f" if ($f =~ /^UnicodeData.*\.txt/); + $linebreaktxt = "$d/$f" if ($f =~ /^LineBreak.*\.txt/); + $specialcasingtxt = "$d/$f" if ($f =~ /^SpecialCasing.*\.txt/); + $casefoldingtxt = "$d/$f" if ($f =~ /^CaseFolding.*\.txt/); + $compositionexclusionstxt = "$d/$f" if ($f =~ /^CompositionExclusions.*\.txt/); + $scriptstxt = "$d/$f" if ($f =~ /^Scripts.*\.txt/); +} + +my $extd = $ARGV[1] . "/extracted"; +opendir (my $extdir, $extd) or die "Cannot open Unicode/extracted data dir $extd: $!\n"; +for my $f (readdir ($extdir)) +{ + $derivedeastasianwidthtxt = "$extd/$f" if ($f =~ /^DerivedEastAsianWidth.*\.txt/); +} + +defined $unicodedatatxt or die "Did not find UnicodeData file"; +defined $linebreaktxt or die "Did not find LineBreak file"; +defined $specialcasingtxt or die "Did not find SpecialCasing file"; +defined $casefoldingtxt or die "Did not find CaseFolding file"; +defined $compositionexclusionstxt or die "Did not find CompositionExclusions file"; +defined $scriptstxt or die "Did not find Scripts file"; +defined $derivedeastasianwidthtxt or die "Did not find DerivedEastAsianWidth file"; + print "Creating decomp table\n" if ($do_decomp); print "Creating property table\n" if ($do_props); -print "Composition exlusions from $ARGV[5]\n"; +print "Composition exlusions from $compositionexclusionstxt\n"; -open (INPUT, "< $ARGV[5]") || exit 1; +open (INPUT, "< $compositionexclusionstxt") || exit 1; while () { @@ -196,9 +243,9 @@ while () { close INPUT; -print "Unicode data from $ARGV[1]\n"; +print "Unicode data from $unicodedatatxt\n"; -open (INPUT, "< $ARGV[1]") || exit 1; +open (INPUT, "< $unicodedatatxt") || exit 1; # we save memory by skipping the huge empty area before U+E0000 my $pages_before_e0000; @@ -259,9 +306,9 @@ for (++$last_code; $last_code <= 0x10FFFF; ++$last_code) print "Creating line break table\n"; -print "Line break data from $ARGV[2]\n"; +print "Line break data from $linebreaktxt\n"; -open (INPUT, "< $ARGV[2]") || exit 1; +open (INPUT, "< $linebreaktxt") || exit 1; $last_code = -1; while () @@ -271,6 +318,7 @@ while () chop; next if /^#/; + next if /^$/; s/\s*#.*//; @@ -334,7 +382,7 @@ print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF); print "Reading special-casing table for case conversion\n"; -open (INPUT, "< $ARGV[3]") || exit 1; +open (INPUT, "< $specialcasingtxt") || exit 1; while () { @@ -393,7 +441,7 @@ while () close INPUT; -open (INPUT, "< $ARGV[4]") || exit 1; +open (INPUT, "< $casefoldingtxt") || exit 1; my $casefoldlen = 0; my @casefold; @@ -461,6 +509,51 @@ while () close INPUT; +print "Reading scripts\n"; + +open (INPUT, "< $scriptstxt") || exit 1; + +while () { + s/#.*//; + next if /^\s*$/; + if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) { + die "Cannot parse line: '$_'\n"; + } + + if (defined $2) { + push @scripts, [ hex $1, hex $2, uc $3 ]; + } else { + push @scripts, [ hex $1, hex $1, uc $3 ]; + } +} + +close INPUT; + +print "Reading derived east asian widths\n"; + +open (INPUT, "< $derivedeastasianwidthtxt") || exit 1; + +while () +{ + my ($start_code, $end_code); + + chop; + + s/#.*//; + next if /^\s*$/; + if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) { + die "Cannot parse line: '$_'\n"; + } + + if (defined $2) { + push @eawidths, [ hex $1, hex $2, $3 ]; + } else { + push @eawidths, [ hex $1, hex $1, $3 ]; + } +} + +close INPUT; + if ($do_props) { &print_tables ($last_code) } @@ -468,9 +561,12 @@ if ($do_decomp) { &print_decomp ($last_code); &output_composition_table; } - &print_line_break ($last_code); +if ($do_scripts) { + &print_scripts +} + exit 0; @@ -479,7 +575,6 @@ sub length_in_bytes { my ($string) = @_; - use bytes; return length $string; } @@ -637,6 +732,11 @@ sub print_tables &output_special_case_table (\*OUT); &output_casefold_table (\*OUT); + # + # And the widths tables + # + &output_width_tables (\*OUT); + print OUT "#endif /* CHARTABLES_H */\n"; close (OUT); @@ -714,9 +814,10 @@ sub escape { my ($string) = @_; - $string =~ s/(\C)/sprintf "\\x%02x",ord($1)/eg; + my $escaped = unpack("H*", $string); + $escaped =~ s/(.{2})/\\x$1/g; - return $string; + return $escaped; } # Returns the offset of $decomp in the offset string. Updates the @@ -843,6 +944,33 @@ sub print_decomp printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string; + print OUT "typedef struct\n{\n"; + print OUT " gunichar ch;\n"; + print OUT " gunichar a;\n"; + print OUT " gunichar b;\n"; + print OUT "} decomposition_step;\n\n"; + + # There's lots of room to optimize the following table... + print OUT "static const decomposition_step decomp_step_table[] =\n{\n"; + $first = 1; + my @steps = (); + for ($count = 0; $count <= $last; ++$count) + { + if ((defined $decompositions[$count]) && (!$decompose_compat[$count])) + { + print OUT ",\n" + if ! $first; + $first = 0; + my @list; + @list = (split(' ', $decompositions[$count]), "0"); + printf OUT qq( { 0x%05x, 0x%05x, 0x%05x }), $count, hex($list[0]), hex($list[1]); + # don't include 1:1 in the compose table + push @steps, [ ($count, hex($list[0]), hex($list[1])) ] + if hex($list[1]) + } + } + print OUT "\n};\n\n"; + print OUT "#endif /* DECOMP_H */\n"; printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out; @@ -865,6 +993,9 @@ sub print_line_break print OUT "#ifndef BREAKTABLES_H\n"; print OUT "#define BREAKTABLES_H\n\n"; + print OUT "#include \n"; + print OUT "#include \n\n"; + print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n"; printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last; @@ -971,8 +1102,8 @@ sub add_special_case (map { hex ($_) } split /\s+/, $field1), 0, (map { hex ($_) } split /\s+/, $field2)); - $result = ""; + $result = ""; for $value (@values) { $result .= pack ("U", $value); # to utf-8 @@ -1052,6 +1183,10 @@ sub output_composition_table @values = map { hex ($_) } split /\s+/, $compositions{$code}; # non-starters + if ($cclass[$code]) { + delete $compositions{$code}; + next; + } if ($cclass[$values[0]]) { delete $compositions{$code}; next; @@ -1152,6 +1287,8 @@ sub output_composition_table $last = $code if $code > $last; } + printf OUT "#define COMPOSE_TABLE_LAST %d\n\n", $last / 256; + # Output lookup table my @row; @@ -1163,24 +1300,20 @@ sub output_composition_table } printf OUT "\n};\n\n"; - print OUT "static const gint16 compose_table[256] = {\n"; + print OUT "static const gint16 compose_table[COMPOSE_TABLE_LAST + 1] = {\n"; for (my $count = 0; $count <= $last; $count += 256) { print OUT ",\n" if $count > 0; print OUT " ", $row[$count / 256]; + $bytes_out += 2; } print OUT "\n};\n\n"; - $bytes_out += 256 * 2; - # Output first singletons - print OUT "static const guint16 compose_first_single[][2] = {\n"; + print OUT "static const gunichar compose_first_single[][2] = {\n"; $i = 0; for $record (@first_singletons) { - if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) { - die "time to switch compose_first_single to gunichar" ; - } print OUT ",\n" if $i++ > 0; printf OUT " { %#06x, %#06x }", $record->[1], $record->[2]; } @@ -1190,12 +1323,9 @@ sub output_composition_table # Output second singletons - print OUT "static const guint16 compose_second_single[][2] = {\n"; + print OUT "static const gunichar compose_second_single[][2] = {\n"; $i = 0; for $record (@second_singletons) { - if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) { - die "time to switch compose_second_single to gunichar"; - } print OUT ",\n" if $i++ > 0; printf OUT " { %#06x, %#06x }", $record->[1], $record->[2]; } @@ -1274,5 +1404,146 @@ EOT printf "Generated %d bytes for casefold table\n", $recordlen * @casefold; } - +sub output_one_width_table +{ + my ($out, $name, $wpe) = @_; + my $start; + my $end; + my $wp; + my $rex; + + print $out "static const struct Interval g_unicode_width_table_${name}[] = {\n"; + + $rex = qr/$wpe/; + + for (my $i = 0; $i <= $#eawidths; $i++) { + $start = $eawidths[$i]->[0]; + $end = $eawidths[$i]->[1]; + $wp = $eawidths[$i]->[2]; + + next if ($wp !~ $rex); + + while ($i <= $#eawidths - 1 && + $eawidths[$i + 1]->[0] == $end + 1 && + ($eawidths[$i + 1]->[2] =~ $rex)) { + $i++; + $end = $eawidths[$i]->[1]; + } + + printf $out "{0x%04X, 0x%04X},\n", $start, $end; + } + + printf $out "};\n\n"; +} + +sub output_width_tables +{ + my $out = shift; + + @eawidths = sort { $a->[0] <=> $b->[0] } @eawidths; + + print $out <gscripttable.h" or die "Cannot open gscripttable.h: $!\n"; + + print OUT<[0] <=> $b->[0] } @scripts; + + $easy_range = 0x2000; + + print OUT< $end) { + $start = $scripts[$i]->[0]; + $end = $scripts[$i]->[1]; + $script = $scripts[$i]->[2]; + $i++; + } + + if ($c < $start) { + printf OUT " G_UNICODE_SCRIPT_UNKNOWN,"; + } else { + printf OUT " G_UNICODE_SCRIPT_%s,", $script; + } + } + + if ($end >= $easy_range) { + $i--; + $scripts[$i]->[0] = $easy_range; + } + + print OUT<[0]; + $end = $scripts[$i]->[1]; + $script = $scripts[$i]->[2]; + + while ($i <= $#scripts - 1 && + $scripts[$i + 1]->[0] == $end + 1 && + $scripts[$i + 1]->[2] eq $script) { + $i++; + $end = $scripts[$i]->[1]; + } + printf OUT " { %#06x, %5d, G_UNICODE_SCRIPT_%s },\n", $start, $end - $start + 1, $script; + } + + printf OUT<