X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=glib%2Fgen-unicode-tables.pl;h=ebcb4a44c6211c51adb5596d8627c86938a77e40;hb=4454b815367831a71b1ae00b0182b5b389a78df2;hp=5368adfc60515b5fc69d912645aa343659f9e353;hpb=63adeda0861a26b38ec0adc76255666554c18951;p=platform%2Fupstream%2Fglib.git diff --git a/glib/gen-unicode-tables.pl b/glib/gen-unicode-tables.pl index 5368adf..ebcb4a4 100755 --- a/glib/gen-unicode-tables.pl +++ b/glib/gen-unicode-tables.pl @@ -14,9 +14,7 @@ # GNU General Public License for more details. # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA -# 02111-1307, USA. +# along with this program; if not, see . # Contributer(s): # Andrew Taylor @@ -33,6 +31,8 @@ # we use some perl unicode features require 5.006; +use bytes; + use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION); @@ -77,7 +77,7 @@ $FOLDING_MAPPING = 2; 'Ll' => "G_UNICODE_LOWERCASE_LETTER", 'Lt' => "G_UNICODE_TITLECASE_LETTER", 'Mn' => "G_UNICODE_NON_SPACING_MARK", - 'Mc' => "G_UNICODE_COMBINING_MARK", + 'Mc' => "G_UNICODE_SPACING_MARK", 'Me' => "G_UNICODE_ENCLOSING_MARK", 'Nd' => "G_UNICODE_DECIMAL_NUMBER", 'Nl' => "G_UNICODE_LETTER_NUMBER", @@ -109,42 +109,46 @@ $FOLDING_MAPPING = 2; %break_mappings = ( - 'BK' => "G_UNICODE_BREAK_MANDATORY", - 'CR' => "G_UNICODE_BREAK_CARRIAGE_RETURN", - 'LF' => "G_UNICODE_BREAK_LINE_FEED", - 'CM' => "G_UNICODE_BREAK_COMBINING_MARK", - 'SG' => "G_UNICODE_BREAK_SURROGATE", - 'ZW' => "G_UNICODE_BREAK_ZERO_WIDTH_SPACE", - 'IN' => "G_UNICODE_BREAK_INSEPARABLE", - 'GL' => "G_UNICODE_BREAK_NON_BREAKING_GLUE", - 'CB' => "G_UNICODE_BREAK_CONTINGENT", - 'SP' => "G_UNICODE_BREAK_SPACE", + 'AI' => "G_UNICODE_BREAK_AMBIGUOUS", + 'AL' => "G_UNICODE_BREAK_ALPHABETIC", + 'B2' => "G_UNICODE_BREAK_BEFORE_AND_AFTER", 'BA' => "G_UNICODE_BREAK_AFTER", 'BB' => "G_UNICODE_BREAK_BEFORE", - 'B2' => "G_UNICODE_BREAK_BEFORE_AND_AFTER", - 'HY' => "G_UNICODE_BREAK_HYPHEN", - 'NS' => "G_UNICODE_BREAK_NON_STARTER", - 'OP' => "G_UNICODE_BREAK_OPEN_PUNCTUATION", + 'BK' => "G_UNICODE_BREAK_MANDATORY", + 'CB' => "G_UNICODE_BREAK_CONTINGENT", + 'CJ' => "G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER", 'CL' => "G_UNICODE_BREAK_CLOSE_PUNCTUATION", - 'QU' => "G_UNICODE_BREAK_QUOTATION", + 'CM' => "G_UNICODE_BREAK_COMBINING_MARK", + 'CP' => "G_UNICODE_BREAK_CLOSE_PARANTHESIS", + 'CR' => "G_UNICODE_BREAK_CARRIAGE_RETURN", 'EX' => "G_UNICODE_BREAK_EXCLAMATION", + 'GL' => "G_UNICODE_BREAK_NON_BREAKING_GLUE", + 'H2' => "G_UNICODE_BREAK_HANGUL_LV_SYLLABLE", + 'H3' => "G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE", + 'HL' => "G_UNICODE_BREAK_HEBREW_LETTER", + 'HY' => "G_UNICODE_BREAK_HYPHEN", 'ID' => "G_UNICODE_BREAK_IDEOGRAPHIC", - 'NU' => "G_UNICODE_BREAK_NUMERIC", + 'IN' => "G_UNICODE_BREAK_INSEPARABLE", 'IS' => "G_UNICODE_BREAK_INFIX_SEPARATOR", - 'SY' => "G_UNICODE_BREAK_SYMBOL", - 'AL' => "G_UNICODE_BREAK_ALPHABETIC", - 'PR' => "G_UNICODE_BREAK_PREFIX", + 'JL' => "G_UNICODE_BREAK_HANGUL_L_JAMO", + 'JT' => "G_UNICODE_BREAK_HANGUL_T_JAMO", + 'JV' => "G_UNICODE_BREAK_HANGUL_V_JAMO", + 'LF' => "G_UNICODE_BREAK_LINE_FEED", + 'NL' => "G_UNICODE_BREAK_NEXT_LINE", + 'NS' => "G_UNICODE_BREAK_NON_STARTER", + 'NU' => "G_UNICODE_BREAK_NUMERIC", + 'OP' => "G_UNICODE_BREAK_OPEN_PUNCTUATION", 'PO' => "G_UNICODE_BREAK_POSTFIX", + 'PR' => "G_UNICODE_BREAK_PREFIX", + 'QU' => "G_UNICODE_BREAK_QUOTATION", + 'RI' => "G_UNICODE_BREAK_REGIONAL_INDICATOR", 'SA' => "G_UNICODE_BREAK_COMPLEX_CONTEXT", - 'AI' => "G_UNICODE_BREAK_AMBIGUOUS", - 'NL' => "G_UNICODE_BREAK_NEXT_LINE", + 'SG' => "G_UNICODE_BREAK_SURROGATE", + 'SP' => "G_UNICODE_BREAK_SPACE", + 'SY' => "G_UNICODE_BREAK_SYMBOL", 'WJ' => "G_UNICODE_BREAK_WORD_JOINER", 'XX' => "G_UNICODE_BREAK_UNKNOWN", - 'JL' => "G_UNICODE_BREAK_HANGUL_L_JAMO", - 'JV' => "G_UNICODE_BREAK_HANGUL_V_JAMO", - 'JT' => "G_UNICODE_BREAK_HANGUL_T_JAMO", - 'H2' => "G_UNICODE_BREAK_HANGUL_LV_SYLLABLE", - 'H3' => "G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE" + 'ZW' => "G_UNICODE_BREAK_ZERO_WIDTH_SPACE" ); # Title case mappings. @@ -157,8 +161,17 @@ my @special_cases; my @special_case_offsets; my $special_case_offset = 0; +# Scripts + +my @scripts; + +# East asian widths + +my @eawidths; + $do_decomp = 0; $do_props = 1; +$do_scripts = 1; if (@ARGV && $ARGV[0] eq '-decomp') { $do_decomp = 1; @@ -173,20 +186,29 @@ elsif (@ARGV && $ARGV[0] eq '-both') if (@ARGV != 2) { $0 =~ s@.*/@@; - die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt, BidiMirroring.txt\n\n"; + die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt Scripts.txt extracted/DerivedEastAsianWidth.txt \n\n"; } -my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt); +my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt, + $scriptstxt, $derivedeastasianwidth); my $d = $ARGV[1]; opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n"; for my $f (readdir ($dir)) { - $unicodedatatxt = "$d/$f" if ($f =~ /UnicodeData.*\.txt/); - $linebreaktxt = "$d/$f" if ($f =~ /LineBreak.*\.txt/); - $specialcasingtxt = "$d/$f" if ($f =~ /SpecialCasing.*\.txt/); - $casefoldingtxt = "$d/$f" if ($f =~ /CaseFolding.*\.txt/); - $compositionexclusionstxt = "$d/$f" if ($f =~ /CompositionExclusions.*\.txt/); + $unicodedatatxt = "$d/$f" if ($f =~ /^UnicodeData.*\.txt/); + $linebreaktxt = "$d/$f" if ($f =~ /^LineBreak.*\.txt/); + $specialcasingtxt = "$d/$f" if ($f =~ /^SpecialCasing.*\.txt/); + $casefoldingtxt = "$d/$f" if ($f =~ /^CaseFolding.*\.txt/); + $compositionexclusionstxt = "$d/$f" if ($f =~ /^CompositionExclusions.*\.txt/); + $scriptstxt = "$d/$f" if ($f =~ /^Scripts.*\.txt/); +} + +my $extd = $ARGV[1] . "/extracted"; +opendir (my $extdir, $extd) or die "Cannot open Unicode/extracted data dir $extd: $!\n"; +for my $f (readdir ($extdir)) +{ + $derivedeastasianwidthtxt = "$extd/$f" if ($f =~ /^DerivedEastAsianWidth.*\.txt/); } defined $unicodedatatxt or die "Did not find UnicodeData file"; @@ -194,6 +216,8 @@ defined $linebreaktxt or die "Did not find LineBreak file"; defined $specialcasingtxt or die "Did not find SpecialCasing file"; defined $casefoldingtxt or die "Did not find CaseFolding file"; defined $compositionexclusionstxt or die "Did not find CompositionExclusions file"; +defined $scriptstxt or die "Did not find Scripts file"; +defined $derivedeastasianwidthtxt or die "Did not find DerivedEastAsianWidth file"; print "Creating decomp table\n" if ($do_decomp); print "Creating property table\n" if ($do_props); @@ -294,6 +318,7 @@ while () chop; next if /^#/; + next if /^$/; s/\s*#.*//; @@ -484,6 +509,51 @@ while () close INPUT; +print "Reading scripts\n"; + +open (INPUT, "< $scriptstxt") || exit 1; + +while () { + s/#.*//; + next if /^\s*$/; + if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) { + die "Cannot parse line: '$_'\n"; + } + + if (defined $2) { + push @scripts, [ hex $1, hex $2, uc $3 ]; + } else { + push @scripts, [ hex $1, hex $1, uc $3 ]; + } +} + +close INPUT; + +print "Reading derived east asian widths\n"; + +open (INPUT, "< $derivedeastasianwidthtxt") || exit 1; + +while () +{ + my ($start_code, $end_code); + + chop; + + s/#.*//; + next if /^\s*$/; + if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) { + die "Cannot parse line: '$_'\n"; + } + + if (defined $2) { + push @eawidths, [ hex $1, hex $2, $3 ]; + } else { + push @eawidths, [ hex $1, hex $1, $3 ]; + } +} + +close INPUT; + if ($do_props) { &print_tables ($last_code) } @@ -491,9 +561,12 @@ if ($do_decomp) { &print_decomp ($last_code); &output_composition_table; } - &print_line_break ($last_code); +if ($do_scripts) { + &print_scripts +} + exit 0; @@ -502,7 +575,6 @@ sub length_in_bytes { my ($string) = @_; - use bytes; return length $string; } @@ -660,6 +732,11 @@ sub print_tables &output_special_case_table (\*OUT); &output_casefold_table (\*OUT); + # + # And the widths tables + # + &output_width_tables (\*OUT); + print OUT "#endif /* CHARTABLES_H */\n"; close (OUT); @@ -867,6 +944,33 @@ sub print_decomp printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string; + print OUT "typedef struct\n{\n"; + print OUT " gunichar ch;\n"; + print OUT " gunichar a;\n"; + print OUT " gunichar b;\n"; + print OUT "} decomposition_step;\n\n"; + + # There's lots of room to optimize the following table... + print OUT "static const decomposition_step decomp_step_table[] =\n{\n"; + $first = 1; + my @steps = (); + for ($count = 0; $count <= $last; ++$count) + { + if ((defined $decompositions[$count]) && (!$decompose_compat[$count])) + { + print OUT ",\n" + if ! $first; + $first = 0; + my @list; + @list = (split(' ', $decompositions[$count]), "0"); + printf OUT qq( { 0x%05x, 0x%05x, 0x%05x }), $count, hex($list[0]), hex($list[1]); + # don't include 1:1 in the compose table + push @steps, [ ($count, hex($list[0]), hex($list[1])) ] + if hex($list[1]) + } + } + print OUT "\n};\n\n"; + print OUT "#endif /* DECOMP_H */\n"; printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out; @@ -889,6 +993,9 @@ sub print_line_break print OUT "#ifndef BREAKTABLES_H\n"; print OUT "#define BREAKTABLES_H\n\n"; + print OUT "#include \n"; + print OUT "#include \n\n"; + print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n"; printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last; @@ -995,8 +1102,8 @@ sub add_special_case (map { hex ($_) } split /\s+/, $field1), 0, (map { hex ($_) } split /\s+/, $field2)); - $result = ""; + $result = ""; for $value (@values) { $result .= pack ("U", $value); # to utf-8 @@ -1076,6 +1183,10 @@ sub output_composition_table @values = map { hex ($_) } split /\s+/, $compositions{$code}; # non-starters + if ($cclass[$code]) { + delete $compositions{$code}; + next; + } if ($cclass[$values[0]]) { delete $compositions{$code}; next; @@ -1200,12 +1311,9 @@ sub output_composition_table # Output first singletons - print OUT "static const guint16 compose_first_single[][2] = {\n"; + print OUT "static const gunichar compose_first_single[][2] = {\n"; $i = 0; for $record (@first_singletons) { - if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) { - die "time to switch compose_first_single to gunichar" ; - } print OUT ",\n" if $i++ > 0; printf OUT " { %#06x, %#06x }", $record->[1], $record->[2]; } @@ -1215,12 +1323,9 @@ sub output_composition_table # Output second singletons - print OUT "static const guint16 compose_second_single[][2] = {\n"; + print OUT "static const gunichar compose_second_single[][2] = {\n"; $i = 0; for $record (@second_singletons) { - if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) { - die "time to switch compose_second_single to gunichar"; - } print OUT ",\n" if $i++ > 0; printf OUT " { %#06x, %#06x }", $record->[1], $record->[2]; } @@ -1299,5 +1404,146 @@ EOT printf "Generated %d bytes for casefold table\n", $recordlen * @casefold; } - +sub output_one_width_table +{ + my ($out, $name, $wpe) = @_; + my $start; + my $end; + my $wp; + my $rex; + + print $out "static const struct Interval g_unicode_width_table_${name}[] = {\n"; + + $rex = qr/$wpe/; + + for (my $i = 0; $i <= $#eawidths; $i++) { + $start = $eawidths[$i]->[0]; + $end = $eawidths[$i]->[1]; + $wp = $eawidths[$i]->[2]; + + next if ($wp !~ $rex); + + while ($i <= $#eawidths - 1 && + $eawidths[$i + 1]->[0] == $end + 1 && + ($eawidths[$i + 1]->[2] =~ $rex)) { + $i++; + $end = $eawidths[$i]->[1]; + } + + printf $out "{0x%04X, 0x%04X},\n", $start, $end; + } + + printf $out "};\n\n"; +} + +sub output_width_tables +{ + my $out = shift; + + @eawidths = sort { $a->[0] <=> $b->[0] } @eawidths; + + print $out <gscripttable.h" or die "Cannot open gscripttable.h: $!\n"; + + print OUT<[0] <=> $b->[0] } @scripts; + + $easy_range = 0x2000; + + print OUT< $end) { + $start = $scripts[$i]->[0]; + $end = $scripts[$i]->[1]; + $script = $scripts[$i]->[2]; + $i++; + } + + if ($c < $start) { + printf OUT " G_UNICODE_SCRIPT_UNKNOWN,"; + } else { + printf OUT " G_UNICODE_SCRIPT_%s,", $script; + } + } + + if ($end >= $easy_range) { + $i--; + $scripts[$i]->[0] = $easy_range; + } + print OUT<[0]; + $end = $scripts[$i]->[1]; + $script = $scripts[$i]->[2]; + + while ($i <= $#scripts - 1 && + $scripts[$i + 1]->[0] == $end + 1 && + $scripts[$i + 1]->[2] eq $script) { + $i++; + $end = $scripts[$i]->[1]; + } + printf OUT " { %#06x, %5d, G_UNICODE_SCRIPT_%s },\n", $start, $end - $start + 1, $script; + } + + printf OUT<