X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=glib%2Fgen-unicode-tables.pl;h=ebcb4a44c6211c51adb5596d8627c86938a77e40;hb=13e15733f38a40c6ef6a1baede91cce81c86ebaa;hp=c3623eb932003b83f8bdd12a09d03f3a96d1a3ed;hpb=761a1841eebedefeb7dc3675e442a56309f347c9;p=platform%2Fupstream%2Fglib.git diff --git a/glib/gen-unicode-tables.pl b/glib/gen-unicode-tables.pl index c3623eb..ebcb4a4 100755 --- a/glib/gen-unicode-tables.pl +++ b/glib/gen-unicode-tables.pl @@ -14,9 +14,7 @@ # GNU General Public License for more details. # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA -# 02111-1307, USA. +# along with this program; if not, see . # Contributer(s): # Andrew Taylor @@ -79,7 +77,7 @@ $FOLDING_MAPPING = 2; 'Ll' => "G_UNICODE_LOWERCASE_LETTER", 'Lt' => "G_UNICODE_TITLECASE_LETTER", 'Mn' => "G_UNICODE_NON_SPACING_MARK", - 'Mc' => "G_UNICODE_COMBINING_MARK", + 'Mc' => "G_UNICODE_SPACING_MARK", 'Me' => "G_UNICODE_ENCLOSING_MARK", 'Nd' => "G_UNICODE_DECIMAL_NUMBER", 'Nl' => "G_UNICODE_LETTER_NUMBER", @@ -118,6 +116,7 @@ $FOLDING_MAPPING = 2; 'BB' => "G_UNICODE_BREAK_BEFORE", 'BK' => "G_UNICODE_BREAK_MANDATORY", 'CB' => "G_UNICODE_BREAK_CONTINGENT", + 'CJ' => "G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER", 'CL' => "G_UNICODE_BREAK_CLOSE_PUNCTUATION", 'CM' => "G_UNICODE_BREAK_COMBINING_MARK", 'CP' => "G_UNICODE_BREAK_CLOSE_PARANTHESIS", @@ -126,6 +125,7 @@ $FOLDING_MAPPING = 2; 'GL' => "G_UNICODE_BREAK_NON_BREAKING_GLUE", 'H2' => "G_UNICODE_BREAK_HANGUL_LV_SYLLABLE", 'H3' => "G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE", + 'HL' => "G_UNICODE_BREAK_HEBREW_LETTER", 'HY' => "G_UNICODE_BREAK_HYPHEN", 'ID' => "G_UNICODE_BREAK_IDEOGRAPHIC", 'IN' => "G_UNICODE_BREAK_INSEPARABLE", @@ -141,6 +141,7 @@ $FOLDING_MAPPING = 2; 'PO' => "G_UNICODE_BREAK_POSTFIX", 'PR' => "G_UNICODE_BREAK_PREFIX", 'QU' => "G_UNICODE_BREAK_QUOTATION", + 'RI' => "G_UNICODE_BREAK_REGIONAL_INDICATOR", 'SA' => "G_UNICODE_BREAK_COMPLEX_CONTEXT", 'SG' => "G_UNICODE_BREAK_SURROGATE", 'SP' => "G_UNICODE_BREAK_SPACE", @@ -160,8 +161,17 @@ my @special_cases; my @special_case_offsets; my $special_case_offset = 0; +# Scripts + +my @scripts; + +# East asian widths + +my @eawidths; + $do_decomp = 0; $do_props = 1; +$do_scripts = 1; if (@ARGV && $ARGV[0] eq '-decomp') { $do_decomp = 1; @@ -176,10 +186,11 @@ elsif (@ARGV && $ARGV[0] eq '-both') if (@ARGV != 2) { $0 =~ s@.*/@@; - die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt\n\n"; + die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt Scripts.txt extracted/DerivedEastAsianWidth.txt \n\n"; } -my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt); +my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt, + $scriptstxt, $derivedeastasianwidth); my $d = $ARGV[1]; opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n"; @@ -190,6 +201,14 @@ for my $f (readdir ($dir)) $specialcasingtxt = "$d/$f" if ($f =~ /^SpecialCasing.*\.txt/); $casefoldingtxt = "$d/$f" if ($f =~ /^CaseFolding.*\.txt/); $compositionexclusionstxt = "$d/$f" if ($f =~ /^CompositionExclusions.*\.txt/); + $scriptstxt = "$d/$f" if ($f =~ /^Scripts.*\.txt/); +} + +my $extd = $ARGV[1] . "/extracted"; +opendir (my $extdir, $extd) or die "Cannot open Unicode/extracted data dir $extd: $!\n"; +for my $f (readdir ($extdir)) +{ + $derivedeastasianwidthtxt = "$extd/$f" if ($f =~ /^DerivedEastAsianWidth.*\.txt/); } defined $unicodedatatxt or die "Did not find UnicodeData file"; @@ -197,6 +216,8 @@ defined $linebreaktxt or die "Did not find LineBreak file"; defined $specialcasingtxt or die "Did not find SpecialCasing file"; defined $casefoldingtxt or die "Did not find CaseFolding file"; defined $compositionexclusionstxt or die "Did not find CompositionExclusions file"; +defined $scriptstxt or die "Did not find Scripts file"; +defined $derivedeastasianwidthtxt or die "Did not find DerivedEastAsianWidth file"; print "Creating decomp table\n" if ($do_decomp); print "Creating property table\n" if ($do_props); @@ -488,6 +509,51 @@ while () close INPUT; +print "Reading scripts\n"; + +open (INPUT, "< $scriptstxt") || exit 1; + +while () { + s/#.*//; + next if /^\s*$/; + if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) { + die "Cannot parse line: '$_'\n"; + } + + if (defined $2) { + push @scripts, [ hex $1, hex $2, uc $3 ]; + } else { + push @scripts, [ hex $1, hex $1, uc $3 ]; + } +} + +close INPUT; + +print "Reading derived east asian widths\n"; + +open (INPUT, "< $derivedeastasianwidthtxt") || exit 1; + +while () +{ + my ($start_code, $end_code); + + chop; + + s/#.*//; + next if /^\s*$/; + if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) { + die "Cannot parse line: '$_'\n"; + } + + if (defined $2) { + push @eawidths, [ hex $1, hex $2, $3 ]; + } else { + push @eawidths, [ hex $1, hex $1, $3 ]; + } +} + +close INPUT; + if ($do_props) { &print_tables ($last_code) } @@ -495,9 +561,12 @@ if ($do_decomp) { &print_decomp ($last_code); &output_composition_table; } - &print_line_break ($last_code); +if ($do_scripts) { + &print_scripts +} + exit 0; @@ -663,6 +732,11 @@ sub print_tables &output_special_case_table (\*OUT); &output_casefold_table (\*OUT); + # + # And the widths tables + # + &output_width_tables (\*OUT); + print OUT "#endif /* CHARTABLES_H */\n"; close (OUT); @@ -876,6 +950,7 @@ sub print_decomp print OUT " gunichar b;\n"; print OUT "} decomposition_step;\n\n"; + # There's lots of room to optimize the following table... print OUT "static const decomposition_step decomp_step_table[] =\n{\n"; $first = 1; my @steps = (); @@ -896,20 +971,6 @@ sub print_decomp } print OUT "\n};\n\n"; - print OUT "static const decomposition_step comp_step_table[] =\n{\n"; - my @inverted; - @inverted = sort { @{$a}[1] <=> @{$b}[1] || - @{$a}[2] <=> @{$b}[2] } @steps; - $first = 1; - foreach my $i ( 0 .. $#inverted ) - { - print OUT ",\n" - if ! $first; - $first = 0; - printf OUT qq( { 0x%05x, 0x%05x, 0x%05x }), $inverted[$i][0], $inverted[$i][1], $inverted[$i][2]; - } - print OUT "\n};\n\n"; - print OUT "#endif /* DECOMP_H */\n"; printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out; @@ -1122,6 +1183,10 @@ sub output_composition_table @values = map { hex ($_) } split /\s+/, $compositions{$code}; # non-starters + if ($cclass[$code]) { + delete $compositions{$code}; + next; + } if ($cclass[$values[0]]) { delete $compositions{$code}; next; @@ -1258,12 +1323,9 @@ sub output_composition_table # Output second singletons - print OUT "static const guint16 compose_second_single[][2] = {\n"; + print OUT "static const gunichar compose_second_single[][2] = {\n"; $i = 0; for $record (@second_singletons) { - if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) { - die "time to switch compose_second_single to gunichar"; - } print OUT ",\n" if $i++ > 0; printf OUT " { %#06x, %#06x }", $record->[1], $record->[2]; } @@ -1342,5 +1404,146 @@ EOT printf "Generated %d bytes for casefold table\n", $recordlen * @casefold; } - +sub output_one_width_table +{ + my ($out, $name, $wpe) = @_; + my $start; + my $end; + my $wp; + my $rex; + + print $out "static const struct Interval g_unicode_width_table_${name}[] = {\n"; + + $rex = qr/$wpe/; + + for (my $i = 0; $i <= $#eawidths; $i++) { + $start = $eawidths[$i]->[0]; + $end = $eawidths[$i]->[1]; + $wp = $eawidths[$i]->[2]; + + next if ($wp !~ $rex); + + while ($i <= $#eawidths - 1 && + $eawidths[$i + 1]->[0] == $end + 1 && + ($eawidths[$i + 1]->[2] =~ $rex)) { + $i++; + $end = $eawidths[$i]->[1]; + } + + printf $out "{0x%04X, 0x%04X},\n", $start, $end; + } + + printf $out "};\n\n"; +} + +sub output_width_tables +{ + my $out = shift; + + @eawidths = sort { $a->[0] <=> $b->[0] } @eawidths; + + print $out <gscripttable.h" or die "Cannot open gscripttable.h: $!\n"; + + print OUT<[0] <=> $b->[0] } @scripts; + + $easy_range = 0x2000; + print OUT< $end) { + $start = $scripts[$i]->[0]; + $end = $scripts[$i]->[1]; + $script = $scripts[$i]->[2]; + $i++; + } + + if ($c < $start) { + printf OUT " G_UNICODE_SCRIPT_UNKNOWN,"; + } else { + printf OUT " G_UNICODE_SCRIPT_%s,", $script; + } + } + + if ($end >= $easy_range) { + $i--; + $scripts[$i]->[0] = $easy_range; + } + + print OUT<[0]; + $end = $scripts[$i]->[1]; + $script = $scripts[$i]->[2]; + + while ($i <= $#scripts - 1 && + $scripts[$i + 1]->[0] == $end + 1 && + $scripts[$i + 1]->[2] eq $script) { + $i++; + $end = $scripts[$i]->[1]; + } + printf OUT " { %#06x, %5d, G_UNICODE_SCRIPT_%s },\n", $start, $end - $start + 1, $script; + } + + printf OUT<