From 7e3d32b7053b47ca7feecf185abac96b619770c2 Mon Sep 17 00:00:00 2001 From: Christian Persch Date: Sat, 3 May 2014 18:49:07 +0200 Subject: [PATCH] unicode: Move gscripttable.h generation into main script So we just have to run one script when updating the unicode data, not two. --- glib/gen-script-table.pl | 119 ----------------------------------------- glib/gen-unicode-tables.pl | 129 +++++++++++++++++++++++++++++++++++++++++++-- glib/gscripttable.h | 14 ++--- 3 files changed, 133 insertions(+), 129 deletions(-) delete mode 100755 glib/gen-script-table.pl diff --git a/glib/gen-script-table.pl b/glib/gen-script-table.pl deleted file mode 100755 index 27268ab..0000000 --- a/glib/gen-script-table.pl +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/perl -w -# -# Script to convert http://www.unicode.org/Public/UNIDATA/Scripts.txt -# into a machine-readable table. -# -###################################################################### - -if (@ARGV != 1) { - die "Usage: gen-script-table.pl Scripts.txt > gscripttable.h\n"; -} - -open IN, $ARGV[0] || die "Cannot open $ARGV[0]: $!\n"; - -my @ranges; -my $file; -my $easy_range; -my $i; -my $start; -my $end; -my $script; - - -while () { - if (/^\#\s+(Scripts-.*.txt)/) { - $file = $1; - } - - s/#.*//; - next if /^\s*$/; - if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) { - die "Cannot parse line: '$_'\n"; - } - - if (defined $2) { - push @ranges, [ hex $1, hex $2, uc $3 ]; - } else { - push @ranges, [ hex $1, hex $1, uc $3 ]; - } -} - -@ranges = sort { $a->[0] <=> $b->[0] } @ranges; -$date = gmtime; - -print <<"EOT"; -/* gscripttable.h: Generated by gen-script-table.pl - * - * Date: $date - * Source: $file - * - * Do not edit. - */ - -EOT - -$easy_range = 0x2000; - -print <<"EOT"; -#define G_EASY_SCRIPTS_RANGE $easy_range - -static const guchar g_script_easy_table[$easy_range] = { -EOT - -$i = 0; -$end = -1; - -for (my $c = 0; $c < $easy_range; $c++) { - - if ($c % 3 == 0) { - printf "\n "; - } - - if ($c > $end) { - $start = $ranges[$i]->[0]; - $end = $ranges[$i]->[1]; - $script = $ranges[$i]->[2]; - $i++; - } - - if ($c < $start) { - printf " G_UNICODE_SCRIPT_UNKNOWN,"; - } else { - printf " G_UNICODE_SCRIPT_%s,", $script; - } -} - -if ($end >= $easy_range) { - $i--; - $ranges[$i]->[0] = $easy_range; -} - - -print <<"EOT"; - -}; - -static const struct { - gunichar start; - guint16 chars; - guint16 script; -} g_script_table[] = { -EOT - -for (; $i <= $#ranges; $i++) { - $start = $ranges[$i]->[0]; - $end = $ranges[$i]->[1]; - $script = $ranges[$i]->[2]; - - while ($i <= $#ranges - 1 && - $ranges[$i + 1]->[0] == $end + 1 && - $ranges[$i + 1]->[2] eq $script) { - $i++; - $end = $ranges[$i]->[1]; - } - - printf " { %#06x, %5d, G_UNICODE_SCRIPT_%s },\n", $start, $end - $start + 1, $script; -} - -printf "};\n"; - diff --git a/glib/gen-unicode-tables.pl b/glib/gen-unicode-tables.pl index e6520b8..46c4b27 100755 --- a/glib/gen-unicode-tables.pl +++ b/glib/gen-unicode-tables.pl @@ -161,12 +161,17 @@ my @special_cases; my @special_case_offsets; my $special_case_offset = 0; +# Scripts + +my @scripts; + # East asian widths my @eawidths; $do_decomp = 0; $do_props = 1; +$do_scripts = 1; if (@ARGV && $ARGV[0] eq '-decomp') { $do_decomp = 1; @@ -181,11 +186,11 @@ elsif (@ARGV && $ARGV[0] eq '-both') if (@ARGV != 2) { $0 =~ s@.*/@@; - die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt extracted/DerivedEastAsianWidth.txt \n\n"; + die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt Scripts.txt extracted/DerivedEastAsianWidth.txt \n\n"; } my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt, - $derivedeastasianwidth); + $scriptstxt, $derivedeastasianwidth); my $d = $ARGV[1]; opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n"; @@ -196,6 +201,7 @@ for my $f (readdir ($dir)) $specialcasingtxt = "$d/$f" if ($f =~ /^SpecialCasing.*\.txt/); $casefoldingtxt = "$d/$f" if ($f =~ /^CaseFolding.*\.txt/); $compositionexclusionstxt = "$d/$f" if ($f =~ /^CompositionExclusions.*\.txt/); + $scriptstxt = "$d/$f" if ($f =~ /^Scripts.*\.txt/); } my $extd = $ARGV[1] . "/extracted"; @@ -210,6 +216,7 @@ defined $linebreaktxt or die "Did not find LineBreak file"; defined $specialcasingtxt or die "Did not find SpecialCasing file"; defined $casefoldingtxt or die "Did not find CaseFolding file"; defined $compositionexclusionstxt or die "Did not find CompositionExclusions file"; +defined $scriptstxt or die "Did not find Scripts file"; defined $derivedeastasianwidthtxt or die "Did not find DerivedEastAsianWidth file"; print "Creating decomp table\n" if ($do_decomp); @@ -502,6 +509,26 @@ while () close INPUT; +print "Reading scripts\n"; + +open (INPUT, "< $scriptstxt") || exit 1; + +while () { + s/#.*//; + next if /^\s*$/; + if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) { + die "Cannot parse line: '$_'\n"; + } + + if (defined $2) { + push @scripts, [ hex $1, hex $2, uc $3 ]; + } else { + push @scripts, [ hex $1, hex $1, uc $3 ]; + } +} + +close INPUT; + print "Reading derived east asian widths\n"; open (INPUT, "< $derivedeastasianwidthtxt") || exit 1; @@ -534,9 +561,12 @@ if ($do_decomp) { &print_decomp ($last_code); &output_composition_table; } - &print_line_break ($last_code); +if ($do_scripts) { + &print_scripts +} + exit 0; @@ -1427,3 +1457,96 @@ EOT &output_one_width_table ($out,"wide", "[FW]"); &output_one_width_table ($out, "ambiguous", "[A]"); } + +sub print_scripts +{ + my $start; + my $end; + my $script; + my $easy_range; + my $i; + + print STDERR "Writing gscripttable.h\n"; + + open OUT, ">gscripttable.h" or die "Cannot open gscripttable.h: $!\n"; + + print OUT<[0] <=> $b->[0] } @scripts; + + $easy_range = 0x2000; + + print OUT< $end) { + $start = $scripts[$i]->[0]; + $end = $scripts[$i]->[1]; + $script = $scripts[$i]->[2]; + $i++; + } + + if ($c < $start) { + printf OUT " G_UNICODE_SCRIPT_UNKNOWN,"; + } else { + printf OUT " G_UNICODE_SCRIPT_%s,", $script; + } + } + + if ($end >= $easy_range) { + $i--; + $scripts[$i]->[0] = $easy_range; + } + + print OUT<[0]; + $end = $scripts[$i]->[1]; + $script = $scripts[$i]->[2]; + + while ($i <= $#scripts - 1 && + $scripts[$i + 1]->[0] == $end + 1 && + $scripts[$i + 1]->[2] eq $script) { + $i++; + $end = $scripts[$i]->[1]; + } + printf OUT " { %#06x, %5d, G_UNICODE_SCRIPT_%s },\n", $start, $end - $start + 1, $script; + } + + printf OUT<