tests/gen-casemap-txt.pl

   1 #! /usr/bin/perl -w
   2
   3 #    Copyright (C) 1998, 1999 Tom Tromey
   4 #    Copyright (C) 2001 Red Hat Software
   5
   6 #    This program is free software; you can redistribute it and/or modify
   7 #    it under the terms of the GNU General Public License as published by
   8 #    the Free Software Foundation; either version 2, or (at your option)
   9 #    any later version.
  10
  11 #    This program is distributed in the hope that it will be useful,
  12 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 #    GNU General Public License for more details.
  15
  16 #    You should have received a copy of the GNU General Public License
  17 #    along with this program; if not, write to the Free Software
  18 #    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  19 #    02111-1307, USA.
  20
  21 # gen-casemap-test.pl - Generate test cases for case mapping from Unicode data.
  22 # See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
  23 # I consider the output of this program to be unrestricted.  Use it as
  24 # you will.
  25
  26 require 5.006;
  27 use utf8;
  28
  29 if (@ARGV != 3) {
  30     $0 =~ s@.*/@@;
  31     die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n";
  32 }
  33
  34 use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);
  35
  36 # Names of fields in Unicode data table.
  37 $CODE = 0;
  38 $NAME = 1;
  39 $CATEGORY = 2;
  40 $COMBINING_CLASSES = 3;
  41 $BIDI_CATEGORY = 4;
  42 $DECOMPOSITION = 5;
  43 $DECIMAL_VALUE = 6;
  44 $DIGIT_VALUE = 7;
  45 $NUMERIC_VALUE = 8;
  46 $MIRRORED = 9;
  47 $OLD_NAME = 10;
  48 $COMMENT = 11;
  49 $UPPER = 12;
  50 $LOWER = 13;
  51 $TITLE = 14;
  52
  53 # Names of fields in the SpecialCasing table
  54 $CASE_CODE = 0;
  55 $CASE_LOWER = 1;
  56 $CASE_TITLE = 2;
  57 $CASE_UPPER = 3;
  58 $CASE_CONDITION = 4;
  59
  60 my @upper;
  61 my @title;
  62 my @lower;
  63
  64 binmode STDOUT, ":utf8";
  65 open (INPUT, "< $ARGV[1]") || exit 1;
  66
  67 $last_code = -1;
  68 while (<INPUT>)
  69 {
  70     chop;
  71     @fields = split (';', $_, 30);
  72     if ($#fields != 14)
  73     {
  74         printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
  75     }
  76
  77     $code = hex ($fields[$CODE]);
  78
  79     if ($code > $last_code + 1)
  80     {
  81         # Found a gap.
  82         if ($fields[$NAME] =~ /Last>/)
  83         {
  84             # Fill the gap with the last character read,
  85             # since this was a range specified in the char database
  86             @gfields = @fields;
  87         }
  88         else
  89         {
  90             # The gap represents undefined characters.  Only the type
  91             # matters.
  92             @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
  93                         '', '', '', '');
  94         }
  95         for (++$last_code; $last_code < $code; ++$last_code)
  96         {
  97             $gfields{$CODE} = sprintf ("%04x", $last_code);
  98             &process_one ($last_code, @gfields);
  99         }
 100     }
 101     &process_one ($code, @fields);
 102     $last_code = $code;
 103 }
 104
 105 close INPUT;
 106
 107 open (INPUT, "< $ARGV[2]") || exit 1;
 108
 109 while (<INPUT>)
 110 {
 111     my $code;
 112
 113     chop;
 114
 115     next if /^#/;
 116     next if /^\s*$/;
 117
 118     s/\s*#.*//;
 119
 120     @fields = split ('\s*;\s*', $_, 30);
 121
 122     $raw_code = $fields[$CASE_CODE];
 123     $code = hex ($raw_code);
 124
 125     if ($#fields != 4 && $#fields != 5)
 126     {
 127         printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
 128         next;
 129     }
 130
 131     if (defined $fields[5]) {
 132         # Ignore conditional special cases - we'll handle them manually
 133         next;
 134     }
 135
 136     $upper[$code] = &make_hex ($fields[$CASE_UPPER]);
 137     $lower[$code] = &make_hex ($fields[$CASE_LOWER]);
 138     $title[$code] = &make_hex ($fields[$CASE_TITLE]);
 139 }
 140
 141 close INPUT;
 142
 143 print <<EOT;
 144 # Test cases generated from Unicode $ARGV[0] data
 145 # by gen-case-tests.pl. Do not edit.
 146 #
 147 # Some special hand crafted tests
 148 #
 149 tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
 150 tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
 151 tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
 152 tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
 153 tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
 154 tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
 155 # Test reordering of YPOGEGRAMMENI across other accents
 156 \t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t
 157 \t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t
 158 # Handling of final and nonfinal sigma
 159         ΜΆΙΟΣ      μάιος      Μάιος      ΜΆΙΟΣ
 160         ΜΆΙΟΣ      μάιος      Μάιος      ΜΆΙΟΣ
 161         ΣΙΓΜΑ      σιγμα      Σιγμα      ΣΙΓΜΑ
 162 # Lithuanian rule of i followed by letter with dot. Not at all sure
 163 # about the titlecase part here
 164 lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t
 165 lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t
 166 lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
 167 lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
 168 lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
 169 lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
 170 lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
 171 lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
 172 lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
 173 lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
 174 lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
 175 lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t
 176 lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t
 177 lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
 178 lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
 179 lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
 180 lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
 181 lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
 182 lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
 183 lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
 184 lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
 185 lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
 186 # Special case not at initial position
 187 \ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04
 188 #
 189 # Now the automatic tests
 190 #
 191 EOT
 192 &print_tests;
 193
 194 exit 0;
 195
 196 # Process a single character.
 197 sub process_one
 198 {
 199     my ($code, @fields) = @_;
 200
 201     my $type =  $fields[$CATEGORY];
 202     if ($type eq 'Ll')
 203     {
 204         $upper[$code] = make_hex ($fields[$UPPER]);
 205         $lower[$code] = pack ("U", $code);
 206         $title[$code] = make_hex ($fields[$TITLE]);
 207     }
 208     elsif ($type eq 'Lu')
 209     {
 210         $lower[$code] = make_hex ($fields[$LOWER]);
 211         $upper[$code] = pack ("U", $code);
 212         $title[$code] = make_hex ($fields[$TITLE]);
 213     }
 214
 215     if ($type eq 'Lt')
 216     {
 217         $upper[$code] = make_hex ($fields[$UPPER]);
 218         $lower[$code] = pack ("U", hex ($fields[$LOWER]));
 219         $title[$code] = make_hex ($fields[$LOWER]);
 220     }
 221 }
 222
 223 sub print_tests
 224 {
 225     for ($i = 0; $i < 0x10ffff; $i++) {
 226         if ($i == 0x3A3) {
 227             # Greek sigma needs special tests
 228             next;
 229         }
 230
 231         my $lower = $lower[$i];
 232         my $title = $title[$i];
 233         my $upper = $upper[$i];
 234
 235         if (defined $upper || defined $lower || defined $title) {
 236             printf "\t%s\t%s\t%s\t%s\t# %4X\n",
 237                     pack ("U", $i),
 238                     (defined $lower ? $lower : ""),
 239                     (defined $title ? $title : ""),
 240                     (defined $upper ? $upper : ""),
 241                     $i;
 242         }
 243     }
 244 }
 245
 246 sub make_hex
 247 {
 248     my $codes = shift;
 249
 250     $codes =~ s/^\s+//;
 251     $codes =~ s/\s+$//;
 252
 253     if ($codes eq "0" || $codes eq "") {
 254         return "";
 255     } else {
 256         return pack ("U*", map { hex ($_) } split /\s+/, $codes);
 257     }
 258 }