codepage/cptable.pl

   1 #!/usr/bin/perl
   2 #
   3 # Produce a codepage matching table.  For each 8-bit character, list
   4 # a primary and an alternate match (the latter used for case-insensitive
   5 # matching.)
   6 #
   7 # Usage:
   8 #       cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
   9 #
  10 # Note: for the format of the UnicodeData file, see:
  11 # http://www.unicode.org/Public/UNIDATA/UCD.html
  12 #
  13
  14 ($ucd, $cpco, $cpfs, $cpout) = @ARGV;
  15
  16 if (!defined($cpout)) {
  17     die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
  18 }
  19
  20 %ucase   = ();
  21 %lcase   = ();
  22 %tcase   = ();
  23 %decomp  = ();
  24
  25 open(UCD, '<', $ucd)
  26     or die "$0: could not open unicode data: $ucd: $!\n";
  27 while (defined($line = <UCD>)) {
  28     chomp $line;
  29     @f = split(/;/, $line);
  30     $n = hex $f[0];
  31     $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n;
  32     $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n;
  33     $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n;
  34     if ($f[5] =~ /^[0-9A-F\s]+$/) {
  35         # This character has a canonical decomposition.
  36         # The regular expression rejects angle brackets, so other
  37         # decompositions aren't permitted.
  38         $decomp{$n} = [];
  39         foreach my $dch (split(' ', $f[5])) {
  40             push(@{$decomp{$n}}, hex $dch);
  41         }
  42     }
  43 }
  44 close(UCD);
  45
  46 #
  47 # Filesystem and console codepages.  The filesystem codepage is used
  48 # for FAT shortnames, whereas the console codepage is whatever is used
  49 # on the screen and keyboard.
  50 #
  51 @xtab = (undef) x 256;
  52 %tabx = ();
  53 open(CPFS, '<', $cpfs)
  54     or die "$0: could not open fs codepage: $cpfs: $!\n";
  55 while (defined($line = <CPFS>)) {
  56     $line =~ s/\s*(\#.*|)$//;
  57     @f = split(/\s+/, $line);
  58     next if (scalar @f != 2);
  59     next if (hex $f[0] > 255);
  60     $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
  61     $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
  62 }
  63 close(CPFS);
  64
  65 @ytab = (undef) x 256;
  66 %taby = ();
  67 open(CPCO, '<', $cpco)
  68     or die "$0: could not open console codepage: $cpco: $!\n";
  69 while (defined($line = <CPCO>)) {
  70     $line =~ s/\s*(\#.*|)$//;
  71     @f = split(/\s+/, $line);
  72     next if (scalar @f != 2);
  73     next if (hex $f[0] > 255);
  74     $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
  75     $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
  76 }
  77 close(CPCO);
  78
  79 open(CPOUT, '>', $cpout)
  80     or die "$0: could not open output file: $cpout: $!\n";
  81 #
  82 # Magic number, in anticipation of being able to load these
  83 # files dynamically...
  84 #
  85 print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1);
  86
  87 # Header fields available for future use...
  88 print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);
  89
  90 #
  91 # Self (shortname) uppercase table.
  92 # This depends both on the console codepage and the filesystem codepage;
  93 # the logical transcoding operation is:
  94 #
  95 # $tabx{$ucase{$ytab[$i]}}
  96 #
  97 # ... where @ytab is console codepage -> Unicode and
  98 # %tabx is Unicode -> filesystem codepage.
  99 #
 100 @uctab = (undef) x 256;
 101 for ($i = 0; $i < 256; $i++) {
 102     $uuc = $ucase{$ytab[$i]};   # Unicode upper case
 103     if (defined($tabx{$uuc})) {
 104         # Straight-forward conversion
 105         $u = $tabx{$uuc};
 106     } elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
 107         # Upper case equivalent stripped of accents
 108         $u = $tabx{${$decomp{$uuc}}[0]};
 109     } else {
 110         # No equivalent at all found.  Assume it is a lower-case-only
 111         # character, like greek alpha in CP437.
 112         $u = $i;
 113     }
 114     $uctab[$i] = $u;
 115     print CPOUT pack("C", $u);
 116 }
 117
 118 #
 119 # Self (shortname) lowercase table.
 120 # This depends both on the console codepage and the filesystem codepage;
 121 # the logical transcoding operation is:
 122 #
 123 # $taby{$lcase{$xtab[$i]}}
 124 #
 125 # ... where @ytab is console codepage -> Unicode and
 126 # %tabx is Unicode -> filesystem codepage.
 127 #
 128 @lctab = (undef) x 256;
 129 for ($i = 0; $i < 256; $i++) {
 130     $llc = $lcase{$xtab[$i]};   # Unicode lower case
 131     if (defined($l = $taby{$llc}) && $uctab[$l] == $i) {
 132         # Straight-forward conversion
 133     } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) {
 134         # Lower case equivalent stripped of accents
 135     } else {
 136         # No equivalent at all found.  Find *anything* that matches the
 137         # bijection criterion...
 138         for ($l = 0; $l < 256; $l++) {
 139             last if ($uctab[$l] == $i);
 140         }
 141         $l = $i if ($l == 256); # If nothing, we're screwed anyway...
 142     }
 143     $lctab[$i] = $l;
 144     print CPOUT pack("C", $l);
 145 }
 146
 147 #
 148 # Unicode (longname) matching table.
 149 # This only depends on the console codepage.
 150 #
 151 $pp0 = '';  $pp1 = '';
 152 for ($i = 0; $i < 256; $i++) {
 153     if (!defined($ytab[$i])) {
 154         $p0 = $p1 = 0xffff;
 155     } else {
 156         $p0 = $ytab[$i];
 157         if ($ucase{$p0} != $p0) {
 158             $p1 = $ucase{$p0};
 159         } elsif ($lcase{$p0} != $p0) {
 160             $p1 = $lcase{$p0};
 161         } elsif ($tcase{$p0} != $p0) {
 162             $p1 = $tcase{$p0};
 163         } else {
 164             $p1 = $p0;
 165         }
 166     }
 167     # Only the BMP is supported...
 168     $p0 = 0xffff if ($p0 > 0xffff);
 169     $p1 = 0xffff if ($p1 > 0xffff);
 170     $pp0 .= pack("v", $p0);
 171     $pp1 .= pack("v", $p1);
 172 }
 173 print CPOUT $pp0, $pp1;
 174 close (CPOUT);
 175
 176