From 21edbdacf38c9eddc9dcca5c555a8a587f9df6fb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 16 Jun 2008 00:18:48 -0700 Subject: [PATCH] codepage: strip accents during shortname upper-casing If we don't have a direct upper-case equivalent to a character, but we *do* have the "naked" version of the upper-case character (defined as the first character of the canonical Unicode decomposition of the case-mapped character), then use it for the intracodepage table (used for shortnames.) This matches DOS behaviour. No obvious way, yet, to handle the fact that DOS doesn't uppercase the lowercase sigma, but that's not a huge problem; besides, lowercase sigma is mapped to position 0xE5, which would require dealing with the special handling of this character in the first position. --- codepage/cptable.pl | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/codepage/cptable.pl b/codepage/cptable.pl index 44c710c..9a9a13d 100755 --- a/codepage/cptable.pl +++ b/codepage/cptable.pl @@ -7,12 +7,16 @@ # Usage: # cptable.pl UnicodeData cpXXX.txt cpXXX.bin # +# Note: for the format of the UnicodeData file, see: +# http://www.unicode.org/Public/UNIDATA/UCD.html +# ($ucd, $cpin, $cpout) = @ARGV; %ucase = (); %lcase = (); %tcase = (); +%decomp = (); open(UCD, '<', $ucd) or die; while (defined($line = )) { @@ -22,6 +26,15 @@ while (defined($line = )) { $ucase{$n} = hex $f[12] if ($f[12] ne ''); $lcase{$n} = hex $f[13] if ($f[13] ne ''); $tcase{$n} = hex $f[14] if ($f[14] ne ''); + if ($f[5] =~ /^[0-9A-F\s]+$/) { + # This character has a canonical decomposition. + # The regular expression rejects angle brackets, so other + # decompositions aren't permitted. + $decomp{$n} = []; + foreach my $dch (split(' ', $f[5])) { + push(@{$decomp{$n}}, hex $dch); + } + } } close(UCD); @@ -53,8 +66,16 @@ print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0); # Self (shortname) uppercase table # for ($i = 0; $i < 256; $i++) { - $u = $tabx{$ucase{$xtab[$i]}}; - $u = $i unless (defined($u)); + $uuc = $ucase{$xtab[$i]}; # Unicode upper case + if (defined($tabx{$uuc})) { + # Straight-forward conversion + $u = $tabx{$uuc}; + } elsif (defined($tabx{${$decomp{$uuc}}[0]})) { + # Upper case equivalent stripped of accents + $u = $tabx{${$decomp{$uuc}}[0]}; + } else { + $u = $i; + } print CPOUT pack("C", $u); } -- 2.7.4