From daf3b8d4938645bc97bae0c97b089ea40463c913 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 15 Dec 2010 09:57:25 -0700 Subject: [PATCH] Revamp t/uni/fold.t This patch revamps fold.t but using essentially the same tests on essentially the same character set. It: 1) Works on EBCDIC 2) Uses test.pl 3) Separates out the 8 tests per character that previously were all combined into a single test per character 4) Outputs on each line the actual test performed 5) Corrects and hardens some tests on multi-character folding characters. To expand on point 5): Previously, the wrong behavior was tested for; correct behavior failed. For example, ":\N{LATIN SMALL LIGATURE ST}:" =~ /:[_st]:/i previously passed. But the fold of the string is two characters, and so should not match a one-character long character class. Instead it should match: ":\N{LATIN SMALL LIGATURE ST}:" =~ /:[_st]{2}:/i The new test includes TODO tests for both of them. ":\N{LATIN SMALL LIGATURE ST}:" !~ /:[_st]:/i ":\N{LATIN SMALL LIGATURE ST}:" =~ /:[_st]{2}:/i Also the inverse relation ":st:" =~ /:[_\N{LATIN SMALL LIGATURE ST}]:/i passes, semi-coincidentally, for some. By changing the test to ":ST:" =~ /:[_\N{LATIN SMALL LIGATURE ST}]:/i they all fail, (and are made TODO's). --- t/uni/fold.t | 142 +++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 105 insertions(+), 37 deletions(-) diff --git a/t/uni/fold.t b/t/uni/fold.t index 0f71c80..c841614 100644 --- a/t/uni/fold.t +++ b/t/uni/fold.t @@ -1,54 +1,122 @@ +use strict; +use warnings; + +# re/fold_grind.t has more complex tests, but doesn't test every fold + BEGIN { chdir 't' if -d 't'; @INC = '../lib'; + require './test.pl'; } +binmode *STDOUT, ":utf8"; + use File::Spec; +our $TODO; +plan("no_plan"); + +# Read in the official case folding definitions. my $CF = File::Spec->catfile(File::Spec->catdir(File::Spec->updir, "lib", "unicore"), "CaseFolding.txt"); -use constant EBCDIC => ord 'A' == 193; +die qq[$0: failed to open "$CF": $!\n] if ! open(my $fh, "<", $CF); + +my @CF; +my %reverse_fold; +while (<$fh>) { + # Skip S since we are going for 'F'ull case folding. I is obsolete starting + # with Unicode 3.2, but leaving it in does no harm, and allows backward + # compatibility + next unless my ($code, $type, $mapping, $name) = $_ =~ + /^([0-9A-F]+); ([CFI]); ((?:[0-9A-F]+)(?: [0-9A-F]+)*); \# (.+)/; + + # Convert any 0-255 range chars to native. + $code = sprintf("%04X", ord_latin1_to_native(hex $code)) if hex $code < 0x100; + $mapping = join " ", map { $_ = + sprintf("%04X", ord_latin1_to_native(hex $_)) } + split / /, $mapping; + + push @CF, [$code, $mapping, $type, $name]; + + # Get the inverse fold for single-char mappings. + $reverse_fold{pack "U0U*", hex $mapping} = pack "U0U*", hex $code if $type ne 'F'; +} -if (open(CF, $CF)) { - my @CF; +close($fh) or die "$0 Couldn't close $CF"; - while () { - # Skip S since we are going for 'F'ull case folding. I is obsolete starting - # with Unicode 3.2, but leaving it in does no harm, and allows backward - # compatibility - if (/^([0-9A-F]+); ([CFI]); ((?:[0-9A-F]+)(?: [0-9A-F]+)*); \# (.+)/) { - next if EBCDIC && hex $1 < 0x100; - push @CF, [$1, $2, $3, $4]; - } +foreach my $test_ref (@CF) { + my ($code, $mapping, $type, $name) = @$test_ref; + my $c = pack("U0U*", hex $code); + my $f = pack("U0U*", map { hex } split " ", $mapping); + my $f_length = length $f; + foreach my $test ( + qq[":$c:" =~ /:$c:/], + qq[":$c:" =~ /:$c:/i], + qq[":$c:" =~ /:[_$c]:/], # Place two chars in [] so doesn't get + # optimized to a non-charclass + qq[":$c:" =~ /:[_$c]:/i], + qq[":$c:" =~ /:$f:/i], + qq[":$f:" =~ /:$c:/i], + ) { + ok eval $test, "$code - $name - $mapping - $type - $test"; } - close(CF); - - die qq[$0: failed to find casefoldings from "$CF"\n] unless @CF; - - print "1..", scalar @CF, "\n"; - - my $i = 0; - for my $cf (@CF) { - my ($code, $status, $mapping, $name) = @$cf; - $i++; - my $a = pack("U0U*", hex $code); - my $b = pack("U0U*", map { hex } split " ", $mapping); - my $t0 = ":$a:" =~ /:$a:/ ? 1 : 0; - my $t1 = ":$a:" =~ /:$a:/i ? 1 : 0; - my $t2 = ":$a:" =~ /:[_$a]:/ ? 1 : 0; # Two chars in [] so doesn't get - # optimized to a non-charclass - my $t3 = ":$a:" =~ /:[_$a]:/i ? 1 : 0; - my $t4 = ":$a:" =~ /:$b:/i ? 1 : 0; - my $t5 = ":$a:" =~ /:[_$b]:/i ? 1 : 0; - my $t6 = ":$b:" =~ /:$a:/i ? 1 : 0; - my $t7 = ":$b:" =~ /:[_$a]:/i ? 1 : 0; - print $t0 && $t1 && $t2 && $t3 && $t4 && $t5 && $t6 && $t7 ? - "ok $i \# - $code - $name - $mapping - $status\n" : - "not ok $i \# - $code - $name - $mapping - $status - $t0 $t1 $t2 $t3 $t4 $t5 $t6 $t7\n"; + # Certain tests weren't convenient to put in the list above since they are + # TODO's in multi-character folds. + if ($f_length == 1) { + + # The qq loses the utf8ness of ":$f:". These tests are not about + # finding bugs in utf8ness, so make sure it's utf8. + my $test = qq[my \$s = ":$f:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i]; + ok eval $test, "$code - $name - $mapping - $type - $test"; + $test = qq[":$c:" =~ /:[_$f]:/i]; + ok eval $test, "$code - $name - $mapping - $type - $test"; + } + else { + + # There are two classes of multi-char folds that don't pass. For + # example, + # ":ß:" =~ /:[_s]{2}:/i + # ":ss:" =~ /:[_ß]:/i + # + # Some of the old tests for the second case happened to pass somewhat + # coincidentally. But none would pass if changed to this. + # ":SS:" =~ /:[_ß]:/i + # + # As the capital SS doesn't get folded. When those pass, it means + # that the code has been changed to take into account folding in the + # string, and all should pass, capitalized or not. So, what is done + # is to essentially upper-case the string for this class (but use the + # reverse fold not uc(), as that is more correct) + my $u; + for my $i (0 .. $f_length - 1) { + my $cur_char = substr($f, $i, 1); + $u .= $reverse_fold{$cur_char} || $cur_char; + } + my $test; + + local $TODO = 'Multi-char fold in [character class]'; + + TODO: { # e.g., ":ß:" !~ /:[_s]:/i # A multi-char fold should not + # match just one char + $test = qq[":$c:" !~ /:[_$f]:/i]; + ok eval $test, "$code - $name - $mapping - $type - $test"; + } + TODO: { # e.g., ":ß:" =~ /:[_s]{2}:/i + $test = qq[":$c:" =~ /:[_$f]{$f_length}:/i]; + ok eval $test, "$code - $name - $mapping - $type - $test"; + } + TODO: { # e.g., ":SS:" =~ /:[_ß]:/i + $test = qq[ my \$s = ":$u:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i]; + ok eval $test, "$code - $name - $mapping - $type - $test"; + } } -} else { - die qq[$0: failed to open "$CF": $!\n]; } + +my $num_tests = curr_test() - 1; + +die qq[$0: failed to find casefoldings from "$CF"\n] unless $num_tests > 0; + +plan($num_tests); -- 2.7.4