From: Karl Williamson Date: Mon, 14 Feb 2011 15:44:12 +0000 (-0700) Subject: fold_grind.t: Test multi-char folds X-Git-Tag: accepted/trunk/20130322.191538~5578 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=55efcec6eaba5f71068ad86a17184a439d10bad8;p=platform%2Fupstream%2Fperl.git fold_grind.t: Test multi-char folds --- diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t index a46945f..8cbb86c 100644 --- a/t/re/fold_grind.t +++ b/t/re/fold_grind.t @@ -54,6 +54,18 @@ sub range_type { my %todos; map { $todos{$_} = '1' } ( +95557, +95558, +95561, +95562, +95573, +95574, +95605, +95606, +95609, +95610, +95621, +95622, ); sub numerically { @@ -94,9 +106,17 @@ while (<$fh>) { my $from = hex $hex_from; if ($fold_type eq 'F') { - next; # XXX TODO multi-char folds - my $from_range_type = range_type($from); + my $from_range_type = range_type($from); + + # If we were testing comprehensively, we would try every combination + # of upper and lower case in the fold, but it is quite likely that if + # the code can handle all combinations if it can handle the cases + # where everything is upper and when everything is lower. Because of + # complement matching, we need to do both. And we use the + # reverse-fold instead of uppercase. @folded = map { hex $_ } @folded; + # XXX better to use reverse fold of these instead of uc + my @uc_folded = map { ord uc chr $_ } @folded; # Include three code points that are handled internally by the regex # engine specially, plus all non-above-255 multi folds (which actually @@ -108,16 +128,16 @@ while (<$fh>) { || $from_range_type != $Unicode || grep { range_type($_) != $from_range_type } @folded) { - $tests{$from} = [ [ @folded ] ]; + $tests{$from} = [ [ @folded ], [ @uc_folded ] ]; } else { - # Must be Unicode here, so chr is automatically utf8. Get the - # number of bytes in each. This is because the optimizer cares - # about length differences. - my $from_length = length encode('utf-8', chr($from)); - my $to_length = length encode('utf-8', pack 'U*', @folded); - push @{$multi_folds{$from_length}{$to_length}}, { $from => [ @folded ] }; + # The only multi-char non-utf8 fold is DF, which is handled above, + # so here chr() must be utf8. Get the number of bytes in each. + # This is because the optimizer cares about length differences. + my $from_length = length encode('UTF-8', chr($from)); + my $to_length = length encode('UTF-8', pack 'U*', @folded); + push @{$multi_folds{$from_length}{$to_length}}, { $from => [ [ @folded ], [ @uc_folded ] ] }; } } @@ -387,6 +407,36 @@ foreach my $test (sort { numerically } keys %tests) { $op = 1; } $op = ! $op if $must_match && $inverted; + + if ($inverted && @target > 1) { + # When doing an inverted match against a + # multi-char target, and there is not something on + # the left to anchor the match, if it shouldn't + # succeed, skip, as what will happen (when working + # correctly) is that it will match the first + # position correctly, and then be inverted to not + # match; then it will go to the second position + # where it won't match, but get inverted to match, + # and hence succeeding. + next if ! ($l_anchor || $prepend) && ! $op; + + # Can't ever match for latin1 code points non-uni + # semantics that have a inverted multi-char fold + # when there is something on both sides and the + # quantifier isn't such as to span the required + # width, which is 2 or 3. + $op = 0 if $ord < 255 + && ! $uni_semantics + && $both_sides + && ( ! $quantifier || $quantifier eq '?') + && $parend < 2; + + # Similarly can't ever match when inverting a multi-char + # fold for /aa and the quantifier isn't sufficient + # to allow it to span to both sides. + $op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2; + } + $op = ($op) ? '=~' : '!~'; my $debug .= " uni_semantics=$uni_semantics, should_fail=$should_fail, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anchor=$l_anchor, r_anchor=$r_anchor";