that should never begin words, for an example an accent that must follow
another character previously could precede all other characters.
+=item *
+
+Case insensitive matching in regular expressions compiled under C<use
+locale> now works much more sanely when the pattern and/or target string
+are encoded in UTF-8. Previously, under these conditions the localeness
+was completely lost. Now, code points above 255 are treated as Unicode,
+but code points between 0 and 255 are treated using the current locale
+rules, regardless of whether the pattern or string are encoded in UTF-8.
+The few case insensitive matches that cross the 255/256 boundary are not
+allowed. For example, 0xFF does not caselessly match the character at
+0x178, LATIN CAPITAL LETTER Y WITH DIAERESIS, because 0xFF may not be
+LATIN SMALL LETTER Y in the current locale, and Perl has no way of
+knowing if that character even exists in the locale, much less what code
+point it is.
+
=back
=head1 Known Problems
Do case-insensitive pattern matching.
If C<use locale> is in effect, the case map is taken from the current
-locale. See L<perllocale>.
+locale for code points less than 255, and from Unicode rules for larger
+code points. See L<perllocale>.
=item x
X</x>
intervening call of the
L<setlocale() function|perllocale/The setlocale function>.
This modifier is automatically set if the regular expression is compiled
-within the scope of a C<"use locale"> pragma. Results are not
-well-defined when using this and matching against a utf8-encoded string.
+within the scope of a C<"use locale"> pragma.
+Perl only allows single-byte locales. This means that code points above
+255 are treated as Unicode no matter what locale is in effect.
+Under Unicode rules, there are a few case-insensitive matches that cross the
+boundary 255/256 boundary. These are disallowed. For example,
+0xFF does not caselessly match the character at 0x178, LATIN CAPITAL
+LETTER Y WITH DIAERESIS, because 0xFF may not be LATIN SMALL LETTER Y
+in the current locale, and Perl has no way of knowing if that character
+even exists in the locale, much less what code point it is.
C<"u"> means to use Unicode semantics when pattern matching. It is
automatically set if the regular expression is encoded in utf8, or is
* folding, and the source isn't ASCII, look through all the
* characters it folds to. If any one of them is ASCII, forbid
* this fold. (cp is uni, so the 127 below is correct even for
- * EBCDIC) */
- if (use_this_char_fold && cp > 127 && MORE_ASCII_RESTRICTED) {
+ * EBCDIC). Similarly under locale rules, we don't mix under 256
+ * with above 255. XXX It really doesn't make sense to have \N{}
+ * which means a Unicode rules under locale. I (khw) think this
+ * should be warned about, but the counter argument is that people
+ * who have programmed around Perl's earlier lack of specifying the
+ * rules and used \N{} to force Unicode things in a local
+ * environment shouldn't get suddenly a warning */
+ if (use_this_char_fold) {
+ if (LOC && cp < 256) { /* Fold not known until run-time */
+ use_this_char_fold = FALSE;
+ }
+ else if ((cp > 127 && MORE_ASCII_RESTRICTED)
+ || (cp > 255 && LOC))
+ {
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
U8* s = tmpbuf;
U8* e;
e = s + foldlen;
while (s < e) {
- if (isASCII(*s)) {
+ if (isASCII(*s)
+ || (LOC && (UTF8_IS_INVARIANT(*s)
+ || UTF8_IS_DOWNGRADEABLE_START(*s))))
+ {
use_this_char_fold = FALSE;
break;
}
s += UTF8SKIP(s);
}
+ }
}
if (! use_this_char_fold) { /* Not folding, just append to the
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
if (UTF && FOLD) {
- /* Prime the casefolded buffer. */
- if (isASCII(ender)) {
+ /* Prime the casefolded buffer. Locale rules, which apply
+ * only to code points < 256, aren't known until execution,
+ * so for them, just output the original character using
+ * utf8 */
+ if (LOC && ender < 256) {
+ if (UNI_IS_INVARIANT(ender)) {
+ *tmpbuf = (U8) ender;
+ foldlen = 1;
+ } else {
+ *tmpbuf = UTF8_TWO_BYTE_HI(ender);
+ *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
+ foldlen = 2;
+ }
+ }
+ else if (isASCII(ender)) { /* Note: Here can't also be LOC
+ */
ender = toLOWER(ender);
*tmpbuf = (U8) ender;
foldlen = 1;
}
- else if (! MORE_ASCII_RESTRICTED) {
+ else if (! MORE_ASCII_RESTRICTED && ! LOC) {
+
+ /* Locale and /aa require more selectivity about the
+ * fold, so are handled below. Otherwise, here, just
+ * use the fold */
ender = toFOLD_uni(ender, tmpbuf, &foldlen);
}
else {
- /* When not to mix ASCII with non-, reject folds that
- * mix them, using only the non-folded code point. So
- * do the fold to a temporary, and inspect each
- * character in it. */
+ /* Under locale rules or /aa we are not to mix,
+ * respectively, ords < 256 or ASCII with non-. So
+ * reject folds that mix them, using only the
+ * non-folded code point. So do the fold to a
+ * temporary, and inspect each character in it. */
U8 trialbuf[UTF8_MAXBYTES_CASE+1];
U8* s = trialbuf;
UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
bool fold_ok = TRUE;
while (s < e) {
- if (isASCII(*s)) {
+ if (isASCII(*s)
+ || (LOC && (UTF8_IS_INVARIANT(*s)
+ || UTF8_IS_DOWNGRADEABLE_START(*s))))
+ {
fold_ok = FALSE;
break;
}
* target string is utf8, or under unicode rules */
if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
while (loc < e) {
- if (MORE_ASCII_RESTRICTED && (isASCII(*loc) != isASCII(j))) {
+
+ /* Can't mix ascii with non- under /aa */
+ if (MORE_ASCII_RESTRICTED
+ && (isASCII(*loc) != isASCII(j)))
+ {
goto end_multi_fold;
}
- /* XXX Discard this fold if any are latin1
- * and LOC */
if (UTF8_IS_INVARIANT(*loc)
|| UTF8_IS_DOWNGRADEABLE_START(*loc))
{
+ /* Can't mix above and below 256 under
+ * LOC */
+ if (LOC) {
+ goto end_multi_fold;
+ }
ANYOF_FLAGS(ret)
|= ANYOF_NONBITMAP_NON_UTF8;
break;
Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
}
c = SvUV(*c_p);
- if (MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j))) {
+
+ /* /aa doesn't allow folds between ASCII and
+ * non-; /l doesn't allow them between above
+ * and below 256 */
+ if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j)))
+ || (LOC && ((c < 256) != (j < 256))))
+ {
continue;
}
}
/* It may be that the code point is already
* in this range or already in the bitmap,
- * XXX THink about LOC
* in which case we need do nothing */
else if ((c < start || c > end)
&& (c > 255
case EXACTFL:
if (UTF_PATTERN || utf8_target) {
- utf8_fold_flags = 0; /* XXX, add new flag for locale */
+ utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
goto do_exactf_utf8;
}
fold_array = PL_fold_locale;
PL_reg_flags |= RF_tainted;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
- fold_utf8_flags = 0;
+ fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
goto do_exactf;
case EXACTFU:
folder = foldEQ_locale;
fold_array = PL_fold_locale;
type = REFFL;
- utf8_fold_flags = 0;
+ utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
goto do_nref;
case NREFFA:
PL_reg_flags |= RF_tainted;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
- utf8_fold_flags = 0;
+ utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
goto do_ref;
case REFFA:
case EXACTFL:
PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
+ utf8_flags = FOLDEQ_UTF8_LOCALE;
+ goto do_exactf;
+
case EXACTF:
case EXACTFU:
utf8_flags = 0;
assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
if (utf8_target) { /* Use full Unicode fold matching */
-
- /* For the EXACTFL case, It doesn't really make sense to compare
- * locale and utf8, but it is best we can do. The documents warn
- * against mixing them */
-
char *tmpeol = loceol;
while (hardcount < max
&& foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
return $Unicode;
}
-my %todos;
+my %todos; # List of test numbers that are expected to fail
map { $todos{$_} = '1' } (
-95557,
-95558,
-95561,
-95562,
-95573,
-95574,
-95605,
-95606,
-95609,
-95610,
-95621,
-95622,
+127405,
+127406,
+127425,
+127426,
+127437,
+127438,
+127469,
+127470,
+127489,
+127490,
+127501,
+127502,
);
sub numerically {
# To cut down on the number of tests
my $has_tested_aa_above_latin1;
my $has_tested_latin1_aa;
+my $has_tested_l_above_latin1;
+my $has_tested_latin1_l;
# For use by pairs() in generating combinations
sub prefix {
my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
my $target_has_ascii = grep { $_ < 128 } @target;
my $pattern_has_ascii = grep { $_ < 128 } @pattern;
+ my $target_has_latin1 = grep { $_ < 256 } @target;
+ my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
# We don't test multi-char folding into other multi-chars. We are testing
#diag $progress;
# Now grind out tests, using various combinations.
- foreach my $charset ('d', 'u', 'aa') {
+ foreach my $charset ('d', 'l', 'u', 'aa') {
# /aa should only affect things with folds in the ASCII range. But, try
# it on one pair in the other ranges just to make sure it doesn't break
$has_tested_latin1_aa = $test;
}
}
+ elsif ($charset eq 'l') {
+ if (! $target_has_latin1 && ! $pattern_has_latin1) {
+ next if defined $has_tested_latin1_l && $has_tested_latin1_l != $test;
+ $has_tested_latin1_l = $test;
+ }
+ }
foreach my $utf8_target (0, 1) { # Both utf8 and not, for
# code points < 256
foreach my $utf8_pattern (0, 1) {
next if $pattern_above_latin1 && ! $utf8_pattern;
- my $uni_semantics = $utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/;
+
+ # Our testing of 'l' uses the POSIX locale, which is ASCII-only
+ my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/);
my $upgrade_pattern = "";
$upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
my @rhs = @x_pattern;
my $rhs = join "", @rhs;
my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
- || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii);
+ || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
+ || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1);
# Do simple tests of referencing capture buffers, named and
# numbered.
# fold for /aa and the quantifier isn't sufficient
# to allow it to span to both sides.
$op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
+
+ # Or for /l
+ $op = 0 if $target_has_latin1 && $charset eq 'l' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
}
$op = ($op) ? '=~' : '!~';