From: Karl Williamson Date: Mon, 27 Jan 2014 22:35:00 +0000 (-0700) Subject: Work properly under UTF-8 LC_CTYPE locales X-Git-Tag: upstream/5.20.0~597 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=31f05a37c4e9c37a7263491f2fc0237d836e1a80;p=platform%2Fupstream%2Fperl.git Work properly under UTF-8 LC_CTYPE locales This large (sorry, I couldn't figure out how to meaningfully split it up) commit causes Perl to fully support LC_CTYPE operations (case changing, character classification) in UTF-8 locales. As a side effect it resolves [perl #56820]. The basics are easy, but there were a lot of details, and one troublesome edge case discussed below. What essentially happens is that when the locale is changed to a UTF-8 one, a global variable is set TRUE (FALSE when changed to a non-UTF-8 locale). Within the scope of 'use locale', this variable is checked, and if TRUE, the code that Perl uses for non-locale behavior is used instead of the code for locale behavior. Since Perl's internal representation is UTF-8, we get UTF-8 behavior for a UTF-8 locale. More work had to be done for regular expressions. There are three cases. 1) The character classes \w, [[:punct:]] needed no extra work, as the changes fall out from the base work. 2) Strings that are to be matched case-insensitively. These form EXACTFL regops (nodes). Notice that if such a string contains only characters above-Latin1 that match only themselves, that the node can be downgraded to an EXACT-only node, which presents better optimization possibilities, as we now have a fixed string known at compile time to be required to be in the target string to match. Similarly if all characters in the string match only other above-Latin1 characters case-insensitively, the node can be downgraded to a regular EXACTFU node (match, folding, using Unicode, not locale, rules). The code changes for this could be done without accepting UTF-8 locales fully, but there were edge cases which needed to be handled differently if I stopped there, so I continued on. In an EXACTFL node, all such characters are now folded at compile time (just as before this commit), while the other characters whose folds are locale-dependent are left unfolded. This means that they have to be folded at execution time based on the locale in effect at the moment. Again, this isn't a change from before. The difference is that now some of the folds that need to be done at execution time (in regexec) are potentially multi-char. Some of the code in regexec was trivial to extend to account for this because of existing infrastructure, but the part dealing with regex quantifiers, had to have more work. Also the code that joins EXACTish nodes together had to be expanded to account for the possibility of multi-character folds within locale handling. This was fairly easy, because it already has infrastructure to handle these under somewhat different circumstances. 3) In bracketed character classes, represented by ANYOF nodes, a new inversion list was created giving the characters that should be matched by this node when the runtime locale is UTF-8. The list is ignored except under that circumstance. To do this, I created a new ANYOF type which has an extra SV for the inversion list. The edge case that caused the most difficulty is folding involving the MICRO SIGN, U+00B5. It folds to the GREEK SMALL LETTER MU, as does the GREEK CAPITAL LETTER MU. The MICRO SIGN is the only 0-255 range character that folds to outside that range. The issue is that it doesn't naturally fall out that it will match the CAP MU. If we let the CAP MU fold to the samll mu at compile time (which it can because both are above-Latin1 and so the fold is the same no matter what locale is in effect), it could appear that the regnode can be downgraded away from EXACTFL to EXACTFU, but doing so would cause the MICRO SIGN to not case insensitvely match the CAP MU. This could be special cased in regcomp and regexec, but I wanted to avoid that. Instead the mktables tables are set up to include the CAP MU as a character whose presence forbids the downgrading, so the special casing is in mktables, and not in the C code. --- diff --git a/embed.fnc b/embed.fnc index f324dd7..41ebb5e 100644 --- a/embed.fnc +++ b/embed.fnc @@ -650,7 +650,7 @@ p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_o #endif AMp |UV |to_uni_lower |UV c|NN U8 *p|NN STRLEN *lenp AMmp |UV |to_uni_fold |UV c|NN U8 *p|NN STRLEN *lenp -AMp |UV |_to_uni_fold_flags|UV c|NN U8 *p|NN STRLEN *lenp|const U8 flags +AMp |UV |_to_uni_fold_flags|UV c|NN U8 *p|NN STRLEN *lenp|U8 flags ADMpPR |bool |is_uni_alnum_lc|UV c ADMpPR |bool |is_uni_alnumc_lc|UV c ADMpPR |bool |is_uni_idfirst_lc|UV c @@ -1513,13 +1513,13 @@ Apd |UV |to_utf8_case |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp \ |NN SV **swashp|NN const char *normal|NULLOK const char *special Abmd |UV |to_utf8_lower |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp AMp |UV |_to_utf8_lower_flags |NN const U8 *p|NN U8* ustrp \ - |NULLOK STRLEN *lenp|const bool flags + |NULLOK STRLEN *lenp|bool flags Abmd |UV |to_utf8_upper |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp AMp |UV |_to_utf8_upper_flags |NN const U8 *p|NN U8* ustrp \ - |NULLOK STRLEN *lenp|const bool flags + |NULLOK STRLEN *lenp|bool flags Abmd |UV |to_utf8_title |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp AMp |UV |_to_utf8_title_flags |NN const U8 *p|NN U8* ustrp \ - |NULLOK STRLEN *lenp|const bool flags + |NULLOK STRLEN *lenp|bool flags Abmd |UV |to_utf8_fold |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp AMp |UV |_to_utf8_fold_flags|NN const U8 *p|NN U8* ustrp \ |NULLOK STRLEN *lenp|U8 flags @@ -2089,7 +2089,7 @@ Es |void |ssc_or |NN const RExC_state_t *pRExC_state \ |NN const regnode_ssc *or_with Es |SV* |get_ANYOF_cp_list_for_ssc \ |NN const RExC_state_t *pRExC_state \ - |NN const regnode_charclass_posixl* const node + |NN const regnode_charclass_posixl_fold* const node Ei |void |ssc_intersection|NN regnode_ssc *ssc \ |NN SV* const invlist|const bool invert_2nd Ei |void |ssc_union |NN regnode_ssc *ssc \ diff --git a/embedvar.h b/embedvar.h index 51fc265..5ae8d05 100644 --- a/embedvar.h +++ b/embedvar.h @@ -161,6 +161,7 @@ #define PL_in_clean_objs (vTHX->Iin_clean_objs) #define PL_in_eval (vTHX->Iin_eval) #define PL_in_load_module (vTHX->Iin_load_module) +#define PL_in_utf8_CTYPE_locale (vTHX->Iin_utf8_CTYPE_locale) #define PL_incgv (vTHX->Iincgv) #define PL_initav (vTHX->Iinitav) #define PL_inplace (vTHX->Iinplace) diff --git a/ext/XS-APItest/t/handy.t b/ext/XS-APItest/t/handy.t index 3a8abc9..9a70ec2 100644 --- a/ext/XS-APItest/t/handy.t +++ b/ext/XS-APItest/t/handy.t @@ -1,5 +1,9 @@ #!perl -w +BEGIN { + require 'loc_tools.pl'; # Contains find_utf8_locale() +} + use strict; use Test::More; use Config; @@ -13,6 +17,7 @@ sub truth($) { # Converts values so is() works } my $locale; +my $utf8_locale; if($Config{d_setlocale}) { require POSIX; $locale = POSIX::setlocale( &POSIX::LC_ALL, "C"); @@ -31,6 +36,8 @@ if($Config{d_setlocale}) { last; } } + + $utf8_locale = find_utf8_locale(); } } @@ -149,13 +156,30 @@ foreach my $name (sort keys %properties) { if (defined $locale) { require locale; import locale; + POSIX::setlocale( &POSIX::LC_ALL, "C"); $ret = truth eval "test_is${function}_LC($i)"; if ($@) { fail($@); } else { my $truth = truth($matches && $i < 128); - is ($ret, $truth, "is${function}_LC( $display_name ) == $truth"); + is ($ret, $truth, "is${function}_LC( $display_name ) == $truth (C locale)"); + } + } + + if (defined $utf8_locale) { + use locale; + + POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale); + $ret = truth eval "test_is${function}_LC($i)"; + if ($@) { + fail($@); + } + else { + + # UTF-8 locale works on full range 0-255 + my $truth = truth($matches && $i < 256); + is ($ret, $truth, "is${function}_LC( $display_name ) == $truth ($utf8_locale)"); } } } @@ -171,13 +195,28 @@ foreach my $name (sort keys %properties) { if (defined $locale && $name ne 'vertws') { require locale; import locale; + POSIX::setlocale( &POSIX::LC_ALL, "C"); $ret = truth eval "test_is${function}_LC_uvchr('$i')"; if ($@) { fail($@); } else { my $truth = truth($matches && ($i < 128 || $i > 255)); - is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth"); + is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth (C locale)"); + } + } + + if (defined $utf8_locale && $name ne 'vertws') { + use locale; + + POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale); + $ret = truth eval "test_is${function}_LC_uvchr('$i')"; + if ($@) { + fail($@); + } + else { + my $truth = truth($matches); + is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth ($utf8_locale)"); } } @@ -195,13 +234,28 @@ foreach my $name (sort keys %properties) { if ($name ne 'vertws' && defined $locale) { require locale; import locale; + POSIX::setlocale( &POSIX::LC_ALL, "C"); $ret = truth eval "test_is${function}_LC_utf8('$char')"; if ($@) { fail($@); } else { my $truth = truth($matches && ($i < 128 || $i > 255)); - is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth"); + is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth (C locale)"); + } + } + + if ($name ne 'vertws' && defined $utf8_locale) { + use locale; + + POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale); + $ret = truth eval "test_is${function}_LC_utf8('$char')"; + if ($@) { + fail($@); + } + else { + my $truth = truth($matches); + is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth ($utf8_locale)"); } } } @@ -292,10 +346,11 @@ foreach my $name (sort keys %to_properties) { } } - if ($name ne 'TITLE' && defined $locale) { + if ($name ne 'TITLE') { # Test _LC; titlecase is not defined in locales. + if (defined $locale) { require locale; import locale; - # Test _LC; titlecase is not defined in locales. + POSIX::setlocale( &POSIX::LC_ALL, "C"); $ret = eval "test_to${function}_LC($j)"; if ($@) { fail($@); @@ -304,7 +359,32 @@ foreach my $name (sort keys %to_properties) { my $should_be = ($i < 128 && $map_ref->[$index] != $missing) ? $map_ref->[$index] + $j - $list_ref->[$index] : $j; - is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X", $should_be)); + is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X (C locale)", $should_be)); + } + } + + if (defined $utf8_locale) { + use locale; + + SKIP: { + skip "to${property}_LC does not work for LATIN SMALL LETTER SHARP S", 1 + if $j == 0xDF && ($name eq 'FOLD' || $name eq 'UPPER'); + + POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale); + $ret = eval "test_to${function}_LC($j)"; + if ($@) { + fail($@); + } + else { + my $should_be = ($i < 256 + && ! ref $map_ref->[$index] + && $map_ref->[$index] != $missing + ) + ? $map_ref->[$index] + $j - $list_ref->[$index] + : $j; + is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X ($utf8_locale)", $should_be)); + } + } } } diff --git a/globvar.sym b/globvar.sym index f33d025..3cab4bf 100644 --- a/globvar.sym +++ b/globvar.sym @@ -23,6 +23,7 @@ magic_data magic_vtable_names magic_vtables memory_wrap +mod_latin1_uc no_aelem no_dir_func no_func diff --git a/handy.h b/handy.h index 0714d4e..c65170a 100644 --- a/handy.h +++ b/handy.h @@ -523,12 +523,13 @@ Variant C is like C, but the input is a pointer to a classification of just the first (possibly multi-byte) character in the string is tested. -Variant C is like the C and C variants, but uses -the C library function that gives the named classification instead of -hard-coded rules. For example, C returns the result of calling -C. This means that the result is based on the current locale, which -is what C in the name stands for. FALSE is always returned if the input -won't fit into an octet. +Variant C is like the C and C variants, but the +result is based on the current locale, which is what C in the name stands +for. If Perl can determine that the current locale is a UTF-8 locale, it uses +the published Unicode rules; otherwise, it uses the C library function that +gives the named classification. For example, C when not in a +UTF-8 locale returns the result of calling C. FALSE is always +returned if the input won't fit into an octet. Variant C is like C, but is defined on any UV. It returns the same as C for input code points less than 256, and @@ -1241,18 +1242,24 @@ EXTCONST U32 PL_charclass[]; #define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \ ? (c) \ : PL_mod_latin1_uc[ (U8) (c) ]) +#define IN_UTF8_CTYPE_LOCALE PL_in_utf8_CTYPE_locale /* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */ /* For internal core Perl use only: the base macro for defining macros like * isALPHA_LC, which uses the current LC_CTYPE locale. 'c' is the code point - * (0-255) to check. 'utf8_locale_classnum' is currently unused. The code to - * actually do the test this is passed in 'non_utf8'. If 'c' is above 255, 0 - * is returned. For accessing the full range of possible code points under - * locale rules, use the macros based on _generic_LC_uvchr instead of this. */ + * (0-255) to check. In a UTF-8 locale, the result is the same as calling + * isFOO_L1(); the 'utf8_locale_classnum' parameter is something like + * _CC_UPPER, which gives the class number for doing this. For non-UTF-8 + * locales, the code to actually do the test this is passed in 'non_utf8'. If + * 'c' is above 255, 0 is returned. For accessing the full range of possible + * code points under locale rules, use the macros based on _generic_LC_uvchr + * instead of this. */ #define _generic_LC_base(c, utf8_locale_classnum, non_utf8) \ (! FITS_IN_8_BITS(c) \ ? 0 \ + : IN_UTF8_CTYPE_LOCALE \ + ? cBOOL(PL_charclass[(U8) (c)] & _CC_mask(utf8_locale_classnum)) \ : cBOOL(non_utf8)) /* For internal core Perl use only: a helper macro for defining macros like @@ -1275,15 +1282,41 @@ EXTCONST U32 PL_charclass[]; * helper macros */ #define _generic_toLOWER_LC(c, function, cast) (! FITS_IN_8_BITS(c) \ ? (c) \ + : (IN_UTF8_CTYPE_LOCALE) \ + ? PL_latin1_lc[ (U8) (c) ] \ : function((cast)(c))) +/* Note that the result can be larger than a byte in a UTF-8 locale. It + * returns a single value, so can't adequately return the upper case of LATIN + * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two + * values "SS"); instead it asserts against that under DEBUGGING, and + * otherwise returns its input */ #define _generic_toUPPER_LC(c, function, cast) \ (! FITS_IN_8_BITS(c) \ ? (c) \ - : function((cast)(c))) - + : ((! IN_UTF8_CTYPE_LOCALE) \ + ? function((cast)(c)) \ + : ((((U8)(c)) == MICRO_SIGN) \ + ? GREEK_CAPITAL_LETTER_MU \ + : ((((U8)(c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) \ + ? LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS \ + : ((((U8)(c)) == LATIN_SMALL_LETTER_SHARP_S) \ + ? (__ASSERT_(0) (c)) \ + : PL_mod_latin1_uc[ (U8) (c) ]))))) + +/* Note that the result can be larger than a byte in a UTF-8 locale. It + * returns a single value, so can't adequately return the fold case of LATIN + * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two + * values "ss"); instead it asserts against that under DEBUGGING, and + * otherwise returns its input */ #define _generic_toFOLD_LC(c, function, cast) \ - _generic_toLOWER_LC(c, function, cast) + (LIKELY((c) != MICRO_SIGN) \ + ? (__ASSERT_(! IN_UTF8_CTYPE_LOCALE \ + || (c) != LATIN_SMALL_LETTER_SHARP_S) \ + _generic_toLOWER_LC(c, function, cast)) \ + : (IN_UTF8_CTYPE_LOCALE) \ + ? GREEK_SMALL_LETTER_MU \ + : (c)) /* Use the libc versions for these if available. */ #if defined(HAS_ISASCII) && ! defined(USE_NEXT_CTYPE) diff --git a/intrpvar.h b/intrpvar.h index 54dfef4..9f10ef8 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -230,6 +230,7 @@ PERLVARI(I, dumpindent, U16, 4) /* number of blanks per dump PERLVAR(I, exit_flags, U8) /* was exit() unexpected, etc. */ PERLVAR(I, utf8locale, bool) /* utf8 locale detected */ +PERLVAR(I, in_utf8_CTYPE_locale, bool) PERLVARA(I, colors,6, char *) /* values from PERL_RE_COLORS env var */ diff --git a/lib/locale.pm b/lib/locale.pm index fbb4a18..f7575f5 100644 --- a/lib/locale.pm +++ b/lib/locale.pm @@ -27,24 +27,6 @@ expressions, LC_COLLATE for string comparison, and LC_NUMERIC for number formatting). Each "use locale" or "no locale" affects statements to the end of the enclosing BLOCK. -Starting in Perl 5.16, a hybrid mode for this pragma is available, - - use locale ':not_characters'; - -which enables only the portions of locales that don't affect the character -set (that is, all except LC_COLLATE and LC_CTYPE). This is useful when mixing -Unicode and locales, including UTF-8 locales. - - use locale ':not_characters'; - use open ":locale"; # Convert I/O to/from Unicode - use POSIX qw(locale_h); # Import the LC_ALL constant - setlocale(LC_ALL, ""); # Generally required for the next - # statement to take effect - printf "%.2f\n", 12345.67' # Locale-defined formatting - @x = sort @y; # Unicode-defined sorting order. - # (Note that you will get better - # results using Unicode::Collate.) - See L for more detailed information on how Perl supports locales. diff --git a/lib/locale.t b/lib/locale.t index 8afbeab..987d19a 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -602,10 +602,10 @@ foreach my $Locale (@Locale) { next; } - # We test UTF-8 locales only under ':not_characters'; otherwise they have - # documented deficiencies. Non- UTF-8 locales are tested only under plain - # 'use locale', as otherwise we would have to convert everything in them - # to Unicode. + # We test UTF-8 locales only under ':not_characters'; It is easier to + # test them in other test files than here. Non- UTF-8 locales are tested + # only under plain 'use locale', as otherwise we would have to convert + # everything in them to Unicode. my %UPPER = (); # All alpha X for which uc(X) == X and lc(X) != X my %lower = (); # All alpha X for which lc(X) == X and uc(X) != X diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 8ab0b46..8f6def0 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -13849,6 +13849,50 @@ sub compile_perl() { my $any_folds = $perl->add_match_table("_Perl_Any_Folds", Description => "Code points that particpate in some fold", ); + my $loc_problem_folds = $perl->add_match_table( + "_Perl_Problematic_Locale_Folds", + Description => + "Code points that are in some way problematic under locale", + ); + + # This allows regexec.c to skip some work when appropriate. Some of the + # entries in _Perl_Problematic_Locale_Folds are multi-character folds, + my $loc_problem_folds_start = $perl->add_match_table( + "_Perl_Problematic_Locale_Foldeds_Start", + Description => + "The first character of every sequence in _Perl_Problematic_Locale_Folds", + ); + + my $cf = property_ref('Case_Folding'); + + # Every character 0-255 is problematic because what each folds to depends + # on the current locale + $loc_problem_folds->add_range(0, 255); + $loc_problem_folds_start += $loc_problem_folds; + + # Also problematic are anything these fold to outside the range. Likely + # forever the only thing folded to by these outside the 0-255 range is the + # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code + # completely general, which should catch any unexpected changes or errors. + # We look at each code point 0-255, and add its fold (including each part + # of a multi-char fold) to the list. See the commit message for these + # changes for a more complete description of the MU issue. + foreach my $range ($loc_problem_folds->ranges) { + foreach my $code_point($range->start .. $range->end) { + my $fold_range = $cf->containing_range($code_point); + next unless defined $fold_range; + + my @hex_folds = split " ", $fold_range->value; + my $start_cp = hex $hex_folds[0]; + foreach my $i (0 .. @hex_folds - 1) { + my $cp = hex $hex_folds[$i]; + next unless $cp > 255; # Already have the < 256 ones + + $loc_problem_folds->add_range($cp, $cp); + $loc_problem_folds_start->add_range($start_cp, $start_cp); + } + } + } my $folds_to_multi_char = $perl->add_match_table( "_Perl_Folds_To_Multi_Char", @@ -13856,19 +13900,38 @@ sub compile_perl() { "Code points whose fold is a string of more than one character", ); - foreach my $range (property_ref('Case_Folding')->ranges) { + # Look through all the known folds to populate these tables. + foreach my $range ($cf->ranges) { my $start = $range->start; my $end = $range->end; $any_folds->add_range($start, $end); - my @hex_code_points = split " ", $range->value; - if (@hex_code_points > 1) { + my @hex_folds = split " ", $range->value; + if (@hex_folds > 1) { # Is multi-char fold $folds_to_multi_char->add_range($start, $end); } - foreach my $i (0 .. @hex_code_points - 1) { - my $code_point = hex $hex_code_points[$i]; - $any_folds->add_range($code_point, $code_point); + my $found_locale_problematic = 0; + + # Look at each of the folded-to characters... + foreach my $i (0 .. @hex_folds - 1) { + my $cp = hex $hex_folds[$i]; + $any_folds->add_range($cp, $cp); + + # The fold is problematic if any of the folded-to characters is + # already considered problematic. + if ($loc_problem_folds->contains($cp)) { + $loc_problem_folds->add_range($start, $end); + $found_locale_problematic = 1; + } + } + + # If this is a problematic fold, add to the start chars the + # folding-from characters and first folded-to character. + if ($found_locale_problematic) { + $loc_problem_folds_start->add_range($start, $end); + my $cp = hex $hex_folds[0]; + $loc_problem_folds_start->add_range($cp, $cp); } } diff --git a/locale.c b/locale.c index a477c3b..e9c3bcc 100644 --- a/locale.c +++ b/locale.c @@ -243,14 +243,24 @@ Perl_new_ctype(pTHX_ const char *newctype) PERL_ARGS_ASSERT_NEW_CTYPE; + PL_in_utf8_CTYPE_locale = is_cur_LC_category_utf8(LC_CTYPE); + + /* A UTF-8 locale gets standard rules. But note that code still has to + * handle this specially because of the three problematic code points */ + if (PL_in_utf8_CTYPE_locale) { + Copy(PL_fold_latin1, PL_fold_locale, 256, U8); + } + else { + for (i = 0; i < 256; i++) { if (isUPPER_LC((U8) i)) - PL_fold_locale[i] = toLOWER_LC((U8) i); + PL_fold_locale[i] = (U8) toLOWER_LC((U8) i); else if (isLOWER_LC((U8) i)) - PL_fold_locale[i] = toUPPER_LC((U8) i); + PL_fold_locale[i] = (U8) toUPPER_LC((U8) i); else PL_fold_locale[i] = (U8) i; } + } #endif /* USE_LOCALE_CTYPE */ PERL_ARGS_ASSERT_NEW_CTYPE; diff --git a/perl.h b/perl.h index b748850..db08bd3 100644 --- a/perl.h +++ b/perl.h @@ -3332,6 +3332,7 @@ struct regnode_charclass_class; /* A hopefully less confusing name. The sub-classes are all Posix classes only * used under /l matching */ typedef struct regnode_charclass_class regnode_charclass_posixl; +typedef struct regnode_charclass_posixl_fold regnode_charclass_posixl_fold; typedef struct regnode_ssc regnode_ssc; typedef struct RExC_state_t RExC_state_t; diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 5f88fa3..ac1cd11 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -25,6 +25,16 @@ XXX New core language features go here. Summarize user-visible core language enhancements. Particularly prominent performance optimisations could go here, but most should go in the L section. +=head2 UTF-8 locales now supported better under C> + +A UTF-8 locale is one in which the character set is Unicode and the +encoding is UTF-8. Now, the POSIX C category operations under +such a locale (within the scope of C>), which include case +changing (like C, C<"\U">), and character classification (C<\w>, +C<\D>, C work just as if not under locale, except taint +rules are followed. Prior to this, Perl only handled single-byte +locales. This resolves [perl #56820]. + =head2 C> now compiles on systems without locale ability Previously doing this caused the program to not compile. Within its diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index e46be01..a951ec2 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -3338,10 +3338,13 @@ Respects current LC_CTYPE locale for code points < 256; and uses Unicode semantics for the remaining code points (this last can only happen if the UTF8 flag is also set). See L. -A deficiency in this is that case changes that cross the 255/256 +Starting in v5.20, Perl wil use full Unicode rules if the locale is +UTF-8. Otherwise, there is a deficiency in this scheme, which is that +case changes that cross the 255/256 boundary are not well-defined. For example, the lower case of LATIN CAPITAL LETTER SHARP S (U+1E9E) in Unicode semantics is U+00DF (on ASCII -platforms). But under C, the lower case of U+1E9E is +platforms). But under C (prior to v5.20 or not a UTF-8 +locale), the lower case of U+1E9E is itself, because 0xDF may not be LATIN SMALL LETTER SHARP S in the current locale, and Perl has no way of knowing if that character even exists in the locale, much less what code point it is. Perl returns diff --git a/pod/perllocale.pod b/pod/perllocale.pod index 6f94381..0698b34 100644 --- a/pod/perllocale.pod +++ b/pod/perllocale.pod @@ -27,20 +27,22 @@ is controlled per application by using one pragma, one function call, and several environment variables. Unfortunately, there are quite a few deficiencies with the design (and -often, the implementations) of locales, and their use for character sets -has mostly been supplanted by Unicode (see L for an -introduction to that, and keep on reading here for how Unicode interacts -with locales in Perl). - -Perl continues to support the old locale system, and starting in v5.16, -provides a hybrid way to use the Unicode character set, along with the -other portions of locales that may not be so problematic. +often, the implementations) of locales. Unicode was invented (see +L for an introduction to that) in part to address these +design deficiencies, and nowadays, there is a series of "UTF-8 +locales", based on Unicode. These are locales whose character set is +Unicode, encoded in UTF-8. Starting in v5.20, Perl fully supports +UTF-8 locales, except for sorting and string comparisions. (Use +L for these.) Perl continues to support the old +non UTF-8 locales as well. + (Unicode is also creating C, the "Common Locale Data Repository", L which includes more types of information than are available in the POSIX locale system. At the time of this writing, there was no CPAN module that provides access to this XML-encoded data. However, many of its locales have the POSIX-only data extracted, and are -available at L.) +available as UTF-8 locales at +L.) =head1 WHAT IS A LOCALE @@ -166,7 +168,8 @@ Starting in v5.16, there is an optional parameter to this pragma: use locale ':not_characters'; -This parameter allows better mixing of locales and Unicode, and is +This parameter allows better mixing of locales and Unicode (less useful +in v5.20 and later), and is described fully in L, but briefly, it tells Perl to not use the character portions of the locale definition, that is the C and C categories. Instead it will use the @@ -738,6 +741,11 @@ C<$equal_in_locale> will be true if the collation locale specifies a dictionary-like ordering that ignores space characters completely and which folds case. +Perl only supports single-byte locales for C. This means +that a UTF-8 locale likely will just give you machine-native ordering. +Use L for the full implementation of the Unicode +Collation Algorithm. + If you have a single string that you want to check for "equality in locale" against several others, you might think you could gain a little efficiency by using C in conjunction with C: @@ -801,6 +809,21 @@ C class to C. Unfortunately, this creates big problems for regular expressions. "|" still means alternation even though it matches C<\w>. +Starting in v5.20, Perl supports UTF-8 locales for C, but +otherwise Perl only supports single-byte locales, such as the ISO 8859 +series. This means that wide character locales, for example for Asian +languages, are not supported. The UTF-8 locale support is actually a +superset of POSIX locales, because it is really full Unicode behavior +as if no locale were in effect at all (except for tainting; see +L). POSIX locales, even UTF-8 ones, +are lacking certain concepts in Unicode, such as the idea that changing +the case of a character could expand to be more than one character. +Perl in a UTF-8 locale, will give you that expansion. Prior to v5.20, +Perl treated a UTF-8 locale on some platforms like an ISO 8859-1 one, +with some restrictions, and on other platforms more like the "C" locale. +For releases v5.16 and v5.18, C> could be +used as a workaround for this (see L). + Note that there are quite a few things that are unaffected by the current locale. All the escape sequences for particular characters, C<\n> for example, always mean the platform's native one. This means, @@ -1277,9 +1300,11 @@ into bankers, bikers, gamers, and so on. =head1 Unicode and UTF-8 The support of Unicode is new starting from Perl version v5.6, and more fully -implemented in versions v5.8 and later. See L. It is -strongly recommended that when combining Unicode and locale (starting in -v5.16), you use +implemented in versions v5.8 and later. See L. + +Starting in Perl v5.20, UTF-8 locales are supported in Perl, except for +C (use L instead). If you have Perl v5.16 +or v5.18 and can't upgrade, you can use use locale ':not_characters'; @@ -1299,26 +1324,38 @@ into the locale. (See L). On a per-filehandle basis, you can instead use the L module, or the L module, both available from CPAN. The latter module also has methods to ease the handling of C and environment variables, and can be used -on individual strings. Also, if you know that all your locales will be +on individual strings. If you know that all your locales will be UTF-8, as many are these days, you can use the L|perlrun/-C> command line switch. This form of the pragma allows essentially seamless handling of locales -with Unicode. The collation order will be Unicode's. It is strongly +with Unicode. The collation order will be by Unicode code point order. +It is strongly recommended that when you need to order and sort strings that you use the standard module L which gives much better results in many instances than you can get with the old-style locale handling. -For pre-v5.16 Perls, or if you use the locale pragma without the -C<:not_characters> parameter, Perl tries to work with both Unicode and -locales--but there are problems. - -Perl does not handle multi-byte locales in this case, such as have been -used for various -Asian languages, such as Big5 or Shift JIS. However, the increasingly -common multi-byte UTF-8 locales, if properly implemented, may work -reasonably well (depending on your C library implementation) in this -form of the locale pragma, simply because both +All the modules and switches just described can be used in v5.20 with +just plain C, and, should the input locales not be UTF-8, +you'll get the less than ideal behavior, described below, that you get +with pre-v5.16 Perls, or when you use the locale pragma without the +C<:not_characters> parameter in v5.16 and v5.18. If you are using +exclusively UTF-8 locales in v5.20 and higher, the rest of this section +does not apply to you. + +There are two cases, multi-byte and single-byte locales. First +multi-byte: + +The only multi-byte (or wide character) locale that Perl is ever likely +to support is UTF-8. This is due to the difficulty of implementation, +the fact that high quality UTF-8 locales are now published for every +area of the world (L), and that +failing all that you can use the L module to translate to/from +your locale. So, you'll have to do one of those things if you're using +one of these locales, such as Big5 or Shift JIS. For UTF-8 locales, in +Perls (pre v5.20) that don't have full UTF-8 locale support, they may +work reasonably well (depending on your C library implementation) +simply because both they and Perl store characters that take up multiple bytes the same way. However, some, if not most, C library implementations may not process the characters in the upper half of the Latin-1 range (128 - 255) @@ -1326,7 +1363,10 @@ properly under LC_CTYPE. To see if a character is a particular type under a locale, Perl uses the functions like C. Your C library may not work for UTF-8 locales with those functions, instead only working under the newer wide library functions like C. +However, they are treated like single-byte locales, and will have the +restrictions described below. +For single-byte locales, Perl generally takes the tack to use locale rules on code points that can fit in a single byte, and Unicode rules for those that can't (though this isn't uniformly applied, see the note at the end of this section). This @@ -1341,7 +1381,8 @@ for Unicode only, such as C<\p{Alpha}>. They assume that 0xD7 always has its Unicode meaning (or the equivalent on EBCDIC platforms). Since Latin1 is a subset of Unicode and 0xD7 is the multiplication sign in both Latin1 and Unicode, C<\p{Alpha}> will never match it, regardless of locale. A similar -issue occurs with C<\N{...}>. It is therefore a bad idea to use C<\p{}> or +issue occurs with C<\N{...}>. Prior to v5.20, It is therefore a bad +idea to use C<\p{}> or C<\N{}> under plain C--I you can guarantee that the locale will be a ISO8859-1. Use POSIX character classes instead. diff --git a/pod/perlop.pod b/pod/perlop.pod index a0aa3a0..3e1553a 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -1546,7 +1546,9 @@ If Unicode (for example, C<\N{}> or code points of 0x100 or beyond) is being used, the case map used by C<\l>, C<\L>, C<\u>, and C<\U> is as defined by Unicode. That means that case-mapping a single character can sometimes produce several characters. -Under C, C<\F> produces the same results as C<\L>. +Under C, C<\F> produces the same results as C<\L> +for all locales but a UTF-8 one, where it instead uses the Unicode +definition. All systems use the virtual C<"\n"> to represent a line terminator, called a "newline". There is no such thing as an unvarying, physical diff --git a/pod/perlre.pod b/pod/perlre.pod index 73b2a43..a67a99c 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -254,15 +254,22 @@ the same as the compilation-time locale, and can differ from one match to another if there is an intervening call of the L. -Perl only supports single-byte locales. This means that code points -above 255 are treated as Unicode no matter what locale is in effect. +The only non-single-byte locale Perl supports is (starting in v5.20) +UTF-8. This means that code points above 255 are treated as Unicode no +matter what locale is in effect (since UTF-8 implies Unicode). + Under Unicode rules, there are a few case-insensitive matches that cross -the 255/256 boundary. These are disallowed under C. For example, -0xFF (on ASCII platforms) does not caselessly match the character at -0x178, C, because 0xFF may not be -C in the current locale, and Perl -has no way of knowing if that character even exists in the locale, much -less what code point it is. +the 255/256 boundary. Except for UTF-8 locales in Perls v5.20 and +later, these are disallowed under C. For example, 0xFF (on ASCII +platforms) does not caselessly match the character at 0x178, C, because 0xFF may not be C in the current locale, and Perl has no way of +knowing if that character even exists in the locale, much less what code +point it is. + +In a UTF-8 locale in v5.20 and later, the only visible difference +between locale and non-locale in regular expressions should be tainting +(see L). This modifier may be specified to be the default by C, but see L. diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index e2df693..4348663 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -802,11 +802,15 @@ L How Does Unicode Work With Traditional Locales? -Starting in Perl 5.16, you can specify +If your locale is a UTF-8 locale, starting in Perl v5.20, Perl works +well for all categories except C dealing with sorting and +the C operator. + +For other locales, starting in Perl 5.16, you can specify use locale ':not_characters'; -to get Perl to work well with traditional locales. The catch is that you +to get Perl to work well with them. The catch is that you have to translate from the locale character set to/from Unicode yourself. See LO> above for how to diff --git a/pp.c b/pp.c index c411768..f9f0b5b 100644 --- a/pp.c +++ b/pp.c @@ -3528,17 +3528,27 @@ PP(pp_ucfirst) } /* is ucfirst() */ else if (IN_LOCALE_RUNTIME) { - *tmpbuf = toUPPER_LC(*s); /* This would be a bug if any locales - * have upper and title case different - */ + if (IN_UTF8_CTYPE_LOCALE) { + goto do_uni_rules; + } + + *tmpbuf = (U8) toUPPER_LC(*s); /* This would be a bug if any + locales have upper and title case + different */ } else if (! IN_UNI_8_BIT) { *tmpbuf = toUPPER(*s); /* Returns caseless for non-ascii, or * on EBCDIC machines whatever the * native function does */ } - else { /* is ucfirst non-UTF-8, not in locale, and cased latin1 */ - UV title_ord = _to_upper_title_latin1(*s, tmpbuf, &tculen, 's'); + else { + /* Here, is ucfirst non-UTF-8, not in locale (unless that locale is + * UTF-8, which we treat as not in locale), and cased latin1 */ + UV title_ord; + + do_uni_rules: + + title_ord = _to_upper_title_latin1(*s, tmpbuf, &tculen, 's'); if (tculen > 1) { assert(tculen == 2); @@ -3700,15 +3710,20 @@ PP(pp_uc) (SvTEMP(source) && !SvSMAGICAL(source) && SvREFCNT(source) == 1)) && !SvREADONLY(source) && SvPOK(source) && !DO_UTF8(source) - && (IN_LOCALE_RUNTIME || ! IN_UNI_8_BIT)) { - - /* We can convert in place. The reason we can't if in UNI_8_BIT is to - * make the loop tight, so we overwrite the source with the dest before - * looking at it, and we need to look at the original source - * afterwards. There would also need to be code added to handle - * switching to not in-place in midstream if we run into characters - * that change the length. - */ + && ((IN_LOCALE_RUNTIME) + ? ! IN_UTF8_CTYPE_LOCALE + : ! IN_UNI_8_BIT)) + { + + /* We can convert in place. The reason we can't if in UNI_8_BIT is to + * make the loop tight, so we overwrite the source with the dest before + * looking at it, and we need to look at the original source + * afterwards. There would also need to be code added to handle + * switching to not in-place in midstream if we run into characters + * that change the length. Since being in locale overrides UNI_8_BIT, + * that latter becomes irrelevant in the above test; instead for + * locale, the size can't normally change, except if the locale is a + * UTF-8 one */ dest = source; s = d = (U8*)SvPV_force_nomg(source, len); min = len + 1; @@ -3806,8 +3821,11 @@ PP(pp_uc) * latin1 as having case; otherwise the latin1 casing. Do the * whole thing in a tight loop, for speed, */ if (IN_LOCALE_RUNTIME) { + if (IN_UTF8_CTYPE_LOCALE) { + goto do_uni_rules; + } for (; s < send; d++, s++) - *d = toUPPER_LC(*s); + *d = (U8) toUPPER_LC(*s); } else if (! IN_UNI_8_BIT) { for (; s < send; d++, s++) { @@ -3815,6 +3833,7 @@ PP(pp_uc) } } else { + do_uni_rules: for (; s < send; d++, s++) { *d = toUPPER_LATIN1_MOD(*s); if (LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { @@ -4169,6 +4188,9 @@ PP(pp_fc) } /* Unflagged string */ else if (len) { if ( IN_LOCALE_RUNTIME ) { /* Under locale */ + if (IN_UTF8_CTYPE_LOCALE) { + goto do_uni_folding; + } for (; s < send; d++, s++) *d = toFOLD_LC(*s); } @@ -4177,6 +4199,7 @@ PP(pp_fc) *d = toFOLD(*s); } else { + do_uni_folding: /* For ASCII and the Latin-1 range, there's only two troublesome * folds, \x{DF} (\N{LATIN SMALL LETTER SHARP S}), which under full * casefolding becomes 'ss'; and \x{B5} (\N{MICRO SIGN}), which diff --git a/proto.h b/proto.h index ec1550c..bf4315d 100644 --- a/proto.h +++ b/proto.h @@ -75,7 +75,7 @@ PERL_CALLCONV bool Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) #define PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART \ assert(p) -PERL_CALLCONV UV Perl__to_uni_fold_flags(pTHX_ UV c, U8 *p, STRLEN *lenp, const U8 flags) +PERL_CALLCONV UV Perl__to_uni_fold_flags(pTHX_ UV c, U8 *p, STRLEN *lenp, U8 flags) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3); #define PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS \ @@ -87,19 +87,19 @@ PERL_CALLCONV UV Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN * #define PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS \ assert(p); assert(ustrp) -PERL_CALLCONV UV Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags) +PERL_CALLCONV UV Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS \ assert(p); assert(ustrp) -PERL_CALLCONV UV Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags) +PERL_CALLCONV UV Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS \ assert(p); assert(ustrp) -PERL_CALLCONV UV Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags) +PERL_CALLCONV UV Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS \ @@ -6616,7 +6616,7 @@ STATIC bool S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) #define PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS \ assert(pRExC_state) -STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass_posixl* const node) +STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass_posixl_fold* const node) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \ diff --git a/regcharclass.h b/regcharclass.h index 868edbc..be3ce06 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -729,6 +729,83 @@ : ( ( ( 0xEF == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0xAC == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x86 ) || ( 0x93 <= NATIVE_TO_LATIN1(((U8*)s)[2]) && NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x97 ) ) ) ? 3 : 0 ) /* + PROBLEMATIC_LOCALE_FOLD: characters whose fold is problematic under locale + + \p{_Perl_Problematic_Locale_Folds} +*/ +/*** GENERATED CODE ***/ +#define is_PROBLEMATIC_LOCALE_FOLD_utf8(s) \ +( ( ( NATIVE_TO_LATIN1(((U8*)s)[0]) & 0x80 ) == 0x00 ) ? 1 \ +: ( ( NATIVE_TO_LATIN1(((U8*)s)[0]) & 0xFE ) == 0xC2 ) ? \ + 2 \ +: ( 0xC4 == NATIVE_TO_LATIN1(((U8*)s)[0]) || 0xC7 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ?\ + ( ( 0xB0 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \ +: ( 0xC5 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( 0x89 == NATIVE_TO_LATIN1(((U8*)s)[1]) || 0xB8 == NATIVE_TO_LATIN1(((U8*)s)[1]) || 0xBF == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 )\ +: ( 0xCE == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( ( NATIVE_TO_LATIN1(((U8*)s)[1]) & 0xDF ) == 0x9C ) ? 2 : 0 ) \ +: ( 0xE1 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( ( 0xBA == NATIVE_TO_LATIN1(((U8*)s)[1]) ) && ( ( 0x96 <= NATIVE_TO_LATIN1(((U8*)s)[2]) && NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x9A ) || 0x9E == NATIVE_TO_LATIN1(((U8*)s)[2]) ) ) ? 3 : 0 )\ +: ( 0xE2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( ( 0x84 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0xAA ) ) ? 3 : 0 )\ +: ( ( ( 0xEF == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0xAC == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x86 ) ) ? 3 : 0 ) + +/*** GENERATED CODE ***/ +#define is_PROBLEMATIC_LOCALE_FOLD_cp(cp) \ +( NATIVE_TO_UNI(cp) <= 0xFF || ( 0xFF < NATIVE_TO_UNI(cp) && \ +( 0x130 == NATIVE_TO_UNI(cp) || ( 0x130 < NATIVE_TO_UNI(cp) && \ +( 0x149 == NATIVE_TO_UNI(cp) || ( 0x149 < NATIVE_TO_UNI(cp) && \ +( 0x178 == NATIVE_TO_UNI(cp) || ( 0x178 < NATIVE_TO_UNI(cp) && \ +( 0x17F == NATIVE_TO_UNI(cp) || ( 0x17F < NATIVE_TO_UNI(cp) && \ +( 0x1F0 == NATIVE_TO_UNI(cp) || ( 0x1F0 < NATIVE_TO_UNI(cp) && \ +( 0x39C == NATIVE_TO_UNI(cp) || ( 0x39C < NATIVE_TO_UNI(cp) && \ +( 0x3BC == NATIVE_TO_UNI(cp) || ( 0x3BC < NATIVE_TO_UNI(cp) && \ +( ( 0x1E96 <= NATIVE_TO_UNI(cp) && NATIVE_TO_UNI(cp) <= 0x1E9A ) || ( 0x1E9A < NATIVE_TO_UNI(cp) &&\ +( 0x1E9E == NATIVE_TO_UNI(cp) || ( 0x1E9E < NATIVE_TO_UNI(cp) && \ +( 0x212A == NATIVE_TO_UNI(cp) || ( 0x212A < NATIVE_TO_UNI(cp) && \ +( 0x212B == NATIVE_TO_UNI(cp) || ( 0xFB00 <= NATIVE_TO_UNI(cp) && NATIVE_TO_UNI(cp) <= 0xFB06 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) + +/* + PROBLEMATIC_LOCALE_FOLDEDS_START: The first folded character of folds which are problematic under locale + + \p{_Perl_Problematic_Locale_Foldeds_Start} +*/ +/*** GENERATED CODE ***/ +#define is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(s) \ +( ( ( NATIVE_TO_LATIN1(((U8*)s)[0]) & 0x80 ) == 0x00 ) ? 1 \ +: ( ( NATIVE_TO_LATIN1(((U8*)s)[0]) & 0xFE ) == 0xC2 ) ? \ + 2 \ +: ( 0xC4 == NATIVE_TO_LATIN1(((U8*)s)[0]) || 0xC7 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ?\ + ( ( 0xB0 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \ +: ( 0xC5 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( 0x89 == NATIVE_TO_LATIN1(((U8*)s)[1]) || 0xB8 == NATIVE_TO_LATIN1(((U8*)s)[1]) || 0xBF == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 )\ +: ( 0xCA == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( 0xBC == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \ +: ( 0xCE == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( ( NATIVE_TO_LATIN1(((U8*)s)[1]) & 0xDF ) == 0x9C ) ? 2 : 0 ) \ +: ( 0xE1 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( ( 0xBA == NATIVE_TO_LATIN1(((U8*)s)[1]) ) && ( ( 0x96 <= NATIVE_TO_LATIN1(((U8*)s)[2]) && NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x9A ) || 0x9E == NATIVE_TO_LATIN1(((U8*)s)[2]) ) ) ? 3 : 0 )\ +: ( 0xE2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( ( 0x84 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0xAA ) ) ? 3 : 0 )\ +: ( ( ( 0xEF == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0xAC == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x86 ) ) ? 3 : 0 ) + +/*** GENERATED CODE ***/ +#define is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(cp) \ +( NATIVE_TO_UNI(cp) <= 0xFF || ( 0xFF < NATIVE_TO_UNI(cp) && \ +( 0x130 == NATIVE_TO_UNI(cp) || ( 0x130 < NATIVE_TO_UNI(cp) && \ +( 0x149 == NATIVE_TO_UNI(cp) || ( 0x149 < NATIVE_TO_UNI(cp) && \ +( 0x178 == NATIVE_TO_UNI(cp) || ( 0x178 < NATIVE_TO_UNI(cp) && \ +( 0x17F == NATIVE_TO_UNI(cp) || ( 0x17F < NATIVE_TO_UNI(cp) && \ +( 0x1F0 == NATIVE_TO_UNI(cp) || ( 0x1F0 < NATIVE_TO_UNI(cp) && \ +( 0x2BC == NATIVE_TO_UNI(cp) || ( 0x2BC < NATIVE_TO_UNI(cp) && \ +( 0x39C == NATIVE_TO_UNI(cp) || ( 0x39C < NATIVE_TO_UNI(cp) && \ +( 0x3BC == NATIVE_TO_UNI(cp) || ( 0x3BC < NATIVE_TO_UNI(cp) && \ +( ( 0x1E96 <= NATIVE_TO_UNI(cp) && NATIVE_TO_UNI(cp) <= 0x1E9A ) || ( 0x1E9A < NATIVE_TO_UNI(cp) &&\ +( 0x1E9E == NATIVE_TO_UNI(cp) || ( 0x1E9E < NATIVE_TO_UNI(cp) && \ +( 0x212A == NATIVE_TO_UNI(cp) || ( 0x212A < NATIVE_TO_UNI(cp) && \ +( 0x212B == NATIVE_TO_UNI(cp) || ( 0xFB00 <= NATIVE_TO_UNI(cp) && NATIVE_TO_UNI(cp) <= 0xFB06 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) + +/* PATWS: pattern white space \p{PatWS} diff --git a/regcomp.c b/regcomp.c index 9bf05c0..b117d1a 100644 --- a/regcomp.c +++ b/regcomp.c @@ -954,9 +954,6 @@ S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) if (RExC_contains_locale) { ANYOF_POSIXL_SETALL(ssc); ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL; - if (RExC_contains_i) { - ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD; - } } else { ANYOF_POSIXL_ZERO(ssc); @@ -989,16 +986,12 @@ S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, return FALSE; } - if (RExC_contains_locale) { - if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) - || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) - || ! ANYOF_POSIXL_TEST_ALL_SET(ssc)) - { - return FALSE; - } - if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) { - return FALSE; - } + if (RExC_contains_locale + && ! ((ANYOF_FLAGS(ssc) & ANYOF_LOCALE) + || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) + || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))) + { + return FALSE; } return TRUE; @@ -1006,16 +999,18 @@ S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, - const regnode_charclass_posixl* const node) + const regnode_charclass_posixl_fold* const node) { /* Returns a mortal inversion list defining which code points are matched * by 'node', which is of type ANYOF. Handles complementing the result if * appropriate. If some code points aren't knowable at this time, the - * returned list must, and will, contain every possible code point. */ + * returned list must, and will, contain every code point that is a + * possibility. */ SV* invlist = sv_2mortal(_new_invlist(0)); unsigned int i; const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; @@ -1062,12 +1057,13 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, for (i = 0; i < 256; i++) { if (ANYOF_BITMAP_TEST(node, i)) { invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; } } /* If this can match all upper Latin1 code points, have to add them * as well */ - if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + if (OP(node) == ANYOF_NON_UTF8_NON_ASCII_ALL) { _invlist_union(invlist, PL_UpperLatin1, &invlist); } @@ -1079,6 +1075,21 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, if (ANYOF_FLAGS(node) & ANYOF_INVERT) { _invlist_invert(invlist); } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches */ + if (ANYOF_FLAGS(node) & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(node)) + { + _invlist_union_maybe_complement_2nd(invlist, + ANYOF_UTF8_LOCALE_INVLIST(node), + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } return invlist; } @@ -1151,7 +1162,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, } else { anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, - (regnode_charclass_posixl*) and_with); + (regnode_charclass_posixl_fold*) and_with); anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; } @@ -1228,7 +1239,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * standard, in particular almost everything by Microsoft. * The loop below just changes e.g., \w into \W and vice versa */ - regnode_charclass_posixl temp; + regnode_charclass_posixl_fold temp; int add = 1; /* To calculate the index of the complement */ ANYOF_POSIXL_ZERO(&temp); @@ -1300,7 +1311,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, } else { ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, - (regnode_charclass_posixl*) or_with); + (regnode_charclass_posixl_fold*) or_with); ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; } @@ -1444,6 +1455,10 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE); + /* The code points that could match under /li are already incorporated into + * the inversion list and bit map */ + ANYOF_FLAGS(ssc) &= ~ANYOF_LOC_FOLD; + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale); } @@ -3084,10 +3099,18 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character - * fold. *min_subtract is set to the total delta of the input nodes. + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. * - * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF - * and contains LATIN SMALL LETTER SHARP S + * And *has_exactf_sharp_s is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) * * This is as good a place as any to discuss the design of handling these * multi-character fold sequences. It's been wrong in Perl for a very long @@ -3109,11 +3132,12 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * character folded sequences. Since a single character can fold into * such a sequence, the minimum match length for this node is less than * the number of characters in the node. This routine returns in - * *min_subtract how much to subtract from the the actual length of the - * string to get a real minimum match length; it is 0 if there are no - * multi-char foldeds. This delta is used by the caller to adjust the min - * length of the match, and the delta between min and max, so that the - * optimizer doesn't reject these possibilities based on size constraints. + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS * is used for an EXACTFU node that contains at least one "ss" sequence in * it. For non-UTF-8 patterns and strings, this is the only case where @@ -3121,11 +3145,12 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * EXACTFU node without UTF-8 involvement doesn't have to concern itself * with length changes, and so can be processed faster. regexec.c takes * advantage of this. Generally, an EXACTFish node that is in UTF-8 is - * pre-folded by regcomp.c. This saves effort in regex matching. - * However, the pre-folding isn't done for non-UTF8 patterns because the - * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things - * down by forcing the pattern into UTF8 unless necessary. Also what - * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold * possibilities for the non-UTF8 patterns are quite simple, except for * the sharp s. All the ones that don't involve a UTF-8 target string are * members of a fold-pair, and arrays are set up for all of them so that @@ -3133,37 +3158,45 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * this file makes sure that in EXACTFU nodes, the sharp s gets folded to * 'ss', even if the pattern isn't UTF-8. This avoids the issues * described in the next item. - * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the - * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a - * UTF-8 pattern.) An assumption that the optimizer part of regexec.c - * (probably unwittingly, in Perl_regexec_flags()) makes is that a - * character in the pattern corresponds to at most a single character in - * the target string. (And I do mean character, and not byte here, unlike - * other parts of the documentation that have never been updated to - * account for multibyte Unicode.) sharp s in EXACTF nodes can match the - * two character string 'ss'; in EXACTFA nodes it can match - * "\x{17F}\x{17F}". These violate the assumption, and they are the only - * instances where it is violated. I'm reluctant to try to change the - * assumption, as the code involved is impenetrable to me (khw), so - * instead the code here punts. This routine examines (when the pattern - * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a - * boolean indicating whether or not the node contains a sharp s. When it - * is true, the caller sets a flag that later causes the optimizer in this - * file to not set values for the floating and fixed string lengths, and - * thus avoids the optimizer code in regexec.c that makes the invalid + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid * assumption. Thus, there is no optimization based on string lengths for - * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s. - * (The reason the assumption is wrong only in these two cases is that all - * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all - * other folds to their expanded versions. We can't prefold sharp s to - * 'ss' in EXACTF nodes because we don't know at compile time if it - * actually matches 'ss' or not. It will match iff the target string is - * in UTF-8, unlike the EXACTFU nodes, where it always matches; and - * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8 - * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem; - * but in a non-UTF8 pattern, folding it to that above-Latin1 string would - * require the pattern to be forced into UTF-8, the overhead of which we - * want to avoid.) + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) * * Similarly, the code that generates tries doesn't currently handle * not-already-folded multi-char folds, and it looks like a pain to change @@ -3271,15 +3304,68 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - const U8 * const s0 = (U8*) STRING(scan); - const U8 * s = s0; - const U8 * const s_end = s0 + STR_LEN(scan); + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ /* One pass is made over the node's string looking for all the * possibilities. to avoid some tests in the loop, there are two main * cases, for UTF-8 patterns (which can't have EXACTF nodes) and * non-UTF-8 */ if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *has_exactf_sharp_s = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ /* Examine the string for a multi-character fold sequence. UTF-8 * patterns have all characters pre-folded by the time this code is @@ -3294,34 +3380,25 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, continue; } - /* Nodes with 'ss' require special handling, except for EXACTFL - * and EXACTFA-ish for which there is no multi-char fold to - * this */ + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ if (len == 2 && *s == 's' && *(s+1) == 's' - && OP(scan) != EXACTFL && OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = 2; - OP(scan) = EXACTFU_SS; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } s += 2; } else { /* Here is a generic multi-char fold. */ - const U8* multi_end = s + len; - - /* Count how many characters in it. In the case of /l and - * /aa, no folds which contain ASCII code points are - * allowed, so check for those, and skip if found. (In - * EXACTFL, no folds are allowed to any Latin1 code point, - * not just ASCII. But there aren't any of these - * currently, nor ever likely, so don't take the time to - * test for them. The code that generates the - * is_MULTI_foo() macros croaks should one actually get put - * into Unicode .) */ - if (OP(scan) != EXACTFL - && OP(scan) != EXACTFA - && OP(scan) != EXACTFA_NO_TRIE) - { + U8* multi_end = s + len; + + /* Count how many characters in it. In the case of /aa, no + * folds which contain ASCII code points are allowed, so + * check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = utf8_length(s, multi_end); s = multi_end; } @@ -3341,9 +3418,23 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, /* The delta is how long the sequence is minus 1 (1 is how long * the character that folds to the sequence is) */ - *min_subtract += count - 1; - next_iteration: ; + total_count_delta += count - 1; + next_iteration: ; } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); } else if (OP(scan) == EXACTFA) { @@ -3363,23 +3454,23 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, continue; } } - else if (OP(scan) != EXACTFL) { - - /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the - * multi-char folds that are all Latin1. (This code knows that - * there are no current multi-char folds possible with EXACTFL, - * relying on fold_grind.t to catch any errors if the very unlikely - * event happens that some get added in future Unicode versions.) - * As explained in the comments preceding this function, we look - * also for the sharp s in EXACTF nodes; it can be in the final - * position. Otherwise we can stop looking 1 byte earlier because - * have to find at least two characters for a multi-fold */ - const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; + else { + + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; while (s < upper) { int len = is_MULTI_CHAR_FOLD_latin1(s); if (! len) { /* Not a multi-char fold. */ - if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) { *has_exactf_sharp_s = TRUE; } @@ -3396,8 +3487,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * changed so that a sharp s in the string can match this * ss in the pattern, but they remain EXACTF nodes, as they * won't match this unless the target string is is UTF-8, - * which we don't know until runtime */ - if (OP(scan) != EXACTF) { + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { OP(scan) = EXACTFU_SS; } } @@ -4169,12 +4261,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if (OP(scan) == EXACTFL) { if (flags & SCF_DO_STCLASS_AND) { - ssc_flags_and(data->start_class, - ANYOF_LOCALE|ANYOF_LOC_FOLD); + ssc_flags_and(data->start_class, ANYOF_LOCALE); } else if (flags & SCF_DO_STCLASS_OR) { - ANYOF_FLAGS(data->start_class) - |= ANYOF_LOCALE|ANYOF_LOC_FOLD; + ANYOF_FLAGS(data->start_class) |= ANYOF_LOCALE; } /* We don't know what the folds are; it could be anything. XXX @@ -10813,16 +10903,15 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, } len = 1; } - else if (FOLD && (! LOC || code_point > 255)) { - /* Folding, and ok to do so now */ + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ _to_uni_fold_flags(code_point, character, &len, - FOLD_FLAGS_FULL | ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); } else if (code_point <= MAX_UTF8_TWO_BYTE) { @@ -11445,7 +11534,6 @@ tryagain: char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; - STRLEN foldlen; U8 node_type = compute_EXACTish(pRExC_state); bool next_is_quantifier; char * oldp = NULL; @@ -11453,9 +11541,15 @@ tryagain: /* We can convert EXACTF nodes to EXACTFU if they contain only * characters that match identically regardless of the target * string's UTF8ness. The reason to do this is that EXACTF is not - * trie-able, EXACTFU is. (We don't need to figure this out until - * pass 2) */ - bool maybe_exactfu = node_type == EXACTF && PASS2; + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, @@ -11472,10 +11566,9 @@ tryagain: reparse: - /* We do the EXACTFish to EXACT node only if folding, and not if in - * locale, as whether a character folds or not isn't known until - * runtime. (And we don't need to figure this out until pass 2) */ - maybe_exact = FOLD && ! LOC && PASS2; + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -11766,7 +11859,10 @@ tryagain: goto loopdone; } - if (! FOLD) { /* The simple case, just append the literal */ + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { if (UTF) { const STRLEN unilen = reguni(pRExC_state, ender, s); if (unilen > 0) { @@ -11784,15 +11880,27 @@ tryagain: else { REGC((char)ender, s++); } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } } - else /* FOLD */ if (! ( UTF + else /* FOLD */ + if (! ( UTF /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { /* Here, are folding and are not UTF-8 encoded; therefore - * the character must be in the range 0-255. */ + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ if (IS_IN_SOME_FOLD_L1(ender)) { maybe_exact = FALSE; @@ -11822,35 +11930,33 @@ tryagain: * unfolded, and we have to calculate how many EXACTish * nodes it will take; and we may run out of room in a node * in the middle of a potential multi-char fold, and have - * to back off accordingly. */ - - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UVCHR_IS_INVARIANT(ender)) { - *s = (U8) ender; - foldlen = 1; - } else { - *s = UTF8_TWO_BYTE_HI(ender); - *(s + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; - } + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; } - else { /* ender >= 256 */ - UV folded = _to_uni_fold_flags( - ender, - (U8 *) s, - &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) - ); + else { + STRLEN foldlen; + folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } /* If this node only contains non-folding code points * so far, see if this new one is also non-folding */ if (maybe_exact) { @@ -11869,15 +11975,6 @@ tryagain: } } ender = folded; - } - s += foldlen; - - /* The loop increments each time, as all but this - * path (and one other) through it add a single byte to the - * EXACTish node. But this one has changed len to be the - * correct final value, so subtract one to cancel out the - * increment that follows */ - len += foldlen - 1; } if (next_is_quantifier) { @@ -11926,9 +12023,8 @@ tryagain: if (! UTF) { - /* These two have no multi-char folds to non-UTF characters - */ - if (ASCII_FOLD_RESTRICTED || LOC) { + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { goto loopdone; } @@ -11959,11 +12055,7 @@ tryagain: } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - - /* No Latin1 characters participate in multi-char - * folds under /l */ - if (LOC - || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( *s, *(s+1)))) { break; @@ -12074,7 +12166,7 @@ tryagain: * code points in the node that participate in folds; * similarly for 'maybe_exactfu' and code points that match * differently depending on UTF8ness of the target string - * */ + * (for /u), or depending on locale for /l */ if (maybe_exact) { OP(ret) = EXACT; } @@ -13477,22 +13569,33 @@ parseit: * space will contain a bit for each named class that is to be matched * against. This isn't needed for \p{} and pseudo-classes, as they are * not affected by locale, and hence are dealt with separately */ - if (LOC - && ! need_class - && (ANYOF_LOCALE == ANYOF_POSIXL + if (LOC) { + if (FOLD && ! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP; + } + } + if (ANYOF_LOCALE == ANYOF_POSIXL || (namedclass > OOB_NAMEDCLASS - && namedclass < ANYOF_POSIXL_MAX))) - { - need_class = 1; - if (SIZE_ONLY) { + && namedclass < ANYOF_POSIXL_MAX)) + { + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; } else { RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; } + } ANYOF_POSIXL_ZERO(ret); ANYOF_FLAGS(ret) |= ANYOF_POSIXL; } + } if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ U8 classnum; @@ -13736,11 +13839,9 @@ parseit: value, foldbuf, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) ); /* Here, should be the first character of the @@ -14105,6 +14206,19 @@ parseit: UV start, end; /* End points of code point ranges */ SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &ANYOF_UTF8_LOCALE_INVLIST(ret); + *use_list = NULL; + } + else { + use_list = &cp_list; + } /* Only the characters in this class that participate in folds need * be checked. Get the intersection of this class and all the @@ -14140,15 +14254,8 @@ parseit: while (invlist_iternext(fold_intersection, &start, &end)) { UV j; - /* Locale folding for Latin1 characters is deferred until - * runtime */ - if (LOC && start < 256) { - start = 256; - } - /* Look at every character in the range */ for (j = start; j <= end; j++) { - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; SV** listp; @@ -14167,9 +14274,10 @@ parseit: if (IS_IN_SOME_FOLD_L1(j)) { /* ASCII is always matched; non-ASCII is matched - * only under Unicode rules */ - if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { - cp_list = add_cp_to_invlist(cp_list, + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, PL_fold_latin1[j]); } else { @@ -14194,32 +14302,31 @@ parseit: switch (j) { case 'k': case 'K': - cp_list = - add_cp_to_invlist(cp_list, KELVIN_SIGN); + *use_list = + add_cp_to_invlist(*use_list, KELVIN_SIGN); break; case 's': case 'S': - cp_list = add_cp_to_invlist(cp_list, + *use_list = add_cp_to_invlist(*use_list, LATIN_SMALL_LETTER_LONG_S); break; case MICRO_SIGN: - cp_list = add_cp_to_invlist(cp_list, - GREEK_CAPITAL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); + *use_list = add_cp_to_invlist(*use_list, + GREEK_CAPITAL_LETTER_MU); + *use_list = add_cp_to_invlist(*use_list, + GREEK_SMALL_LETTER_MU); break; case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - cp_list = - add_cp_to_invlist(cp_list, - ANGSTROM_SIGN); + *use_list = + add_cp_to_invlist(*use_list, ANGSTROM_SIGN); break; case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - cp_list = add_cp_to_invlist(cp_list, + *use_list = add_cp_to_invlist(*use_list, LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); break; case LATIN_SMALL_LETTER_SHARP_S: - cp_list = add_cp_to_invlist(cp_list, + *use_list = add_cp_to_invlist(*use_list, LATIN_CAPITAL_LETTER_SHARP_S); break; case 'F': case 'f': @@ -14255,11 +14362,9 @@ parseit: * the simple fold, as the multi-character folds have been * handled earlier and separated out */ _to_uni_fold_flags(j, foldbuf, &foldlen, - ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); /* Single character fold of above Latin1. Add everything in * its fold closure to the list that this node should match. @@ -14281,23 +14386,30 @@ parseit: } c = SvUV(*c_p); - /* /aa doesn't allow folds between ASCII and non-; - * /l doesn't allow them between above and below - * 256 */ + /* /aa doesn't allow folds between ASCII and non- */ if ((ASCII_FOLD_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && c < 256)) { + && (isASCII(c) != isASCII(j)))) + { + continue; + } + + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); continue; } - /* Folds involving non-ascii Latin1 characters - * under /d are added to a separate list */ if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) { cp_list = add_cp_to_invlist(cp_list, c); } else { - depends_list = add_cp_to_invlist(depends_list, c); + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); } } } @@ -14430,10 +14542,21 @@ parseit: /* If we didn't do folding, it's because some information isn't available * until runtime; set the run-time fold flag for these. (We don't have to * worry about properties folding, as that is taken care of by the swash - * fetching) */ - if (FOLD && LOC) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (ANYOF_UTF8_LOCALE_INVLIST(ret)) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } } /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known @@ -14489,7 +14612,7 @@ parseit: if (cp_list && ! invert && ! depends_list - && ! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOC_FOLD|ANYOF_POSIXL)) && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* We don't optimize if we are supposed to make sure all non-Unicode @@ -15492,8 +15615,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } } - if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_ABOVE_LATIN1_ALL) - || ANYOF_NONBITMAP(o)) + if ((flags & ANYOF_ABOVE_LATIN1_ALL) + || ANYOF_UTF8_LOCALE_INVLIST(o) || ANYOF_NONBITMAP(o)) { if (do_sep) { Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); @@ -15567,6 +15690,25 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SvREFCNT_dec_NN(lv); } } + + /* Output any UTF-8 locale code points */ + if (flags & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(o)) { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(ANYOF_UTF8_LOCALE_INVLIST(o)); + while (invlist_iternext(ANYOF_UTF8_LOCALE_INVLIST(o), + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(ANYOF_UTF8_LOCALE_INVLIST(o)); + } } Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); @@ -16430,8 +16572,11 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) - ? ANYOF_POSIXL_SKIP : ANYOF_SKIP); + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + ? ANYOF_POSIXL_FOLD_SKIP + : (ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) { diff --git a/regcomp.h b/regcomp.h index 3a3add3..ac13597 100644 --- a/regcomp.h +++ b/regcomp.h @@ -191,9 +191,23 @@ struct regnode_charclass_class { U32 classflags; /* and run-time */ }; -/* Synthetic start class; is a regnode_charclass_class plus an SV*. Note that - * the 'next_off' field is unused, as the SSC stands alone, so there is never a - * next node. */ +/* like above, but also has folds that are used only if the runtime locale is + * UTF-8. */ +struct regnode_charclass_posixl_fold { + U8 flags; /* ANYOF_POSIXL bit must go here */ + U8 type; + U16 next_off; + U32 arg1; /* used as ptr in S_regclass */ + char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */ + U32 classflags; /* and run-time */ + SV* utf8_locale_list; /* list of code points matched by folds + in a UTF-8 locale */ +}; + +/* A synthetic start class; is a regnode_charclass_posixl_fold, plus an extra + * SV*, used only during its construction and which is not used by regexec.c. + * Note that the 'next_off' field is unused, as the SSC stands alone, so there + * is never a next node. */ struct regnode_ssc { U8 flags; /* ANYOF_POSIXL bit must go here */ U8 type; @@ -201,6 +215,8 @@ struct regnode_ssc { U32 arg1; /* used as ptr in S_regclass */ char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */ U32 classflags; /* and run-time */ + SV* utf8_locale_list; /* list of code points matched by folds + in a UTF-8 locale */ SV* invlist; /* list of code points matched */ }; @@ -470,6 +486,7 @@ struct regnode_ssc { #define ANYOF_SIZE (sizeof(struct regnode_charclass)) #define ANYOF_POSIXL_SIZE (sizeof(regnode_charclass_posixl)) #define ANYOF_CLASS_SIZE ANYOF_POSIXL_SIZE +#define ANYOF_POSIXL_FOLD_SIZE (sizeof(regnode_charclass_posixl_fold)) #define ANYOF_FLAGS(p) ((p)->flags) @@ -522,8 +539,11 @@ struct regnode_ssc { #define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode)) #define ANYOF_POSIXL_SKIP ((ANYOF_POSIXL_SIZE - 1)/sizeof(regnode)) +#define ANYOF_POSIXL_FOLD_SKIP ((ANYOF_POSIXL_FOLD_SIZE - 1)/sizeof(regnode)) #define ANYOF_CLASS_SKIP ANYOF_POSIXL_SKIP +#define ANYOF_UTF8_LOCALE_INVLIST(node) (((regnode_charclass_posixl_fold*) (node))->utf8_locale_list) + /* * Utility definitions. */ diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 8aee7c4..300cd01 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1524,6 +1524,14 @@ FOLDS_TO_MULTI: characters that fold to multi-char strings => UTF8 :fast \p{_Perl_Folds_To_Multi_Char} +PROBLEMATIC_LOCALE_FOLD : characters whose fold is problematic under locale +=> UTF8 cp :fast +\p{_Perl_Problematic_Locale_Folds} + +PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are problematic under locale +=> UTF8 cp :fast +\p{_Perl_Problematic_Locale_Foldeds_Start} + PATWS: pattern white space => generic generic_non_low cp : fast safe \p{PatWS} diff --git a/regexec.c b/regexec.c index 0fc80c5..d967084 100644 --- a/regexec.c +++ b/regexec.c @@ -1258,7 +1258,7 @@ STMT_START { \ STRLEN skiplen; \ U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ - case trie_utf8_exactfa_fold: \ + case trie_utf8_exactfa_fold: \ flags |= FOLD_FLAGS_NOMIX_ASCII; \ /* FALL THROUGH */ \ case trie_utf8_fold: \ @@ -1286,7 +1286,7 @@ STMT_START { \ len=0; \ } else { \ len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ @@ -1537,7 +1537,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto do_exactf_non_utf8; case EXACTFL: - if (is_utf8_pat || utf8_target) { + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { utf8_fold_flags = FOLDEQ_LOCALE; goto do_exactf_utf8; } @@ -3425,6 +3425,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, dVAR; U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1]; if (OP(text_node) == EXACT) { @@ -3444,7 +3445,56 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, c2 = c1 = valid_utf8_to_uvchr(pat, NULL); } } - else /* an EXACTFish node */ + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; + } + } + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); + } + } + + pat = folded; + pat_end = d; + } + } + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8(pat)) || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1(pat))) { @@ -3506,13 +3556,13 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, c2 = SvUV(*c_p); /* Folds that cross the 255/256 boundary are forbidden if - * EXACTFL, or EXACTFA and one is ASCIII. Since the - * pattern character is above 256, and its only other match - * is below 256, the only legal match will be to itself. - * We have thrown away the original, so have to compute - * which is the one above 255 */ + * EXACTFL (and isnt a UTF8 locale), or EXACTFA and one is + * ASCIII. Since the pattern character is above 256, and + * its only other match is below 256, the only legal match + * will be to itself. We have thrown away the original, so + * have to compute which is the one above 255 */ if ((c1 < 256) != (c2 < 256)) { - if (OP(text_node) == EXACTFL + if ((OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE) || ((OP(text_node) == EXACTFA || OP(text_node) == EXACTFA_NO_TRIE) && (isASCII(c1) || isASCII(c2)))) @@ -3531,7 +3581,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, else /* Here, c1 is < 255 */ if (utf8_target && HAS_NONLATIN1_FOLD_CLOSURE(c1) - && OP(text_node) != EXACTFL + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) && ((OP(text_node) != EXACTFA && OP(text_node) != EXACTFA_NO_TRIE) || ! isASCII(c1))) @@ -3581,6 +3631,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } } } + } /* Here have figured things out. Set up the returns */ if (use_chrtest_void) { @@ -4275,7 +4326,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) { + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { /* Either target or the pattern are utf8, or has the issue where * the fold lengths may differ. */ const char * const l = locinput; @@ -4883,8 +4938,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = reginfo->strbeg + ln; if (type != REF /* REF can do byte comparison */ - && (utf8_target || type == REFFU)) - { /* XXX handle REFFL better */ + && (utf8_target || type == REFFU || type == REFFL)) + { char * limit = reginfo->strend; /* This call case insensitively compares the entire buffer @@ -7524,6 +7579,16 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } } + /* For /li matching and the current locale is a UTF-8 one, look at the + * special list, valid for just these circumstances. */ + if (! match + && (flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ANYOF_UTF8_LOCALE_INVLIST(n)) + { + match = _invlist_contains_cp(ANYOF_UTF8_LOCALE_INVLIST(n), c); + } + /* If the bitmap didn't (or couldn't) match, and something outside the * bitmap could match, try that. Locale nodes specify completely the * behavior of code points in the bit map (otherwise, a utf8 target would diff --git a/sv.c b/sv.c index db8ce82..2ae8b90 100644 --- a/sv.c +++ b/sv.c @@ -13470,6 +13470,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, /* Did the locale setup indicate UTF-8? */ PL_utf8locale = proto_perl->Iutf8locale; + PL_in_utf8_CTYPE_locale = proto_perl->Iin_utf8_CTYPE_locale; /* Unicode features (see perlrun/-C) */ PL_unicode = proto_perl->Iunicode; diff --git a/t/loc_tools.pl b/t/loc_tools.pl index 9136302..890f8e1 100644 --- a/t/loc_tools.pl +++ b/t/loc_tools.pl @@ -195,9 +195,39 @@ sub is_locale_utf8 ($) { # Return a boolean as to if core Perl thinks the input # is a UTF-8 locale my $locale = shift; - # The locale name doesn't necessarily have to have "utf8" in it to be a - # UTF-8 locale, but it works, mostly. - return $locale =~ /UTF-?8/i; + use locale; + + my $save_locale = setlocale(&POSIX::LC_CTYPE()); + if (! $save_locale) { + ok(0, "Verify could save previous locale"); + return 0; + } + + if (! setlocale(&POSIX::LC_CTYPE(), $locale)) { + ok(0, "Verify could setlocale to $locale"); + return 0; + } + + my $ret = 0; + + # Use an op that gives different results for UTF-8 than any other locale. + # If a platform has UTF-8 locales, there should be at least one locale on + # most platforms with UTF-8 in its name, so if there is a bug in the op + # giving a false negative, we should get a failure for those locales as we + # go through testing all the locales on the platform. + if (CORE::fc(chr utf8::unicode_to_native(0xdf)) ne "ss") { + if ($locale =~ /UTF-?8/i) { + ok (0, "Verify $locale with UTF-8 in name is a UTF-8 locale"); + } + } + else { + $ret = 1; + } + + die "Couldn't restore locale '$save_locale'" + unless setlocale(&POSIX::LC_CTYPE(), $save_locale); + + return $ret; } sub find_utf8_locale (;$) { # Return the name of locale that core Perl thinks diff --git a/t/op/lc.t b/t/op/lc.t index 66f365b..9d3240b 100644 --- a/t/op/lc.t +++ b/t/op/lc.t @@ -5,12 +5,14 @@ BEGIN { chdir 't'; @INC = '../lib'; + require Config; import Config; require './test.pl'; + require './loc_tools.pl'; # Contains find_utf8_locale() } use feature qw( fc ); -plan tests => 134; +plan tests => 134 + 4 * 256; is(lc(undef), "", "lc(undef) is ''"); is(lcfirst(undef), "", "lcfirst(undef) is ''"); @@ -313,3 +315,38 @@ $h{k} = bless[], "\x{130}bcde"; # U+0130 grows with lc() # using delete marks it as TEMP, so uc-in-place is permitted like lc delete $h{k}, qr "^i\x{307}bcde=array\(.*\)", 'lc(TEMP ref) does not produce a corrupt string'; + + +my $utf8_locale = find_utf8_locale(); + +SKIP: { + skip 'Can\'t find a UTF-8 locale', 4*256 unless defined $utf8_locale; + + use feature qw( unicode_strings ); + + no locale; + + my @unicode_lc; + my @unicode_uc; + my @unicode_lcfirst; + my @unicode_ucfirst; + + # Get all the values outside of 'locale' + for my $i (0 .. 255) { + push @unicode_lc, lc(chr $i); + push @unicode_uc, uc(chr $i); + push @unicode_lcfirst, lcfirst(chr $i); + push @unicode_ucfirst, ucfirst(chr $i); + } + + use if $Config{d_setlocale}, qw(POSIX locale_h); + use locale; + setlocale(LC_CTYPE, $utf8_locale); + + for my $i (0 .. 255) { + is(lc(chr $i), $unicode_lc[$i], "In a UTF-8 locale, lc(chr $i) is the same as official Unicode"); + is(uc(chr $i), $unicode_uc[$i], "In a UTF-8 locale, uc(chr $i) is the same as official Unicode"); + is(lcfirst(chr $i), $unicode_lcfirst[$i], "In a UTF-8 locale, lcfirst(chr $i) is the same as official Unicode"); + is(ucfirst(chr $i), $unicode_ucfirst[$i], "In a UTF-8 locale, ucfirst(chr $i) is the same as official Unicode"); + } +} diff --git a/t/re/charset.t b/t/re/charset.t index 0c77c31..578964b 100644 --- a/t/re/charset.t +++ b/t/re/charset.t @@ -4,6 +4,7 @@ BEGIN { chdir 't' if -d 't'; @INC = '../lib'; require './test.pl'; + require './loc_tools.pl'; } use strict; @@ -35,6 +36,8 @@ $testcases{'[:print:]'} = $testcases{'[:graph:]'}; $testcases{'[:space:]'} = $testcases{'\s'}; $testcases{'[:word:]'} = $testcases{'\w'}; +my $utf8_locale; + my @charsets = qw(a d u aa); if (! is_miniperl() && $Config{d_setlocale}) { require POSIX; @@ -50,15 +53,34 @@ if (! is_miniperl() && $Config{d_setlocale}) { # legal, but since we don't know what the right answers should be, # skip the locale tests in that situation. for my $i (128 .. 255) { - goto untestable_locale if chr($i) =~ /[[:print:]]/; + goto skip_adding_C_locale if chr($i) =~ /[[:print:]]/; } push @charsets, 'l'; - untestable_locale: + + skip_adding_C_locale: + + # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale + $utf8_locale = find_utf8_locale(); + push @charsets, 'L' if defined $utf8_locale; } } # For each possible character set... foreach my $charset (@charsets) { + my $locale; + my $charset_mod = lc $charset; + my $charset_display; + if ($charset_mod eq 'l') { + $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l') + ? "C" + : $utf8_locale + ); + die "Couldn't change locale" unless $locale; + $charset_display = $charset_mod . " ($locale)"; + } + else { + $charset_display = $charset_mod; + } # And in utf8 or not foreach my $upgrade ("", 'utf8::upgrade($a); ') { @@ -87,9 +109,9 @@ foreach my $charset (@charsets) { # match or not # Everything always matches in ASCII, or under /u - if ($ord < 128 || $charset eq 'u') { - $reason = "\"$char\" is a $class under /$charset"; - $neg_reason = "\"$char\" is not a $complement under /$charset"; + if ($ord < 128 || $charset eq 'u' || $charset eq 'L') { + $reason = "\"$char\" is a $class under /$charset_display"; + $neg_reason = "\"$char\" is not a $complement under /$charset_display"; } elsif ($charset eq "a" || $charset eq "aa") { $match = 0; @@ -97,8 +119,8 @@ foreach my $charset (@charsets) { $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /a"; } elsif ($ord > 255) { - $reason = "\"$char\" is a $class under /$charset"; - $neg_reason = "\"$char\" is not a $complement under /$charset"; + $reason = "\"$char\" is a $class under /$charset_display"; + $neg_reason = "\"$char\" is not a $complement under /$charset_display"; } elsif ($charset eq 'l') { @@ -152,14 +174,14 @@ foreach my $charset (@charsets) { # Test both class and its complement, and with one or more # than one item to match. foreach my $eval ( - qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: $lb$class$rb ) /x], - qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset: $lb$class$rb\{$length} ) /x], + qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x], + qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x], ) { ok (eval $eval, $eval . $reason); } foreach my $eval ( - qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb ) /x], - qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb\{$length} ) /x], + qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x], + qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x], ) { ok (eval $eval, $eval . $neg_reason); } @@ -169,14 +191,14 @@ foreach my $charset (@charsets) { # Test \b, \B at beginning and end of string foreach my $eval ( - qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: ^ \\b . ) /x], - qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: . \\b \$) /x], + qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x], + qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x], ) { ok (eval $eval, $eval . $reason); } foreach my $eval ( - qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: ^ \\B . ) /x], - qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: . \\B \$ ) /x], + qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x], + qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x], ) { ok (eval $eval, $eval . $neg_reason); } @@ -192,14 +214,14 @@ foreach my $charset (@charsets) { my $space = display(chr $space_ord); foreach my $eval ( - qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x], - qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x], + qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], + qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], ) { ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w"); } foreach my $eval ( - qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x], - qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x], + qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], + qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], ) { ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w"); } @@ -215,16 +237,16 @@ foreach my $charset (@charsets) { # Determine if the other char is a word char in current # circumstances my $other_is_word = 1; - my $other_reason = "\"$other\" is a $class under /$charset"; - my $other_neg_reason = "\"$other\" is not a $complement under /$charset"; + my $other_reason = "\"$other\" is a $class under /$charset_display"; + my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display"; if ($other_ord > 127 - && $charset ne 'u' + && $charset ne 'u' && $charset ne 'L' && (($charset eq "a" || $charset eq "aa") || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade)))) { $other_is_word = 0; - $other_reason = "\"$other\" is not a $class under /$charset"; - $other_neg_reason = "\"$other\" is a $complement under /$charset"; + $other_reason = "\"$other\" is not a $class under /$charset_display"; + $other_neg_reason = "\"$other\" is a $complement under /$charset_display"; } my $both_reason = $reason; $both_reason .= "; $other_reason" if $other_ord != $ord; @@ -243,14 +265,14 @@ foreach my $charset (@charsets) { } foreach my $eval ( - qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: $other \\b $char ) /x], - qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: $char \\b $other ) /x], + qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x], + qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x], ) { ok (eval $eval, $eval . $both_reason); } foreach my $eval ( - qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: $other \\B $char ) /x], - qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: $char \\B $other ) /x], + qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x], + qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x], ) { ok (eval $eval, $eval . $both_neg_reason); } @@ -261,14 +283,14 @@ foreach my $charset (@charsets) { # on source code analysis, to force the testing of the FBC # (find_by_class) portions of regexec.c. foreach my $eval ( - qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: \\b $char ) /x], - qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: \\b $other ) /x], + qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x], + qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x], ) { ok (eval $eval, $eval . $both_reason); } foreach my $eval ( - qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: \\B $char ) /x], - qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: \\B $other ) /x], + qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x], + qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x], ) { ok (eval $eval, $eval . $both_neg_reason); } diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t index 57c61bd..203c9cf 100644 --- a/t/re/fold_grind.t +++ b/t/re/fold_grind.t @@ -8,6 +8,7 @@ BEGIN { require './test.pl'; require Config; import Config; skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX"); + require './loc_tools.pl'; } use charnames ":full"; @@ -84,16 +85,21 @@ my $okays; # Number of ok's in current subtest my $this_iteration; # Number of possible tests in current subtest my $count = 0; # Number of subtests = number of total tests -sub run_test($$$) { - my ($test, $todo, $debug) = @_; +sub run_test($$$$) { + my ($test, $todo, $do_we_output_locale_name, $debug) = @_; $debug = "" unless $DEBUG; my $res = eval $test; + if ($do_we_output_locale_name) { + $do_we_output_locale_name = 'setlocale(LC_CTYPE, "' + . POSIX::setlocale(&POSIX::LC_CTYPE) + . '"); '; + } if (!$res || $list_all_tests) { # Failed or debug; output the result $count++; - ok($res, "$test; $debug"); + ok($res, "$do_we_output_locale_name$test; $debug"); } else { # Just count the test as passed $okays++; @@ -411,6 +417,8 @@ sub pairs (@) { map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_ } +my $utf8_locale; + my @charsets = qw(d u a aa); if($Config{d_setlocale}) { my $current_locale = POSIX::setlocale( &POSIX::LC_CTYPE, "C") // ""; @@ -423,10 +431,16 @@ if($Config{d_setlocale}) { # skip the locale tests in that situation. for my $i (128 .. 255) { my $char = chr($i); - goto untestable_locale if uc($char) ne $char || lc($char) ne $char; + goto skip_C_locale_tests if uc($char) ne $char || lc($char) ne $char; } push @charsets, 'l'; - untestable_locale: + + skip_C_locale_tests: + + # Look for utf8 locale. We use the pseudo-modifier 'L' to indicate + # that we really want /l, but change to a UTF-8 locale. + $utf8_locale = find_utf8_locale(); + push @charsets, 'L' if defined $utf8_locale; } } @@ -495,6 +509,15 @@ foreach my $test (sort { numerically } keys %tests) { # Now grind out tests, using various combinations. foreach my $charset (@charsets) { + my $charset_mod = lc $charset; + my $current_locale = ""; + if ($charset_mod eq 'l') { + $current_locale = POSIX::setlocale(&POSIX::LC_CTYPE, + ($charset eq 'L') + ? $utf8_locale + : 'C'); + $current_locale = 'C locale' if $current_locale eq 'C'; + } $okays = 0; $this_iteration = 0; @@ -664,7 +687,7 @@ foreach my $test (sort { numerically } keys %tests) { next if $pattern_above_latin1 && ! $utf8_pattern; # Our testing of 'l' uses the POSIX locale, which is ASCII-only - my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/); + my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || $charset eq 'L' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/); my $upgrade_pattern = ""; $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern; @@ -682,18 +705,18 @@ foreach my $test (sort { numerically } keys %tests) { $op = '!~' if $should_fail; my $todo = 0; # No longer any todo's - my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; - run_test($eval, $todo, ""); + my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset_mod:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + run_test($eval, $todo, ($charset_mod eq 'l'), ""); - $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?$rhs)\\k\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; - run_test($eval, $todo, ""); + $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset_mod:^(?$rhs)\\k\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + run_test($eval, $todo, ($charset_mod eq 'l'), ""); if ($lhs ne $rhs) { - $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; - run_test($eval, "", ""); + $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset_mod:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + run_test($eval, "", ($charset_mod eq 'l'), ""); - $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?$rhs)\\k\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; - run_test($eval, "", ""); + $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset_mod:^(?$rhs)\\k\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + run_test($eval, "", ($charset_mod eq 'l'), ""); } # See if works on what could be a simple trie. @@ -706,8 +729,8 @@ foreach my $test (sort { numerically } keys %tests) { use bytes; $alternate = 'q' x length $evaled; } - $eval = "my \$c = \"$lhs\"; my \$p = qr/$rhs|$alternate/i$charset;$upgrade_target$upgrade_pattern \$c $op \$p"; - run_test($eval, "", ""); + $eval = "my \$c = \"$lhs\"; my \$p = qr/$rhs|$alternate/i$charset_mod;$upgrade_target$upgrade_pattern \$c $op \$p"; + run_test($eval, "", ($charset_mod eq 'l'), ""); # Check that works when the folded character follows something that # is quantified. This test knows the regex code internals to the @@ -722,12 +745,12 @@ foreach my $test (sort { numerically } keys %tests) { # quick, and this insulates these tests from changes in the # implementation.) for my $quantifier ('?', '??', '*', '*?', '+', '+?', '{1,2}', '{1,2}?') { - $eval = "my \$c = \"_$lhs\"; my \$p = qr/(?$charset:.$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; - run_test($eval, "", ""); - $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset:(?:..)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; - run_test($eval, "", ""); - $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset:(?:.|\\R)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; - run_test($eval, "", ""); + $eval = "my \$c = \"_$lhs\"; my \$p = qr/(?$charset_mod:.$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + run_test($eval, "", ($charset_mod eq 'l'), ""); + $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset_mod:(?:..)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + run_test($eval, "", ($charset_mod eq 'l'), ""); + $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset_mod:(?:.|\\R)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + run_test($eval, "", ($charset_mod eq 'l'), ""); } foreach my $bracketed (0, 1) { # Put rhs in [...], or not @@ -804,7 +827,7 @@ foreach my $test (sort { numerically } keys %tests) { my $must_match = ! $can_match_null || $both_sides; # for performance, but doing this missed many failures #next unless $must_match; - my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)"; + my $quantified = "(?$charset_mod:$l_anchor$prepend$interior${quantifier}$append$r_anchor)"; my $op; if ($must_match && $should_fail) { $op = 0; @@ -854,7 +877,13 @@ foreach my $test (sort { numerically } keys %tests) { } - my $desc = "my \$c = \"$prepend$lhs$append\"; " + my $desc = ""; + if ($charset_mod eq 'l') { + $desc .= 'setlocale(LC_CTYPE, "' + . POSIX::setlocale(&POSIX::LC_CTYPE) + . '"); ' + } + $desc .= "my \$c = \"$prepend$lhs$append\"; " . "my \$p = qr/$quantified/i;" . "$upgrade_target$upgrade_pattern " . "\$c " . ($op ? "=~" : "!~") . " \$p; "; @@ -903,8 +932,9 @@ foreach my $test (sort { numerically } keys %tests) { unless($list_all_tests) { $count++; is $okays, $this_iteration, "$okays subtests ok for" - . " /$charset," - . ' target="' . join("", @x_target) . '",' + . " /$charset_mod" + . (($charset_mod eq 'l') ? " ($current_locale)" : "") + . ', target="' . join("", @x_target) . '",' . ' pat="' . join("", @x_pattern) . '"'; } } diff --git a/t/uni/fold.t b/t/uni/fold.t index 6c06a2f..6cad86d 100644 --- a/t/uni/fold.t +++ b/t/uni/fold.t @@ -9,6 +9,7 @@ BEGIN { @INC = '../lib'; require Config; import Config; require './test.pl'; + require './loc_tools.pl'; # Contains find_utf8_locale() } use feature 'unicode_strings'; @@ -416,8 +417,11 @@ foreach my $test_ref (@CF) { is( fc($troublesome5), "\x{E9}abda\x{3BC}aaf\x{E8}" ); } + { use feature qw( fc unicode_strings ); + use if $Config{d_setlocale}, qw(POSIX locale_h); + setlocale(LC_ALL, "C") if $Config{d_setlocale}; # This tests both code paths in pp_fc @@ -444,12 +448,40 @@ foreach my $test_ref (@CF) { } } +my $utf8_locale = find_utf8_locale(); + { use feature qw( fc ); use locale; is(fc("\x{1E9E}"), fc("\x{17F}\x{17F}"), 'fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")'); + SKIP: { + skip 'Can\'t find a UTF-8 locale', 1 unless defined $utf8_locale; + setlocale(LC_CTYPE, $utf8_locale); + is(fc("\x{1E9E}"), "ss", 'fc("\x{1E9E}") eq "ss" in a UTF-8 locale)'); + } } +SKIP: { + skip 'Can\'t find a UTF-8 locale', 256 unless defined $utf8_locale; + + use feature qw( fc unicode_strings ); + + # Get the official fc values outside locale. + no locale; + my @unicode_fc; + for (0..0xff) { + push @unicode_fc, fc(chr); + } + + # These should match the UTF-8 locale values + setlocale(LC_CTYPE, $utf8_locale); + use locale; + for (0..0xff) { + is(fc(chr), $unicode_fc[$_], "In a UTF-8 locale, fc(chr $_) is the same as official Unicode"); + } +} + + my $num_tests = curr_test() - 1; plan($num_tests); diff --git a/utf8.c b/utf8.c index 7fe4f9e..8be5c11 100644 --- a/utf8.c +++ b/utf8.c @@ -1774,21 +1774,27 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f } UV -Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags) +Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) { /* Not currently externally documented, and subject to change * bits meanings: * FOLD_FLAGS_FULL iff full folding is to be used; - * FOLD_FLAGS_LOCALE iff in locale + * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying + * locale are to be used. * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited */ PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; + /* Tread a UTF-8 locale as not being in locale at all */ + if (IN_UTF8_CTYPE_LOCALE) { + flags &= ~FOLD_FLAGS_LOCALE; + } + if (c < 256) { UV result = _to_fold_latin1((U8) c, p, lenp, - flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); + flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); /* It is illegal for the fold to cross the 255/256 boundary under * locale; in this case return the original */ return (result > 256 && flags & FOLD_FLAGS_LOCALE) @@ -2087,9 +2093,10 @@ STATIC UV S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp) { /* This is called when changing the case of a utf8-encoded character above - * the Latin1 range, and the operation is in locale. If the result - * contains a character that crosses the 255/256 boundary, disallow the - * change, and return the original code point. See L for why; + * the Latin1 range, and the operation is in a non-UTF-8 locale. If the + * result contains a character that crosses the 255/256 boundary, disallow + * the change, and return the original code point. See L for + * why; * * p points to the original string whose case was changed; assumed * by this routine to be well-formed @@ -2138,10 +2145,11 @@ Instead use L. =cut */ /* Not currently externally documented, and subject to change: - * is set iff locale semantics are to be used for code points < 256 */ + * is set iff iff the rules from the current underlying locale are to + * be used. */ UV -Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags) +Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) { dVAR; @@ -2149,6 +2157,10 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS; + if (flags && IN_UTF8_CTYPE_LOCALE) { + flags = FALSE; + } + if (UTF8_IS_INVARIANT(*p)) { if (flags) { result = toUPPER_LC(*p); @@ -2197,13 +2209,13 @@ Instead use L. =cut */ /* Not currently externally documented, and subject to change: - * is set iff locale semantics are to be used for code points < 256 - * Since titlecase is not defined in POSIX, uppercase is used instead - * for these/ + * is set iff the rules from the current underlying locale are to be + * used. Since titlecase is not defined in POSIX, for other than a + * UTF-8 locale, uppercase is used instead for code points < 256. */ UV -Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags) +Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) { dVAR; @@ -2211,6 +2223,10 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; + if (flags && IN_UTF8_CTYPE_LOCALE) { + flags = FALSE; + } + if (UTF8_IS_INVARIANT(*p)) { if (flags) { result = toUPPER_LC(*p); @@ -2260,10 +2276,12 @@ Instead use L. =cut */ /* Not currently externally documented, and subject to change: - * is set iff locale semantics are to be used for code points < 256 */ + * is set iff iff the rules from the current underlying locale are to + * be used. + */ UV -Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags) +Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) { UV result; @@ -2271,6 +2289,10 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; + if (flags && IN_UTF8_CTYPE_LOCALE) { + flags = FALSE; + } + if (UTF8_IS_INVARIANT(*p)) { if (flags) { result = toLOWER_LC(*p); @@ -2322,9 +2344,8 @@ Instead use L. /* Not currently externally documented, and subject to change, * in - * bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code - * points < 256. Since foldcase is not defined in - * POSIX, lowercase is used instead + * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying + * locale are to be used. * bit FOLD_FLAGS_FULL is set iff full case folds are to be used; * otherwise simple folds * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are @@ -2345,6 +2366,10 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) assert(p != ustrp); /* Otherwise overwrites */ + if (flags & FOLD_FLAGS_LOCALE && IN_UTF8_CTYPE_LOCALE) { + flags &= ~FOLD_FLAGS_LOCALE; + } + if (UTF8_IS_INVARIANT(*p)) { if (flags & FOLD_FLAGS_LOCALE) { result = toFOLD_LC(*p); @@ -4061,13 +4086,11 @@ L (Case Mappings). * 0 for as-documented above * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an ASCII one, to not match - * FOLDEQ__LOCALE meaning that locale rules are to be used for code - * points below 256; unicode rules for above 255; and - * folds that cross those boundaries are disallowed, - * like the NOMIX_ASCII option - * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this - * routine. This allows that step to be skipped. - * FOLDEQ_S2_ALREADY_FOLDED Similarly. + * FOLDEQ_LOCALE is set iff the rules from the current underlying + * locale are to be used. + * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this + * routine. This allows that step to be skipped. + * FOLDEQ_S2_ALREADY_FOLDED Similarly. */ I32 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags) @@ -4100,6 +4123,10 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c * and /iaa matches are most likely to involve code points 0-255, and this * function only under rare conditions gets called for 0-255. */ + if (IN_UTF8_CTYPE_LOCALE) { + flags &= ~FOLDEQ_LOCALE; + } + if (pe1) { e1 = *(U8**)pe1; }