From 56ca34cada940c7f6aae9a59da266e541530041e Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 2 Feb 2011 12:01:34 -0700 Subject: [PATCH] Move ANYOF folding from regexec to regcomp This is for security as well as performance. It allows Unicode properties to not be matched case sensitively. As a result the swash inversion hash is converted from having utf8 keys to numeric, code point, keys. It also for the first time fixes the bug where /i doesn't work for a code point not at the end of a range in a bracketed character class has a multi-character fold --- lib/unicore/mktables | 15 ++++ pod/perldelta.pod | 24 +++++ pod/perlrecharclass.pod | 25 +++++- pod/perlunicode.pod | 27 ++++++ regcomp.c | 230 ++++++++++++++++++++++++++++++++++++++++++------ regexec.c | 2 + utf8.c | 15 ++-- 7 files changed, 300 insertions(+), 38 deletions(-) diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 5e3f0c5..ab90ba3 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -12395,6 +12395,7 @@ sub make_table_pod_entries($) { my $string_count = clarify_number($count); my $status = $input_table->status; my $status_info = $input_table->status_info; + my $caseless_equivalent = $input_table->caseless_equivalent; my $entry_for_first_table; # The entry for the first table output. # Almost certainly, it is the parent. @@ -12618,6 +12619,11 @@ sub make_table_pod_entries($) { } } + if ($caseless_equivalent != 0) { + $parenthesized .= '; ' if $parenthesized ne ""; + $parenthesized .= "/i= " . $caseless_equivalent->complete_name; + } + # Warn if this property isn't the same as one that a # semi-casual user might expect. The other components of this @@ -13060,6 +13066,15 @@ Numbers in (parentheses) indicate the total number of code points matched by the property. For emphasis, those properties that match no code points at all are listed as well in a separate section following the table. +Most properties match the same code points regardless of whether C<"/i"> +case-insensitive matching is specified or not. But a few properties are +affected. These are shown with the notation + + (/i= other_property) + +in the second column. Under case-insensitive matching they match the +same code pode points as the property "other_property". + There is no description given for most non-Perl defined properties (See $unicode_reference_url for that). diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 9d6e258..3f0d2d8 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -51,6 +51,30 @@ XXX For a release on a stable branch, this section aspires to be: [ List each incompatible change as a =head2 entry ] +=head2 Most C<\p{}> properties are now immune from case-insensitive matching + +For most Unicode properties, it doesn't make sense to have them match +differently under C case-insensitive matching than not. And doing +so leads to unexpected results and potential security holes. For +example + + m/\p{ASCII_Hex_Digit}+/i + +could previously match non-ASCII characters because of the Unicode +matching rules. There were a number of bugs in this feature until an +earlier release in the 5.13 series. Now this release reverts, and +removes the feature completely except for the few properties where +people have come to expect it, namely the ones where casing is an +integral part of their functionality, such as C and +C, both of which match the exact same code points, +namely those matched by C. Details are in +L. + +User-defined property handlers that need to match differently under +C must change to read the new boolean parameter passed it which is +non-zero if case-insensitive matching is in effect; 0 if not. See +L. + =head1 Deprecations XXX Any deprecated features, syntax, modules etc. should be listed here. diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index e7f6f8a..de2256f 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -273,13 +273,32 @@ C is valid, but means something different. It matches a two character string: a letter (Unicode property C<\pL>), followed by a lowercase C. -For more details, see L; for a +Note that almost all properties are immune to case-insensitive matching. +That is, adding a C regular expression modifier does not change what +they match. There are two sets that are affected. The first set is +C, +C, +and C, +all of which match C under C matching. +And the second set is +C, +C, +and C, +all of which match C under C matching. +(The difference between these sets is that some things, such as Roman +Numerals come in both upper and lower case so they are C, but +aren't considered to be letters, so they aren't Cs.) +This set also includes its subsets C and C both +of which under C matching match C. + +For more details on Unicode properties, see L; for a complete list of possible properties, see -L. +L, +which notes all forms that have C differences. It is also possible to define your own properties. This is discussed in L. - =head4 Examples "a" =~ /\w/ # Match, "a" is a 'word' character. diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 360af1d..7f3a795 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -349,6 +349,27 @@ You can also use negation in both C<\p{}> and C<\P{}> by introducing a caret (^) between the first brace and the property name: C<\p{^Tamil}> is equal to C<\P{Tamil}>. +Almost all properties are immune to case-insensitive matching. That is, +adding a C regular expression modifier does not change what they +match. There are two sets that are affected. +The first set is +C, +C, +and C, +all of which match C under C matching. +And the second set is +C, +C, +and C, +all of which match C under C matching. +This set also includes its subsets C and C both +of which under C matching match C. +(The difference between these sets is that some things, such as Roman +Numerals come in both upper and lower case so they are C, but aren't considered to be +letters, so they aren't Cs.) +L includes a notation for all forms that have C +differences. + =head3 B Every Unicode character is assigned a general category, which is the "most @@ -773,6 +794,12 @@ C<\p> or C<\P> construct. Note that the effect is compile-time and immutable once defined. +However the subroutines are passed a single parameter which is 0 if +case-sensitive matching is in effect, and non-zero if caseless matching +is in effect. The subroutine may return different values depending on +the value of the flag, and one set of values will immutably be in effect +for all case-sensitive matches; the other set for all case-insensitive +matches. The subroutines must return a specially-formatted string, with one or more newline-separated lines. Each line must be one of the following: diff --git a/regcomp.c b/regcomp.c index 0c5ad08..8007dd3 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8872,14 +8872,14 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) ANYOF_##NAME: \ for (value = 0; value < 256; value++) \ if (TEST) \ - stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \ + stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \ yesno = '+'; \ what = WORD; \ break; \ case ANYOF_N##NAME: \ for (value = 0; value < 256; value++) \ if (!TEST) \ - stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \ + stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \ yesno = '!'; \ what = WORD; \ break @@ -8894,14 +8894,14 @@ ANYOF_##NAME: \ else if (UNI_SEMANTICS) { \ for (value = 0; value < 256; value++) { \ if (TEST_8(value)) stored += \ - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \ + S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \ } \ } \ else { \ for (value = 0; value < 128; value++) { \ if (TEST_7(UNI_TO_NATIVE(value))) stored += \ S_set_regclass_bit(aTHX_ pRExC_state, ret, \ - (U8) UNI_TO_NATIVE(value)); \ + (U8) UNI_TO_NATIVE(value), &nonbitmap); \ } \ } \ yesno = '+'; \ @@ -8912,18 +8912,18 @@ case ANYOF_N##NAME: \ else if (UNI_SEMANTICS) { \ for (value = 0; value < 256; value++) { \ if (! TEST_8(value)) stored += \ - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \ + S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \ } \ } \ else { \ for (value = 0; value < 128; value++) { \ if (! TEST_7(UNI_TO_NATIVE(value))) stored += S_set_regclass_bit( \ - aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \ + aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \ } \ if (ASCII_RESTRICTED) { \ for (value = 128; value < 256; value++) { \ stored += S_set_regclass_bit( \ - aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \ + aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \ } \ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8; \ } \ @@ -8957,7 +8957,7 @@ case ANYOF_N##NAME: \ #endif STATIC U8 -S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value) +S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr) { /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes. @@ -8991,6 +8991,10 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 don't have unicode semantics for the above ASCII Latin-1 characters, and they have a fold, they should match if the target is utf8, and not otherwise */ + if (! *nonbitmap_ptr) { + *nonbitmap_ptr = _new_invlist(2); + } + *nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value); ANYOF_FLAGS(node) |= ANYOF_UTF8; } @@ -8999,7 +9003,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 PERL_STATIC_INLINE U8 -S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value) +S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr) { /* This inline function sets a bit in the bitmap if not already set, and if * appropriate, its fold, returning the number of bits that actually @@ -9015,7 +9019,7 @@ S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 valu stored = 1; if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */ - stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value); + stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value, nonbitmap_ptr); } return stored; @@ -9043,6 +9047,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) bool need_class = 0; SV *listsv = NULL; UV n; + HV* nonbitmap = NULL; AV* unicode_alternate = NULL; #ifdef EBCDIC UV literal_endpoint = 0; @@ -9064,8 +9069,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) /* Assume we are going to generate an ANYOF node. */ ret = reganode(pRExC_state, ANYOF, 0); - if (!SIZE_ONLY) + + if (!SIZE_ONLY) { ANYOF_FLAGS(ret) = 0; + } if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ RExC_naughty++; @@ -9350,9 +9357,9 @@ parseit: if (prevvalue < 256) { stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue); + S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue, &nonbitmap); stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, '-'); + S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap); } else { ANYOF_FLAGS(ret) |= ANYOF_UTF8; @@ -9404,7 +9411,7 @@ parseit: else { for (value = 0; value < 128; value++) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value)); + S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap); } yesno = '+'; what = NULL; /* Doesn't match outside ascii, so @@ -9416,7 +9423,7 @@ parseit: else { for (value = 128; value < 256; value++) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value)); + S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap); } ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; yesno = '!'; @@ -9429,7 +9436,7 @@ parseit: /* consecutive digits assumed */ for (value = '0'; value <= '9'; value++) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); + S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); } yesno = '+'; what = POSIX_CC_UNI_NAME("Digit"); @@ -9441,10 +9448,10 @@ parseit: /* consecutive digits assumed */ for (value = 0; value < '0'; value++) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); + S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); for (value = '9' + 1; value < 256; value++) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); + S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); } yesno = '!'; what = POSIX_CC_UNI_NAME("Digit"); @@ -9494,13 +9501,17 @@ parseit: } if (!SIZE_ONLY) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, '-'); + S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap); } else range = 1; /* yeah, it's a range! */ continue; /* but do it the next time */ } } + if (value > 255) { + RExC_uni_semantics = 1; + } + /* now is the next time */ if (!SIZE_ONLY) { if (prevvalue < 256) { @@ -9517,25 +9528,32 @@ parseit: for (i = prevvalue; i <= ceilvalue; i++) if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i); + S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap); } } else { for (i = prevvalue; i <= ceilvalue; i++) if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i); + S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap); } } } else #endif for (i = prevvalue; i <= ceilvalue; i++) { - stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i); + stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap); } } - if (value > 255 || UTF) { - const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); - const UV natvalue = NATIVE_TO_UNI(value); + if (value > 255) { + const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); + const UV natvalue = NATIVE_TO_UNI(value); + if (! nonbitmap) { + nonbitmap = _new_invlist(2); + } + nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue); + ANYOF_FLAGS(ret) |= ANYOF_UTF8; + } +#if 0 /* If the code point requires utf8 to represent, and we are not * folding, it can't match unless the target is in utf8. Only @@ -9632,6 +9650,7 @@ parseit: } } } +#endif #ifdef EBCDIC literal_endpoint = 0; #endif @@ -9646,6 +9665,167 @@ parseit: return ret; /****** !SIZE_ONLY AFTER HERE *********/ + /* Finish up the non-bitmap entries */ + if (nonbitmap) { + UV* nonbitmap_array; + UV i; + + /* If folding, we add to the list all characters that could fold to or + * from the ones already on the list */ + if (FOLD) { + HV* fold_intersection; + UV* fold_list; + + /* This is a list of all the characters that participate in folds + * (except marks, etc in multi-char folds */ + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0); + PL_utf8_foldable = _swash_to_invlist(swash); + } + + /* This is a hash that for a particular fold gives all characters + * that are involved in it */ + if (! PL_utf8_foldclosures) { + + /* If the folds haven't been read in, call a fold + * function to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + to_utf8_fold((U8*) "A", dummy, &dummy_len); + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); + } + + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap); + + /* Now look at the foldable characters in this class individually */ + fold_list = invlist_array(fold_intersection); + for (i = 0; i < invlist_len(fold_intersection); i++) { + UV j; + + /* The next entry is the beginning of the range that is in the + * class */ + UV start = fold_list[i++]; + + + /* The next entry is the beginning of the next range, which + * isn't in the class, so the end of the current range is one + * less than that */ + UV end = fold_list[i] - 1; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + + /* Get its fold */ + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + const UV f = to_uni_fold(j, foldbuf, &foldlen); + + if (foldlen > (STRLEN)UNISKIP(f)) { + + /* Any multicharacter foldings (disallowed in + * lookbehind patterns) require the following + * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where + * E folds into "pq" and F folds into "rst", all other + * characters fold to single characters. We save away + * these multicharacter foldings, to be later saved as + * part of the additional "s" data. */ + if (! RExC_in_lookbehind) { + /* XXX Discard this fold if any are latin1 and LOC */ + SV *sv; + + if (!unicode_alternate) { + unicode_alternate = newAV(); + } + sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE); + av_push(unicode_alternate, sv); + + /* This node is variable length */ + OP(ret) = ANYOFV; + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + } + } + else { /* Single character fold */ + SV** listp; + + /* Consider "k" =~ /[K]/i. The line above would have + * just folded the 'k' to itself, and that isn't going + * to match 'K'. So we look through the closure of + * everything that folds to 'k'. That will find the + * 'K'. Initialize the list, if necessary */ + + /* The data structure is a hash with the keys every + * character that is folded to, like 'k', and the + * values each an array of everything that folds to its + * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_len(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c = SvUV(*c_p); + + if (c < 256 && AT_LEAST_UNI_SEMANTICS) { + stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) c, &nonbitmap); + } + /* It may be that the code point is already + * in this range or already in the bitmap, + * XXX THink about LOC + * in which case we need do nothing */ + else if ((c < start || c > end) + && (c > 255 + || ! ANYOF_BITMAP_TEST(ret, c))) + { + nonbitmap = add_range_to_invlist(nonbitmap, c, c); + } + } + } + } + } + } + invlist_destroy(fold_intersection); + } /* End of processing all the folds */ + + /* Here have the full list of items to match that aren't in the + * bitmap. Convert to the structure that the rest of the code is + * expecting. XXX That rest of the code should convert to this + * structure */ + nonbitmap_array = invlist_array(nonbitmap); + for (i = 0; i < invlist_len(nonbitmap); i++) { + + /* The next entry is the beginning of the range that is in the + * class */ + UV start = nonbitmap_array[i++]; + + /* The next entry is the beginning of the next range, which isn't + * in the class, so the end of the current range is one less than + * that */ + UV end = nonbitmap_array[i] - 1; + + if (start == end) { + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start); + } + else { + /* The \t sets the whole range */ + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", + /* XXX EBCDIC */ + start, end); + } + } + invlist_destroy(nonbitmap); + } + /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't * set the FOLD flag yet, so this this does optimize those. It doesn't * optimize locale. Doing so perhaps could be done as long as there is diff --git a/regexec.c b/regexec.c index b59b8bf..476a966 100644 --- a/regexec.c +++ b/regexec.c @@ -6813,6 +6813,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, } } } +#if 0 if (!match) { /* See if the folded version matches */ SV** listp; @@ -6881,6 +6882,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, } } } +#endif } /* If we allocated a string above, free it */ diff --git a/utf8.c b/utf8.c index 18ff1d8..6053465 100644 --- a/utf8.c +++ b/utf8.c @@ -2657,10 +2657,6 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) char* key_end = (char *) uvuni_to_utf8((U8*) key, val); STRLEN key_len = key_end - key; - /* And the value is what the forward mapping is from. */ - char utf8_inverse[UTF8_MAXBYTES+1]; - char *utf8_inverse_end = (char *) uvuni_to_utf8((U8*) utf8_inverse, inverse); - /* Get the list for the map */ if ((listp = hv_fetch(ret, key, key_len, FALSE))) { list = (AV*) *listp; @@ -2679,22 +2675,21 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); } entry = *entryp; - if (SvCUR(entry) != key_len) { - continue; - } - if (memEQ(key, SvPVX(entry), key_len)) { + if (SvUV(entry) == val) { found_key = TRUE; break; } } + + /* Make sure there is a mapping to itself on the list */ if (! found_key) { - element = newSVpvn_flags(key, key_len, SVf_UTF8); + element = newSVuv(val); av_push(list, element); } /* Simply add the value to the list */ - element = newSVpvn_flags(utf8_inverse, utf8_inverse_end - utf8_inverse, SVf_UTF8); + element = newSVuv(inverse); av_push(list, element); /* swash_get() increments the value of val for each element in the -- 2.7.4