From 88b3a463c4e11c60eea2075693434b32f43e57fe Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 12 Aug 2013 11:23:34 -0600 Subject: [PATCH] PATCH (partial) [perl #107816] Performance regression since 0abd0d78 0abd0d78 removed making tries under /di matching, the reason being that it was broken for many of the upper Latin1 characters, the ones whose matches aren't fully known until run-time. For example under /di, LATIN CAPITAL LETTER A WITH GRAVE caselessly matches LATIN SMALL LETTER A WITH GRAVE if and only if the target string is encoded in UTF-8. Under /ui matching, these always match, and so tries are constructed for them. But if a regnode doesn't contain any of the 61 problematic characters (nor the sequence 'ss' (upper- and/or lowercase), what it matches is fully known at compile time, and so should be trie-able as-is. This commit merely keeps track of if any character in the regnode is one of the 61 or the 'ss' sequence, and if not, changes its type to be /ui and hence trie-able. --- regcomp.c | 52 +++++++++++++++++++++++++++++++++++++++++++++------- t/re/pat_advanced.t | 17 +++++++++++++++++ t/re/re_tests | 2 ++ 3 files changed, 64 insertions(+), 7 deletions(-) diff --git a/regcomp.c b/regcomp.c index 95695d6..659d51f 100644 --- a/regcomp.c +++ b/regcomp.c @@ -10764,6 +10764,13 @@ tryagain: bool next_is_quantifier; char * oldp = NULL; + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. (We don't need to figure this out until + * pass 2) */ + bool maybe_exactfu = node_type == EXACTF && PASS2; + /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, * which allows the optimizer more things to look for */ @@ -11096,8 +11103,23 @@ tryagain: || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', *(s-1))))) + { + maybe_exactfu = FALSE; + } + } *(s++) = (char) ender; - maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender); } else { /* UTF */ @@ -11286,6 +11308,15 @@ tryagain: * do any better */ if (len == 0) { len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } } else { /* Here, the node does contain some characters that aren't @@ -11344,12 +11375,19 @@ tryagain: if (len == 0) { OP(ret) = NOTHING; } - else{ - - /* If 'maybe_exact' is still set here, means there are no - * code points in the node that participate in folds */ - if (FOLD && maybe_exact) { - OP(ret) = EXACT; + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } } alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); } diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index 16edf6e..012bc4a 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -2216,6 +2216,23 @@ EOP } } ok(! $failed, "Matched multi-char fold across EXACTFish node boundaries; if failed, was at count $failed"); + + # This tests that under /d matching that an 'ss' split across two + # parts of a node doesn't end up turning into something that matches + # \xDF unless it is in utf8. + $failed = 0; + $single = 'a'; # Is non-terminal multi-char fold char + for my $repeat (1 .. 300) { + my $string = $single x $repeat; + my $lhs = "$string\N{LATIN SMALL LETTER SHARP S}"; + utf8::downgrade($lhs); + $string .= "s"; + if ($lhs =~ m/${string}s/di) { + $failed = $repeat; + last; + } + } + ok(! $failed, "Matched multi-char fold 'ss' across EXACTF node boundaries; if failed, was at count $failed"); } { diff --git a/t/re/re_tests b/t/re/re_tests index b24e82d..7420510 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1591,6 +1591,8 @@ ab[c\\\](??{"x"})]{3}d ab\\](d y - - /st/i \x{DF}\x{FB05} y $& \x{FB05} /ssst/i \x{DF}\x{FB05} y $& \x{DF}\x{FB05} +/[s]s/i \x{DF} n - - +/s[s]/i \x{DF} n - - # [perl #101970] /[[:lower:]]/i \x{100} y $& \x{100} -- 2.7.4