From 2c63ecadcb2533b7a53efa836736d86cb484d969 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 26 Oct 2010 13:14:12 -0600 Subject: [PATCH] ANYOF regnode often overallocates space The ANYOF regnode is used for [bracketed character classes]. It is normally a struct regnode_charclass, but sometimes it is a struct regnode_charclass_class, which adds an extra 4 bytes at the end to store which of things like \d, \w, [:posix:] within the outer bracketed class are to be matched at run time, like [\d\s[:upper:]]. However these are currently only stored the if the match is to be locale based. I haven't looked in the history, but the comments seem to indicate that in the past those things were stored there all the time. But, the space is allocated even if nothing gets stored there. This patch only allocates the space if there is one of these classes occurring in a [bracketed class] and something is stored there, namely only if the regex is being compiled under 'use locale'. The previous code was not noting that these classes matched code points (in the 'stored' variable), but was relying on ANYOF_LARGE getting set which thus avoided it thinking it was a single character class that could be optimized to an EXACT node. Thus, this patch has to explicitly set 'stored' to > 1, as the function already does elsewhere for similar reasons. --- regcomp.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/regcomp.c b/regcomp.c index 831d579..0d469c1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8177,7 +8177,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) #ifdef EBCDIC UV literal_endpoint = 0; #endif - UV stored = 0; /* number of chars stored in the class */ + UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */ regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ @@ -8415,10 +8415,23 @@ parseit: if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ - if (!SIZE_ONLY && !need_class) - ANYOF_CLASS_ZERO(ret); - - need_class = 1; + /* What matches in a locale is not known until runtime, so need to + * (one time per class) allocate extra space to pass to regexec. + * The space will contain a bit for each named class that is to be + * matched against. This isn't needed for \p{} and pseudo-classes, + * as they are not affected by locale, and hence are dealt with + * separately */ + if (LOC && namedclass < ANYOF_MAX && ! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_CLASS_ADD_SKIP; + } + else { + RExC_emit += ANYOF_CLASS_ADD_SKIP; + ANYOF_CLASS_ZERO(ret); + } + ANYOF_FLAGS(ret) |= ANYOF_CLASS|ANYOF_LARGE; + } /* a bad range like a-\d, a-[:digit:] ? */ if (range) { @@ -8549,8 +8562,7 @@ parseit: /* Strings such as "+utf8::isWord\n" */ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what); } - if (LOC) - ANYOF_FLAGS(ret) |= ANYOF_CLASS; + stored+=2; /* can't optimize this class */ continue; } } /* end of namedclass \blah */ @@ -8709,13 +8721,6 @@ parseit: range = 0; /* this range (if it was one) is done now */ } - if (need_class) { - ANYOF_FLAGS(ret) |= ANYOF_LARGE; - if (SIZE_ONLY) - RExC_size += ANYOF_CLASS_ADD_SKIP; - else - RExC_emit += ANYOF_CLASS_ADD_SKIP; - } if (SIZE_ONLY) -- 2.7.4