From 2d6084134c9368aa08d70d491e30e209b2993fd3 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 30 Sep 2012 09:41:51 -0600 Subject: [PATCH] regcomp.c: min len is chars, not bytes The traditionally-called tricky folds occur because, under /i, a 6-byte/3-character sequence can match a 2-byte/1-character sequence. The code here has assumed that the delta quantity is measured in bytes (6-2=4), whereas everywhere else (AFAICT), assumes the measure is to be in characters (3-2=1). --- pod/perlreapi.pod | 12 +++++++----- regcomp.c | 36 ++++++++++++++++-------------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/pod/perlreapi.pod b/pod/perlreapi.pod index a9e0337..60b81ff 100644 --- a/pod/perlreapi.pod +++ b/pod/perlreapi.pod @@ -567,8 +567,9 @@ values. /* Information about the match that the perl core uses to manage * things */ U32 extflags; /* Flags used both externally and internally */ - I32 minlen; /* mininum possible length of string to match */ - I32 minlenret; /* mininum possible length of $& */ + I32 minlen; /* mininum possible number of chars in */ + string to match */ + I32 minlenret; /* mininum possible number of chars in $& */ U32 gofs; /* chars left of pos that we search from */ /* substring data about strings that must appear @@ -639,14 +640,15 @@ valid flags. =head2 C C -The minimum string length required for the pattern to match. This is used to +The minimum string length (in characters) required for the pattern to match. +This is used to prune the search space by not bothering to match any closer to the end of a string than would allow a match. For instance there is no point in even starting the regex engine if the minlen is 10 but the string is only 5 characters long. There is no way that the pattern can match. -C is the minimum length of the string that would be found -in $& after a match. +C is the minimum length (in characters) of the string that would +be found in $& after a match. The difference between C and C can be seen in the following pattern: diff --git a/regcomp.c b/regcomp.c index 330ab8e..8cef832 100644 --- a/regcomp.c +++ b/regcomp.c @@ -297,8 +297,8 @@ typedef struct RExC_state_t { string can occur infinitely far to the right. - minlenp - A pointer to the minimum length of the pattern that the string - was found inside. This is important as in the case of positive + A pointer to the minimum number of characters of the pattern that the + string was found inside. This is important as in the case of positive lookahead or positive lookbehind we can have multiple patterns involved. Consider @@ -2599,9 +2599,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * these get optimized out * * If there are problematic code sequences, *min_subtract is set to the delta - * that the minimum size of the node can be less than its actual size. And, - * the node type of the result is changed to reflect that it contains these - * sequences. + * number of characters that the minimum size of the node can be less than its + * actual size. And, the node type of the result is changed to reflect that it + * contains these sequences. * * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF * and contains LATIN SMALL LETTER SHARP S @@ -2818,15 +2818,12 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 * * This means that in case-insensitive matching (or "loose - * matching", as Unicode calls it), an EXACTF of length six (the - * UTF-8 encoded byte length of the above casefolded versions) can - * match a target string of length two (the byte length of UTF-8 - * encoded U+0390 or U+03B0). This would rather mess up the - * minimum length computation. (there are other code points that - * also fold to these two sequences, but the delta is smaller) + * matching", as Unicode calls it), an EXACTF of length 3 chars can + * match a target string of length 1 char. This would rather mess + * up the minimum length computation. * * If these sequences are found, the minimum length is decreased by - * four (six minus two). + * two. * * Similarly, 'ss' may match the single char and byte LATIN SMALL * LETTER SHARP S. We decrease the min length by 1 for each @@ -2888,7 +2885,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b break; } greek_sequence: - *min_subtract += 4; + *min_subtract += 2; /* This requires special handling by trie's, so change * the node type to indicate this. If EXACTFA and @@ -3031,7 +3028,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; - I32 min = 0, pars = 0, code; + I32 min = 0; /* There must be at least this number of characters to match */ + I32 pars = 0, code; regnode *scan = *scanp, *next; I32 delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); @@ -3058,9 +3056,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: while ( scan && OP(scan) != END && scan < last ){ - UV min_subtract = 0; /* How much to subtract from the minimum node - length to get a real minimum (because the - folded version may be shorter) */ + UV min_subtract = 0; /* How mmany chars to subtract from the minimum + node length to get a real minimum (because + the folded version may be shorter) */ bool has_exactf_sharp_s = FALSE; /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep:", data,depth); @@ -3672,9 +3670,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, RExC_seen |= REG_SEEN_EXACTF_SHARP_S; } min += l - min_subtract; - if (min < 0) { - min = 0; - } + assert (min >= 0); delta += min_subtract; if (flags & SCF_DO_SUBSTR) { data->pos_min += l - min_subtract; -- 2.7.4