/* Information about the match that the perl core uses to manage
* things */
U32 extflags; /* Flags used both externally and internally */
- I32 minlen; /* mininum possible length of string to match */
- I32 minlenret; /* mininum possible length of $& */
+ I32 minlen; /* mininum possible number of chars in */
+ string to match */
+ I32 minlenret; /* mininum possible number of chars in $& */
U32 gofs; /* chars left of pos that we search from */
/* substring data about strings that must appear
=head2 C<minlen> C<minlenret>
-The minimum string length required for the pattern to match. This is used to
+The minimum string length (in characters) required for the pattern to match.
+This is used to
prune the search space by not bothering to match any closer to the end of a
string than would allow a match. For instance there is no point in even
starting the regex engine if the minlen is 10 but the string is only 5
characters long. There is no way that the pattern can match.
-C<minlenret> is the minimum length of the string that would be found
-in $& after a match.
+C<minlenret> is the minimum length (in characters) of the string that would
+be found in $& after a match.
The difference between C<minlen> and C<minlenret> can be seen in the
following pattern:
string can occur infinitely far to the right.
- minlenp
- A pointer to the minimum length of the pattern that the string
- was found inside. This is important as in the case of positive
+ A pointer to the minimum number of characters of the pattern that the
+ string was found inside. This is important as in the case of positive
lookahead or positive lookbehind we can have multiple patterns
involved. Consider
* these get optimized out
*
* If there are problematic code sequences, *min_subtract is set to the delta
- * that the minimum size of the node can be less than its actual size. And,
- * the node type of the result is changed to reflect that it contains these
- * sequences.
+ * number of characters that the minimum size of the node can be less than its
+ * actual size. And, the node type of the result is changed to reflect that it
+ * contains these sequences.
*
* And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
* and contains LATIN SMALL LETTER SHARP S
* U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
*
* This means that in case-insensitive matching (or "loose
- * matching", as Unicode calls it), an EXACTF of length six (the
- * UTF-8 encoded byte length of the above casefolded versions) can
- * match a target string of length two (the byte length of UTF-8
- * encoded U+0390 or U+03B0). This would rather mess up the
- * minimum length computation. (there are other code points that
- * also fold to these two sequences, but the delta is smaller)
+ * matching", as Unicode calls it), an EXACTF of length 3 chars can
+ * match a target string of length 1 char. This would rather mess
+ * up the minimum length computation.
*
* If these sequences are found, the minimum length is decreased by
- * four (six minus two).
+ * two.
*
* Similarly, 'ss' may match the single char and byte LATIN SMALL
* LETTER SHARP S. We decrease the min length by 1 for each
break;
}
greek_sequence:
- *min_subtract += 4;
+ *min_subtract += 2;
/* This requires special handling by trie's, so change
* the node type to indicate this. If EXACTFA and
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
dVAR;
- I32 min = 0, pars = 0, code;
+ I32 min = 0; /* There must be at least this number of characters to match */
+ I32 pars = 0, code;
regnode *scan = *scanp, *next;
I32 delta = 0;
int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
fake_study_recurse:
while ( scan && OP(scan) != END && scan < last ){
- UV min_subtract = 0; /* How much to subtract from the minimum node
- length to get a real minimum (because the
- folded version may be shorter) */
+ UV min_subtract = 0; /* How mmany chars to subtract from the minimum
+ node length to get a real minimum (because
+ the folded version may be shorter) */
bool has_exactf_sharp_s = FALSE;
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep:", data,depth);
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
}
min += l - min_subtract;
- if (min < 0) {
- min = 0;
- }
+ assert (min >= 0);
delta += min_subtract;
if (flags & SCF_DO_SUBSTR) {
data->pos_min += l - min_subtract;