5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 # if defined(BUGGY_MSC6)
104 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
105 # pragma optimize("a",off)
106 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
107 # pragma optimize("w",on )
108 # endif /* BUGGY_MSC6 */
112 #define STATIC static
116 typedef struct RExC_state_t {
117 U32 flags; /* RXf_* are we folding, multilining? */
118 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
119 char *precomp; /* uncompiled string. */
120 REGEXP *rx_sv; /* The SV that is the regexp. */
121 regexp *rx; /* perl core regexp structure */
122 regexp_internal *rxi; /* internal data for regexp object pprivate field */
123 char *start; /* Start of input for compile */
124 char *end; /* End of input for compile */
125 char *parse; /* Input-scan pointer. */
126 I32 whilem_seen; /* number of WHILEM in this expr */
127 regnode *emit_start; /* Start of emitted-code area */
128 regnode *emit_bound; /* First regnode outside of the allocated space */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* Capture buffer count, (OPEN). */
135 I32 cpar; /* Capture buffer count, (CLOSE). */
136 I32 nestroot; /* root parens we are in - used by accept */
139 regnode **open_parens; /* pointers to open parens */
140 regnode **close_parens; /* pointers to close parens */
141 regnode *opend; /* END node in program */
142 I32 utf8; /* whether the pattern is utf8 or not */
143 I32 orig_utf8; /* whether the pattern was originally in utf8 */
144 /* XXX use this for future optimisation of case
145 * where pattern must be upgraded to utf8. */
146 I32 uni_semantics; /* If a d charset modifier should use unicode
147 rules, even if the pattern is not in
149 HV *paren_names; /* Paren names */
151 regnode **recurse; /* Recurse regops */
152 I32 recurse_count; /* Number of recurse regops */
155 I32 override_recoding;
156 I32 in_multi_char_class;
157 struct reg_code_block *code_blocks; /* positions of literal (?{})
159 int num_code_blocks; /* size of code_blocks[] */
160 int code_index; /* next code_blocks[] slot */
162 char *starttry; /* -Dr: where regtry was called. */
163 #define RExC_starttry (pRExC_state->starttry)
165 SV *runtime_code_qr; /* qr with the runtime code blocks */
167 const char *lastparse;
169 AV *paren_name_list; /* idx -> name */
170 #define RExC_lastparse (pRExC_state->lastparse)
171 #define RExC_lastnum (pRExC_state->lastnum)
172 #define RExC_paren_name_list (pRExC_state->paren_name_list)
176 #define RExC_flags (pRExC_state->flags)
177 #define RExC_pm_flags (pRExC_state->pm_flags)
178 #define RExC_precomp (pRExC_state->precomp)
179 #define RExC_rx_sv (pRExC_state->rx_sv)
180 #define RExC_rx (pRExC_state->rx)
181 #define RExC_rxi (pRExC_state->rxi)
182 #define RExC_start (pRExC_state->start)
183 #define RExC_end (pRExC_state->end)
184 #define RExC_parse (pRExC_state->parse)
185 #define RExC_whilem_seen (pRExC_state->whilem_seen)
186 #ifdef RE_TRACK_PATTERN_OFFSETS
187 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
189 #define RExC_emit (pRExC_state->emit)
190 #define RExC_emit_start (pRExC_state->emit_start)
191 #define RExC_emit_bound (pRExC_state->emit_bound)
192 #define RExC_naughty (pRExC_state->naughty)
193 #define RExC_sawback (pRExC_state->sawback)
194 #define RExC_seen (pRExC_state->seen)
195 #define RExC_size (pRExC_state->size)
196 #define RExC_npar (pRExC_state->npar)
197 #define RExC_nestroot (pRExC_state->nestroot)
198 #define RExC_extralen (pRExC_state->extralen)
199 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
200 #define RExC_utf8 (pRExC_state->utf8)
201 #define RExC_uni_semantics (pRExC_state->uni_semantics)
202 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
203 #define RExC_open_parens (pRExC_state->open_parens)
204 #define RExC_close_parens (pRExC_state->close_parens)
205 #define RExC_opend (pRExC_state->opend)
206 #define RExC_paren_names (pRExC_state->paren_names)
207 #define RExC_recurse (pRExC_state->recurse)
208 #define RExC_recurse_count (pRExC_state->recurse_count)
209 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
210 #define RExC_contains_locale (pRExC_state->contains_locale)
211 #define RExC_override_recoding (pRExC_state->override_recoding)
212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
215 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
216 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
217 ((*s) == '{' && regcurly(s, FALSE)))
220 #undef SPSTART /* dratted cpp namespace... */
223 * Flags to be passed up and down.
225 #define WORST 0 /* Worst case. */
226 #define HASWIDTH 0x01 /* Known to match non-null strings. */
228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
229 * character. (There needs to be a case: in the switch statement in regexec.c
230 * for any node marked SIMPLE.) Note that this is not the same thing as
233 #define SPSTART 0x04 /* Starts with * or + */
234 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
235 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
236 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
238 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
240 /* whether trie related optimizations are enabled */
241 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
242 #define TRIE_STUDY_OPT
243 #define FULL_TRIE_STUDY
249 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
250 #define PBITVAL(paren) (1 << ((paren) & 7))
251 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
252 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
253 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
255 /* If not already in utf8, do a longjmp back to the beginning */
256 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
257 #define REQUIRE_UTF8 STMT_START { \
259 *flagp = RESTART_UTF8; \
264 /* This converts the named class defined in regcomp.h to its equivalent class
265 * number defined in handy.h. */
266 #define namedclass_to_classnum(class) ((int) ((class) / 2))
267 #define classnum_to_namedclass(classnum) ((classnum) * 2)
269 /* About scan_data_t.
271 During optimisation we recurse through the regexp program performing
272 various inplace (keyhole style) optimisations. In addition study_chunk
273 and scan_commit populate this data structure with information about
274 what strings MUST appear in the pattern. We look for the longest
275 string that must appear at a fixed location, and we look for the
276 longest string that may appear at a floating location. So for instance
281 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
282 strings (because they follow a .* construct). study_chunk will identify
283 both FOO and BAR as being the longest fixed and floating strings respectively.
285 The strings can be composites, for instance
289 will result in a composite fixed substring 'foo'.
291 For each string some basic information is maintained:
293 - offset or min_offset
294 This is the position the string must appear at, or not before.
295 It also implicitly (when combined with minlenp) tells us how many
296 characters must match before the string we are searching for.
297 Likewise when combined with minlenp and the length of the string it
298 tells us how many characters must appear after the string we have
302 Only used for floating strings. This is the rightmost point that
303 the string can appear at. If set to I32 max it indicates that the
304 string can occur infinitely far to the right.
307 A pointer to the minimum number of characters of the pattern that the
308 string was found inside. This is important as in the case of positive
309 lookahead or positive lookbehind we can have multiple patterns
314 The minimum length of the pattern overall is 3, the minimum length
315 of the lookahead part is 3, but the minimum length of the part that
316 will actually match is 1. So 'FOO's minimum length is 3, but the
317 minimum length for the F is 1. This is important as the minimum length
318 is used to determine offsets in front of and behind the string being
319 looked for. Since strings can be composites this is the length of the
320 pattern at the time it was committed with a scan_commit. Note that
321 the length is calculated by study_chunk, so that the minimum lengths
322 are not known until the full pattern has been compiled, thus the
323 pointer to the value.
327 In the case of lookbehind the string being searched for can be
328 offset past the start point of the final matching string.
329 If this value was just blithely removed from the min_offset it would
330 invalidate some of the calculations for how many chars must match
331 before or after (as they are derived from min_offset and minlen and
332 the length of the string being searched for).
333 When the final pattern is compiled and the data is moved from the
334 scan_data_t structure into the regexp structure the information
335 about lookbehind is factored in, with the information that would
336 have been lost precalculated in the end_shift field for the
339 The fields pos_min and pos_delta are used to store the minimum offset
340 and the delta to the maximum offset at the current point in the pattern.
344 typedef struct scan_data_t {
345 /*I32 len_min; unused */
346 /*I32 len_delta; unused */
350 I32 last_end; /* min value, <0 unless valid. */
353 SV **longest; /* Either &l_fixed, or &l_float. */
354 SV *longest_fixed; /* longest fixed string found in pattern */
355 I32 offset_fixed; /* offset where it starts */
356 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
357 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
358 SV *longest_float; /* longest floating string found in pattern */
359 I32 offset_float_min; /* earliest point in string it can appear */
360 I32 offset_float_max; /* latest point in string it can appear */
361 I32 *minlen_float; /* pointer to the minlen relevant to the string */
362 I32 lookbehind_float; /* is the position of the string modified by LB */
366 struct regnode_charclass_class *start_class;
370 * Forward declarations for pregcomp()'s friends.
373 static const scan_data_t zero_scan_data =
374 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
376 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
377 #define SF_BEFORE_SEOL 0x0001
378 #define SF_BEFORE_MEOL 0x0002
379 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
380 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
383 # define SF_FIX_SHIFT_EOL (0+2)
384 # define SF_FL_SHIFT_EOL (0+4)
386 # define SF_FIX_SHIFT_EOL (+2)
387 # define SF_FL_SHIFT_EOL (+4)
390 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
391 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
393 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
394 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
395 #define SF_IS_INF 0x0040
396 #define SF_HAS_PAR 0x0080
397 #define SF_IN_PAR 0x0100
398 #define SF_HAS_EVAL 0x0200
399 #define SCF_DO_SUBSTR 0x0400
400 #define SCF_DO_STCLASS_AND 0x0800
401 #define SCF_DO_STCLASS_OR 0x1000
402 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
403 #define SCF_WHILEM_VISITED_POS 0x2000
405 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
406 #define SCF_SEEN_ACCEPT 0x8000
408 #define UTF cBOOL(RExC_utf8)
410 /* The enums for all these are ordered so things work out correctly */
411 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
412 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
413 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
414 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
415 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
416 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
417 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
419 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
421 #define OOB_NAMEDCLASS -1
423 /* There is no code point that is out-of-bounds, so this is problematic. But
424 * its only current use is to initialize a variable that is always set before
426 #define OOB_UNICODE 0xDEADBEEF
428 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
429 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
432 /* length of regex to show in messages that don't mark a position within */
433 #define RegexLengthToShowInErrorMessages 127
436 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
437 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
438 * op/pragma/warn/regcomp.
440 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
441 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
443 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
446 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
447 * arg. Show regex, up to a maximum length. If it's too long, chop and add
450 #define _FAIL(code) STMT_START { \
451 const char *ellipses = ""; \
452 IV len = RExC_end - RExC_precomp; \
455 SAVEFREESV(RExC_rx_sv); \
456 if (len > RegexLengthToShowInErrorMessages) { \
457 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
458 len = RegexLengthToShowInErrorMessages - 10; \
464 #define FAIL(msg) _FAIL( \
465 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
466 msg, (int)len, RExC_precomp, ellipses))
468 #define FAIL2(msg,arg) _FAIL( \
469 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
470 arg, (int)len, RExC_precomp, ellipses))
473 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
475 #define Simple_vFAIL(m) STMT_START { \
476 const IV offset = RExC_parse - RExC_precomp; \
477 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
478 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
482 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
484 #define vFAIL(m) STMT_START { \
486 SAVEFREESV(RExC_rx_sv); \
491 * Like Simple_vFAIL(), but accepts two arguments.
493 #define Simple_vFAIL2(m,a1) STMT_START { \
494 const IV offset = RExC_parse - RExC_precomp; \
495 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
496 (int)offset, RExC_precomp, RExC_precomp + offset); \
500 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
502 #define vFAIL2(m,a1) STMT_START { \
504 SAVEFREESV(RExC_rx_sv); \
505 Simple_vFAIL2(m, a1); \
510 * Like Simple_vFAIL(), but accepts three arguments.
512 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
513 const IV offset = RExC_parse - RExC_precomp; \
514 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
515 (int)offset, RExC_precomp, RExC_precomp + offset); \
519 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
521 #define vFAIL3(m,a1,a2) STMT_START { \
523 SAVEFREESV(RExC_rx_sv); \
524 Simple_vFAIL3(m, a1, a2); \
528 * Like Simple_vFAIL(), but accepts four arguments.
530 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
531 const IV offset = RExC_parse - RExC_precomp; \
532 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
533 (int)offset, RExC_precomp, RExC_precomp + offset); \
536 #define vFAIL4(m,a1,a2,a3) STMT_START { \
538 SAVEFREESV(RExC_rx_sv); \
539 Simple_vFAIL4(m, a1, a2, a3); \
542 /* m is not necessarily a "literal string", in this macro */
543 #define reg_warn_non_literal_string(loc, m) STMT_START { \
544 const IV offset = loc - RExC_precomp; \
545 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
546 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
549 #define ckWARNreg(loc,m) STMT_START { \
550 const IV offset = loc - RExC_precomp; \
551 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
552 (int)offset, RExC_precomp, RExC_precomp + offset); \
555 #define vWARN_dep(loc, m) STMT_START { \
556 const IV offset = loc - RExC_precomp; \
557 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
558 (int)offset, RExC_precomp, RExC_precomp + offset); \
561 #define ckWARNdep(loc,m) STMT_START { \
562 const IV offset = loc - RExC_precomp; \
563 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
565 (int)offset, RExC_precomp, RExC_precomp + offset); \
568 #define ckWARNregdep(loc,m) STMT_START { \
569 const IV offset = loc - RExC_precomp; \
570 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
572 (int)offset, RExC_precomp, RExC_precomp + offset); \
575 #define ckWARN2regdep(loc,m, a1) STMT_START { \
576 const IV offset = loc - RExC_precomp; \
577 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
579 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
582 #define ckWARN2reg(loc, m, a1) STMT_START { \
583 const IV offset = loc - RExC_precomp; \
584 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
585 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
588 #define vWARN3(loc, m, a1, a2) STMT_START { \
589 const IV offset = loc - RExC_precomp; \
590 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
591 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
594 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
595 const IV offset = loc - RExC_precomp; \
596 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
597 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
600 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
601 const IV offset = loc - RExC_precomp; \
602 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
603 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
606 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
607 const IV offset = loc - RExC_precomp; \
608 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
609 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
612 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
613 const IV offset = loc - RExC_precomp; \
614 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
615 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
619 /* Allow for side effects in s */
620 #define REGC(c,s) STMT_START { \
621 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
624 /* Macros for recording node offsets. 20001227 mjd@plover.com
625 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
626 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
627 * Element 0 holds the number n.
628 * Position is 1 indexed.
630 #ifndef RE_TRACK_PATTERN_OFFSETS
631 #define Set_Node_Offset_To_R(node,byte)
632 #define Set_Node_Offset(node,byte)
633 #define Set_Cur_Node_Offset
634 #define Set_Node_Length_To_R(node,len)
635 #define Set_Node_Length(node,len)
636 #define Set_Node_Cur_Length(node)
637 #define Node_Offset(n)
638 #define Node_Length(n)
639 #define Set_Node_Offset_Length(node,offset,len)
640 #define ProgLen(ri) ri->u.proglen
641 #define SetProgLen(ri,x) ri->u.proglen = x
643 #define ProgLen(ri) ri->u.offsets[0]
644 #define SetProgLen(ri,x) ri->u.offsets[0] = x
645 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
647 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
648 __LINE__, (int)(node), (int)(byte))); \
650 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
652 RExC_offsets[2*(node)-1] = (byte); \
657 #define Set_Node_Offset(node,byte) \
658 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
659 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
661 #define Set_Node_Length_To_R(node,len) STMT_START { \
663 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
664 __LINE__, (int)(node), (int)(len))); \
666 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
668 RExC_offsets[2*(node)] = (len); \
673 #define Set_Node_Length(node,len) \
674 Set_Node_Length_To_R((node)-RExC_emit_start, len)
675 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
676 #define Set_Node_Cur_Length(node) \
677 Set_Node_Length(node, RExC_parse - parse_start)
679 /* Get offsets and lengths */
680 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
681 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
683 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
684 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
685 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
689 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
690 #define EXPERIMENTAL_INPLACESCAN
691 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
693 #define DEBUG_STUDYDATA(str,data,depth) \
694 DEBUG_OPTIMISE_MORE_r(if(data){ \
695 PerlIO_printf(Perl_debug_log, \
696 "%*s" str "Pos:%"IVdf"/%"IVdf \
697 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
698 (int)(depth)*2, "", \
699 (IV)((data)->pos_min), \
700 (IV)((data)->pos_delta), \
701 (UV)((data)->flags), \
702 (IV)((data)->whilem_c), \
703 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
704 is_inf ? "INF " : "" \
706 if ((data)->last_found) \
707 PerlIO_printf(Perl_debug_log, \
708 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
709 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
710 SvPVX_const((data)->last_found), \
711 (IV)((data)->last_end), \
712 (IV)((data)->last_start_min), \
713 (IV)((data)->last_start_max), \
714 ((data)->longest && \
715 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
716 SvPVX_const((data)->longest_fixed), \
717 (IV)((data)->offset_fixed), \
718 ((data)->longest && \
719 (data)->longest==&((data)->longest_float)) ? "*" : "", \
720 SvPVX_const((data)->longest_float), \
721 (IV)((data)->offset_float_min), \
722 (IV)((data)->offset_float_max) \
724 PerlIO_printf(Perl_debug_log,"\n"); \
727 /* Mark that we cannot extend a found fixed substring at this point.
728 Update the longest found anchored substring and the longest found
729 floating substrings if needed. */
732 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
734 const STRLEN l = CHR_SVLEN(data->last_found);
735 const STRLEN old_l = CHR_SVLEN(*data->longest);
736 GET_RE_DEBUG_FLAGS_DECL;
738 PERL_ARGS_ASSERT_SCAN_COMMIT;
740 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
741 SvSetMagicSV(*data->longest, data->last_found);
742 if (*data->longest == data->longest_fixed) {
743 data->offset_fixed = l ? data->last_start_min : data->pos_min;
744 if (data->flags & SF_BEFORE_EOL)
746 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
748 data->flags &= ~SF_FIX_BEFORE_EOL;
749 data->minlen_fixed=minlenp;
750 data->lookbehind_fixed=0;
752 else { /* *data->longest == data->longest_float */
753 data->offset_float_min = l ? data->last_start_min : data->pos_min;
754 data->offset_float_max = (l
755 ? data->last_start_max
756 : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
757 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
758 data->offset_float_max = I32_MAX;
759 if (data->flags & SF_BEFORE_EOL)
761 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
763 data->flags &= ~SF_FL_BEFORE_EOL;
764 data->minlen_float=minlenp;
765 data->lookbehind_float=0;
768 SvCUR_set(data->last_found, 0);
770 SV * const sv = data->last_found;
771 if (SvUTF8(sv) && SvMAGICAL(sv)) {
772 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
778 data->flags &= ~SF_BEFORE_EOL;
779 DEBUG_STUDYDATA("commit: ",data,0);
782 /* These macros set, clear and test whether the synthetic start class ('ssc',
783 * given by the parameter) matches an empty string (EOS). This uses the
784 * 'next_off' field in the node, to save a bit in the flags field. The ssc
785 * stands alone, so there is never a next_off, so this field is otherwise
786 * unused. The EOS information is used only for compilation, but theoretically
787 * it could be passed on to the execution code. This could be used to store
788 * more than one bit of information, but only this one is currently used. */
789 #define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
790 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
791 #define TEST_SSC_EOS(node) cBOOL((node)->next_off)
793 /* Can match anything (initialization) */
795 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
797 PERL_ARGS_ASSERT_CL_ANYTHING;
799 ANYOF_BITMAP_SETALL(cl);
800 cl->flags = ANYOF_UNICODE_ALL;
803 /* If any portion of the regex is to operate under locale rules,
804 * initialization includes it. The reason this isn't done for all regexes
805 * is that the optimizer was written under the assumption that locale was
806 * all-or-nothing. Given the complexity and lack of documentation in the
807 * optimizer, and that there are inadequate test cases for locale, so many
808 * parts of it may not work properly, it is safest to avoid locale unless
810 if (RExC_contains_locale) {
811 ANYOF_CLASS_SETALL(cl); /* /l uses class */
812 cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
815 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
819 /* Can match anything (initialization) */
821 S_cl_is_anything(const struct regnode_charclass_class *cl)
825 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
827 for (value = 0; value < ANYOF_MAX; value += 2)
828 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
830 if (!(cl->flags & ANYOF_UNICODE_ALL))
832 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
837 /* Can match anything (initialization) */
839 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
841 PERL_ARGS_ASSERT_CL_INIT;
843 Zero(cl, 1, struct regnode_charclass_class);
845 cl_anything(pRExC_state, cl);
846 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
849 /* These two functions currently do the exact same thing */
850 #define cl_init_zero S_cl_init
852 /* 'AND' a given class with another one. Can create false positives. 'cl'
853 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
854 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
856 S_cl_and(struct regnode_charclass_class *cl,
857 const struct regnode_charclass_class *and_with)
859 PERL_ARGS_ASSERT_CL_AND;
861 assert(PL_regkind[and_with->type] == ANYOF);
863 /* I (khw) am not sure all these restrictions are necessary XXX */
864 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
865 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
866 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
867 && !(and_with->flags & ANYOF_LOC_FOLD)
868 && !(cl->flags & ANYOF_LOC_FOLD)) {
871 if (and_with->flags & ANYOF_INVERT)
872 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
873 cl->bitmap[i] &= ~and_with->bitmap[i];
875 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
876 cl->bitmap[i] &= and_with->bitmap[i];
877 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
879 if (and_with->flags & ANYOF_INVERT) {
881 /* Here, the and'ed node is inverted. Get the AND of the flags that
882 * aren't affected by the inversion. Those that are affected are
883 * handled individually below */
884 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
885 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
886 cl->flags |= affected_flags;
888 /* We currently don't know how to deal with things that aren't in the
889 * bitmap, but we know that the intersection is no greater than what
890 * is already in cl, so let there be false positives that get sorted
891 * out after the synthetic start class succeeds, and the node is
892 * matched for real. */
894 /* The inversion of these two flags indicate that the resulting
895 * intersection doesn't have them */
896 if (and_with->flags & ANYOF_UNICODE_ALL) {
897 cl->flags &= ~ANYOF_UNICODE_ALL;
899 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
900 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
903 else { /* and'd node is not inverted */
904 U8 outside_bitmap_but_not_utf8; /* Temp variable */
906 if (! ANYOF_NONBITMAP(and_with)) {
908 /* Here 'and_with' doesn't match anything outside the bitmap
909 * (except possibly ANYOF_UNICODE_ALL), which means the
910 * intersection can't either, except for ANYOF_UNICODE_ALL, in
911 * which case we don't know what the intersection is, but it's no
912 * greater than what cl already has, so can just leave it alone,
913 * with possible false positives */
914 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
915 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
916 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
919 else if (! ANYOF_NONBITMAP(cl)) {
921 /* Here, 'and_with' does match something outside the bitmap, and cl
922 * doesn't have a list of things to match outside the bitmap. If
923 * cl can match all code points above 255, the intersection will
924 * be those above-255 code points that 'and_with' matches. If cl
925 * can't match all Unicode code points, it means that it can't
926 * match anything outside the bitmap (since the 'if' that got us
927 * into this block tested for that), so we leave the bitmap empty.
929 if (cl->flags & ANYOF_UNICODE_ALL) {
930 ARG_SET(cl, ARG(and_with));
932 /* and_with's ARG may match things that don't require UTF8.
933 * And now cl's will too, in spite of this being an 'and'. See
934 * the comments below about the kludge */
935 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
939 /* Here, both 'and_with' and cl match something outside the
940 * bitmap. Currently we do not do the intersection, so just match
941 * whatever cl had at the beginning. */
945 /* Take the intersection of the two sets of flags. However, the
946 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
947 * kludge around the fact that this flag is not treated like the others
948 * which are initialized in cl_anything(). The way the optimizer works
949 * is that the synthetic start class (SSC) is initialized to match
950 * anything, and then the first time a real node is encountered, its
951 * values are AND'd with the SSC's with the result being the values of
952 * the real node. However, there are paths through the optimizer where
953 * the AND never gets called, so those initialized bits are set
954 * inappropriately, which is not usually a big deal, as they just cause
955 * false positives in the SSC, which will just mean a probably
956 * imperceptible slow down in execution. However this bit has a
957 * higher false positive consequence in that it can cause utf8.pm,
958 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
959 * bigger slowdown and also causes significant extra memory to be used.
960 * In order to prevent this, the code now takes a different tack. The
961 * bit isn't set unless some part of the regular expression needs it,
962 * but once set it won't get cleared. This means that these extra
963 * modules won't get loaded unless there was some path through the
964 * pattern that would have required them anyway, and so any false
965 * positives that occur by not ANDing them out when they could be
966 * aren't as severe as they would be if we treated this bit like all
968 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
969 & ANYOF_NONBITMAP_NON_UTF8;
970 cl->flags &= and_with->flags;
971 cl->flags |= outside_bitmap_but_not_utf8;
975 /* 'OR' a given class with another one. Can create false positives. 'cl'
976 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
977 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
979 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
981 PERL_ARGS_ASSERT_CL_OR;
983 if (or_with->flags & ANYOF_INVERT) {
985 /* Here, the or'd node is to be inverted. This means we take the
986 * complement of everything not in the bitmap, but currently we don't
987 * know what that is, so give up and match anything */
988 if (ANYOF_NONBITMAP(or_with)) {
989 cl_anything(pRExC_state, cl);
992 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
993 * <= (B1 | !B2) | (CL1 | !CL2)
994 * which is wasteful if CL2 is small, but we ignore CL2:
995 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
996 * XXXX Can we handle case-fold? Unclear:
997 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
998 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1000 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1001 && !(or_with->flags & ANYOF_LOC_FOLD)
1002 && !(cl->flags & ANYOF_LOC_FOLD) ) {
1005 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1006 cl->bitmap[i] |= ~or_with->bitmap[i];
1007 } /* XXXX: logic is complicated otherwise */
1009 cl_anything(pRExC_state, cl);
1012 /* And, we can just take the union of the flags that aren't affected
1013 * by the inversion */
1014 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1016 /* For the remaining flags:
1017 ANYOF_UNICODE_ALL and inverted means to not match anything above
1018 255, which means that the union with cl should just be
1019 what cl has in it, so can ignore this flag
1020 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1021 is 127-255 to match them, but then invert that, so the
1022 union with cl should just be what cl has in it, so can
1025 } else { /* 'or_with' is not inverted */
1026 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1027 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1028 && (!(or_with->flags & ANYOF_LOC_FOLD)
1029 || (cl->flags & ANYOF_LOC_FOLD)) ) {
1032 /* OR char bitmap and class bitmap separately */
1033 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1034 cl->bitmap[i] |= or_with->bitmap[i];
1035 if (or_with->flags & ANYOF_CLASS) {
1036 ANYOF_CLASS_OR(or_with, cl);
1039 else { /* XXXX: logic is complicated, leave it along for a moment. */
1040 cl_anything(pRExC_state, cl);
1043 if (ANYOF_NONBITMAP(or_with)) {
1045 /* Use the added node's outside-the-bit-map match if there isn't a
1046 * conflict. If there is a conflict (both nodes match something
1047 * outside the bitmap, but what they match outside is not the same
1048 * pointer, and hence not easily compared until XXX we extend
1049 * inversion lists this far), give up and allow the start class to
1050 * match everything outside the bitmap. If that stuff is all above
1051 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1052 if (! ANYOF_NONBITMAP(cl)) {
1053 ARG_SET(cl, ARG(or_with));
1055 else if (ARG(cl) != ARG(or_with)) {
1057 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1058 cl_anything(pRExC_state, cl);
1061 cl->flags |= ANYOF_UNICODE_ALL;
1066 /* Take the union */
1067 cl->flags |= or_with->flags;
1071 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1072 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1073 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1074 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1079 dump_trie(trie,widecharmap,revcharmap)
1080 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1081 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1083 These routines dump out a trie in a somewhat readable format.
1084 The _interim_ variants are used for debugging the interim
1085 tables that are used to generate the final compressed
1086 representation which is what dump_trie expects.
1088 Part of the reason for their existence is to provide a form
1089 of documentation as to how the different representations function.
1094 Dumps the final compressed table form of the trie to Perl_debug_log.
1095 Used for debugging make_trie().
1099 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1100 AV *revcharmap, U32 depth)
1103 SV *sv=sv_newmortal();
1104 int colwidth= widecharmap ? 6 : 4;
1106 GET_RE_DEBUG_FLAGS_DECL;
1108 PERL_ARGS_ASSERT_DUMP_TRIE;
1110 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1111 (int)depth * 2 + 2,"",
1112 "Match","Base","Ofs" );
1114 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1115 SV ** const tmp = av_fetch( revcharmap, state, 0);
1117 PerlIO_printf( Perl_debug_log, "%*s",
1119 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1120 PL_colors[0], PL_colors[1],
1121 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1122 PERL_PV_ESCAPE_FIRSTCHAR
1127 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1128 (int)depth * 2 + 2,"");
1130 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1131 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1132 PerlIO_printf( Perl_debug_log, "\n");
1134 for( state = 1 ; state < trie->statecount ; state++ ) {
1135 const U32 base = trie->states[ state ].trans.base;
1137 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1139 if ( trie->states[ state ].wordnum ) {
1140 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1142 PerlIO_printf( Perl_debug_log, "%6s", "" );
1145 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1150 while( ( base + ofs < trie->uniquecharcount ) ||
1151 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1152 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1155 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1157 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1158 if ( ( base + ofs >= trie->uniquecharcount ) &&
1159 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1160 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1162 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1164 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1166 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1170 PerlIO_printf( Perl_debug_log, "]");
1173 PerlIO_printf( Perl_debug_log, "\n" );
1175 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1176 for (word=1; word <= trie->wordcount; word++) {
1177 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1178 (int)word, (int)(trie->wordinfo[word].prev),
1179 (int)(trie->wordinfo[word].len));
1181 PerlIO_printf(Perl_debug_log, "\n" );
1184 Dumps a fully constructed but uncompressed trie in list form.
1185 List tries normally only are used for construction when the number of
1186 possible chars (trie->uniquecharcount) is very high.
1187 Used for debugging make_trie().
1190 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1191 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1195 SV *sv=sv_newmortal();
1196 int colwidth= widecharmap ? 6 : 4;
1197 GET_RE_DEBUG_FLAGS_DECL;
1199 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1201 /* print out the table precompression. */
1202 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1203 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1204 "------:-----+-----------------\n" );
1206 for( state=1 ; state < next_alloc ; state ++ ) {
1209 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1210 (int)depth * 2 + 2,"", (UV)state );
1211 if ( ! trie->states[ state ].wordnum ) {
1212 PerlIO_printf( Perl_debug_log, "%5s| ","");
1214 PerlIO_printf( Perl_debug_log, "W%4x| ",
1215 trie->states[ state ].wordnum
1218 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1219 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1221 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1223 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1224 PL_colors[0], PL_colors[1],
1225 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1226 PERL_PV_ESCAPE_FIRSTCHAR
1228 TRIE_LIST_ITEM(state,charid).forid,
1229 (UV)TRIE_LIST_ITEM(state,charid).newstate
1232 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1233 (int)((depth * 2) + 14), "");
1236 PerlIO_printf( Perl_debug_log, "\n");
1241 Dumps a fully constructed but uncompressed trie in table form.
1242 This is the normal DFA style state transition table, with a few
1243 twists to facilitate compression later.
1244 Used for debugging make_trie().
1247 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1248 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1253 SV *sv=sv_newmortal();
1254 int colwidth= widecharmap ? 6 : 4;
1255 GET_RE_DEBUG_FLAGS_DECL;
1257 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1260 print out the table precompression so that we can do a visual check
1261 that they are identical.
1264 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1266 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1267 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1269 PerlIO_printf( Perl_debug_log, "%*s",
1271 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1272 PL_colors[0], PL_colors[1],
1273 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1274 PERL_PV_ESCAPE_FIRSTCHAR
1280 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1282 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1283 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1286 PerlIO_printf( Perl_debug_log, "\n" );
1288 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1290 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1291 (int)depth * 2 + 2,"",
1292 (UV)TRIE_NODENUM( state ) );
1294 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1295 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1297 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1299 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1301 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1302 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1304 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1305 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1313 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1314 startbranch: the first branch in the whole branch sequence
1315 first : start branch of sequence of branch-exact nodes.
1316 May be the same as startbranch
1317 last : Thing following the last branch.
1318 May be the same as tail.
1319 tail : item following the branch sequence
1320 count : words in the sequence
1321 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1322 depth : indent depth
1324 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1326 A trie is an N'ary tree where the branches are determined by digital
1327 decomposition of the key. IE, at the root node you look up the 1st character and
1328 follow that branch repeat until you find the end of the branches. Nodes can be
1329 marked as "accepting" meaning they represent a complete word. Eg:
1333 would convert into the following structure. Numbers represent states, letters
1334 following numbers represent valid transitions on the letter from that state, if
1335 the number is in square brackets it represents an accepting state, otherwise it
1336 will be in parenthesis.
1338 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1342 (1) +-i->(6)-+-s->[7]
1344 +-s->(3)-+-h->(4)-+-e->[5]
1346 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1348 This shows that when matching against the string 'hers' we will begin at state 1
1349 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1350 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1351 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1352 single traverse. We store a mapping from accepting to state to which word was
1353 matched, and then when we have multiple possibilities we try to complete the
1354 rest of the regex in the order in which they occured in the alternation.
1356 The only prior NFA like behaviour that would be changed by the TRIE support is
1357 the silent ignoring of duplicate alternations which are of the form:
1359 / (DUPE|DUPE) X? (?{ ... }) Y /x
1361 Thus EVAL blocks following a trie may be called a different number of times with
1362 and without the optimisation. With the optimisations dupes will be silently
1363 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1364 the following demonstrates:
1366 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1368 which prints out 'word' three times, but
1370 'words'=~/(word|word|word)(?{ print $1 })S/
1372 which doesnt print it out at all. This is due to other optimisations kicking in.
1374 Example of what happens on a structural level:
1376 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1378 1: CURLYM[1] {1,32767}(18)
1389 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1390 and should turn into:
1392 1: CURLYM[1] {1,32767}(18)
1394 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1402 Cases where tail != last would be like /(?foo|bar)baz/:
1412 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1413 and would end up looking like:
1416 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1423 d = uvuni_to_utf8_flags(d, uv, 0);
1425 is the recommended Unicode-aware way of saying
1430 #define TRIE_STORE_REVCHAR(val) \
1433 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1434 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1435 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1436 SvCUR_set(zlopp, kapow - flrbbbbb); \
1439 av_push(revcharmap, zlopp); \
1441 char ooooff = (char)val; \
1442 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1446 #define TRIE_READ_CHAR STMT_START { \
1449 /* if it is UTF then it is either already folded, or does not need folding */ \
1450 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1452 else if (folder == PL_fold_latin1) { \
1453 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1454 if ( foldlen > 0 ) { \
1455 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1461 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1462 skiplen = UNISKIP(uvc); \
1463 foldlen -= skiplen; \
1464 scan = foldbuf + skiplen; \
1467 /* raw data, will be folded later if needed */ \
1475 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1476 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1477 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1478 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1480 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1481 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1482 TRIE_LIST_CUR( state )++; \
1485 #define TRIE_LIST_NEW(state) STMT_START { \
1486 Newxz( trie->states[ state ].trans.list, \
1487 4, reg_trie_trans_le ); \
1488 TRIE_LIST_CUR( state ) = 1; \
1489 TRIE_LIST_LEN( state ) = 4; \
1492 #define TRIE_HANDLE_WORD(state) STMT_START { \
1493 U16 dupe= trie->states[ state ].wordnum; \
1494 regnode * const noper_next = regnext( noper ); \
1497 /* store the word for dumping */ \
1499 if (OP(noper) != NOTHING) \
1500 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1502 tmp = newSVpvn_utf8( "", 0, UTF ); \
1503 av_push( trie_words, tmp ); \
1507 trie->wordinfo[curword].prev = 0; \
1508 trie->wordinfo[curword].len = wordlen; \
1509 trie->wordinfo[curword].accept = state; \
1511 if ( noper_next < tail ) { \
1513 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1514 trie->jump[curword] = (U16)(noper_next - convert); \
1516 jumper = noper_next; \
1518 nextbranch= regnext(cur); \
1522 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1523 /* chain, so that when the bits of chain are later */\
1524 /* linked together, the dups appear in the chain */\
1525 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1526 trie->wordinfo[dupe].prev = curword; \
1528 /* we haven't inserted this word yet. */ \
1529 trie->states[ state ].wordnum = curword; \
1534 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1535 ( ( base + charid >= ucharcount \
1536 && base + charid < ubound \
1537 && state == trie->trans[ base - ucharcount + charid ].check \
1538 && trie->trans[ base - ucharcount + charid ].next ) \
1539 ? trie->trans[ base - ucharcount + charid ].next \
1540 : ( state==1 ? special : 0 ) \
1544 #define MADE_JUMP_TRIE 2
1545 #define MADE_EXACT_TRIE 4
1548 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1551 /* first pass, loop through and scan words */
1552 reg_trie_data *trie;
1553 HV *widecharmap = NULL;
1554 AV *revcharmap = newAV();
1556 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1561 regnode *jumper = NULL;
1562 regnode *nextbranch = NULL;
1563 regnode *convert = NULL;
1564 U32 *prev_states; /* temp array mapping each state to previous one */
1565 /* we just use folder as a flag in utf8 */
1566 const U8 * folder = NULL;
1569 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1570 AV *trie_words = NULL;
1571 /* along with revcharmap, this only used during construction but both are
1572 * useful during debugging so we store them in the struct when debugging.
1575 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1576 STRLEN trie_charcount=0;
1578 SV *re_trie_maxbuff;
1579 GET_RE_DEBUG_FLAGS_DECL;
1581 PERL_ARGS_ASSERT_MAKE_TRIE;
1583 PERL_UNUSED_ARG(depth);
1590 case EXACTFU_TRICKYFOLD:
1591 case EXACTFU: folder = PL_fold_latin1; break;
1592 case EXACTF: folder = PL_fold; break;
1593 case EXACTFL: folder = PL_fold_locale; break;
1594 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1597 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1599 trie->startstate = 1;
1600 trie->wordcount = word_count;
1601 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1602 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1604 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1605 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1606 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1609 trie_words = newAV();
1612 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1613 if (!SvIOK(re_trie_maxbuff)) {
1614 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1616 DEBUG_TRIE_COMPILE_r({
1617 PerlIO_printf( Perl_debug_log,
1618 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1619 (int)depth * 2 + 2, "",
1620 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1621 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1625 /* Find the node we are going to overwrite */
1626 if ( first == startbranch && OP( last ) != BRANCH ) {
1627 /* whole branch chain */
1630 /* branch sub-chain */
1631 convert = NEXTOPER( first );
1634 /* -- First loop and Setup --
1636 We first traverse the branches and scan each word to determine if it
1637 contains widechars, and how many unique chars there are, this is
1638 important as we have to build a table with at least as many columns as we
1641 We use an array of integers to represent the character codes 0..255
1642 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1643 native representation of the character value as the key and IV's for the
1646 *TODO* If we keep track of how many times each character is used we can
1647 remap the columns so that the table compression later on is more
1648 efficient in terms of memory by ensuring the most common value is in the
1649 middle and the least common are on the outside. IMO this would be better
1650 than a most to least common mapping as theres a decent chance the most
1651 common letter will share a node with the least common, meaning the node
1652 will not be compressible. With a middle is most common approach the worst
1653 case is when we have the least common nodes twice.
1657 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1658 regnode *noper = NEXTOPER( cur );
1659 const U8 *uc = (U8*)STRING( noper );
1660 const U8 *e = uc + STR_LEN( noper );
1662 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1664 const U8 *scan = (U8*)NULL;
1665 U32 wordlen = 0; /* required init */
1667 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1669 if (OP(noper) == NOTHING) {
1670 regnode *noper_next= regnext(noper);
1671 if (noper_next != tail && OP(noper_next) == flags) {
1673 uc= (U8*)STRING(noper);
1674 e= uc + STR_LEN(noper);
1675 trie->minlen= STR_LEN(noper);
1682 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1683 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1684 regardless of encoding */
1685 if (OP( noper ) == EXACTFU_SS) {
1686 /* false positives are ok, so just set this */
1687 TRIE_BITMAP_SET(trie,0xDF);
1690 for ( ; uc < e ; uc += len ) {
1691 TRIE_CHARCOUNT(trie)++;
1696 U8 folded= folder[ (U8) uvc ];
1697 if ( !trie->charmap[ folded ] ) {
1698 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1699 TRIE_STORE_REVCHAR( folded );
1702 if ( !trie->charmap[ uvc ] ) {
1703 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1704 TRIE_STORE_REVCHAR( uvc );
1707 /* store the codepoint in the bitmap, and its folded
1709 TRIE_BITMAP_SET(trie, uvc);
1711 /* store the folded codepoint */
1712 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1715 /* store first byte of utf8 representation of
1716 variant codepoints */
1717 if (! UNI_IS_INVARIANT(uvc)) {
1718 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1721 set_bit = 0; /* We've done our bit :-) */
1726 widecharmap = newHV();
1728 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1731 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1733 if ( !SvTRUE( *svpp ) ) {
1734 sv_setiv( *svpp, ++trie->uniquecharcount );
1735 TRIE_STORE_REVCHAR(uvc);
1739 if( cur == first ) {
1740 trie->minlen = chars;
1741 trie->maxlen = chars;
1742 } else if (chars < trie->minlen) {
1743 trie->minlen = chars;
1744 } else if (chars > trie->maxlen) {
1745 trie->maxlen = chars;
1747 if (OP( noper ) == EXACTFU_SS) {
1748 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1749 if (trie->minlen > 1)
1752 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1753 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1754 * - We assume that any such sequence might match a 2 byte string */
1755 if (trie->minlen > 2 )
1759 } /* end first pass */
1760 DEBUG_TRIE_COMPILE_r(
1761 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1762 (int)depth * 2 + 2,"",
1763 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1764 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1765 (int)trie->minlen, (int)trie->maxlen )
1769 We now know what we are dealing with in terms of unique chars and
1770 string sizes so we can calculate how much memory a naive
1771 representation using a flat table will take. If it's over a reasonable
1772 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1773 conservative but potentially much slower representation using an array
1776 At the end we convert both representations into the same compressed
1777 form that will be used in regexec.c for matching with. The latter
1778 is a form that cannot be used to construct with but has memory
1779 properties similar to the list form and access properties similar
1780 to the table form making it both suitable for fast searches and
1781 small enough that its feasable to store for the duration of a program.
1783 See the comment in the code where the compressed table is produced
1784 inplace from the flat tabe representation for an explanation of how
1785 the compression works.
1790 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1793 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1795 Second Pass -- Array Of Lists Representation
1797 Each state will be represented by a list of charid:state records
1798 (reg_trie_trans_le) the first such element holds the CUR and LEN
1799 points of the allocated array. (See defines above).
1801 We build the initial structure using the lists, and then convert
1802 it into the compressed table form which allows faster lookups
1803 (but cant be modified once converted).
1806 STRLEN transcount = 1;
1808 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1809 "%*sCompiling trie using list compiler\n",
1810 (int)depth * 2 + 2, ""));
1812 trie->states = (reg_trie_state *)
1813 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1814 sizeof(reg_trie_state) );
1818 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1820 regnode *noper = NEXTOPER( cur );
1821 U8 *uc = (U8*)STRING( noper );
1822 const U8 *e = uc + STR_LEN( noper );
1823 U32 state = 1; /* required init */
1824 U16 charid = 0; /* sanity init */
1825 U8 *scan = (U8*)NULL; /* sanity init */
1826 STRLEN foldlen = 0; /* required init */
1827 U32 wordlen = 0; /* required init */
1828 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1831 if (OP(noper) == NOTHING) {
1832 regnode *noper_next= regnext(noper);
1833 if (noper_next != tail && OP(noper_next) == flags) {
1835 uc= (U8*)STRING(noper);
1836 e= uc + STR_LEN(noper);
1840 if (OP(noper) != NOTHING) {
1841 for ( ; uc < e ; uc += len ) {
1846 charid = trie->charmap[ uvc ];
1848 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1852 charid=(U16)SvIV( *svpp );
1855 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1862 if ( !trie->states[ state ].trans.list ) {
1863 TRIE_LIST_NEW( state );
1865 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1866 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1867 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1872 newstate = next_alloc++;
1873 prev_states[newstate] = state;
1874 TRIE_LIST_PUSH( state, charid, newstate );
1879 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1883 TRIE_HANDLE_WORD(state);
1885 } /* end second pass */
1887 /* next alloc is the NEXT state to be allocated */
1888 trie->statecount = next_alloc;
1889 trie->states = (reg_trie_state *)
1890 PerlMemShared_realloc( trie->states,
1892 * sizeof(reg_trie_state) );
1894 /* and now dump it out before we compress it */
1895 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1896 revcharmap, next_alloc,
1900 trie->trans = (reg_trie_trans *)
1901 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1908 for( state=1 ; state < next_alloc ; state ++ ) {
1912 DEBUG_TRIE_COMPILE_MORE_r(
1913 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1917 if (trie->states[state].trans.list) {
1918 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1922 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1923 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1924 if ( forid < minid ) {
1926 } else if ( forid > maxid ) {
1930 if ( transcount < tp + maxid - minid + 1) {
1932 trie->trans = (reg_trie_trans *)
1933 PerlMemShared_realloc( trie->trans,
1935 * sizeof(reg_trie_trans) );
1936 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1938 base = trie->uniquecharcount + tp - minid;
1939 if ( maxid == minid ) {
1941 for ( ; zp < tp ; zp++ ) {
1942 if ( ! trie->trans[ zp ].next ) {
1943 base = trie->uniquecharcount + zp - minid;
1944 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1945 trie->trans[ zp ].check = state;
1951 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1952 trie->trans[ tp ].check = state;
1957 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1958 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1959 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1960 trie->trans[ tid ].check = state;
1962 tp += ( maxid - minid + 1 );
1964 Safefree(trie->states[ state ].trans.list);
1967 DEBUG_TRIE_COMPILE_MORE_r(
1968 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1971 trie->states[ state ].trans.base=base;
1973 trie->lasttrans = tp + 1;
1977 Second Pass -- Flat Table Representation.
1979 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1980 We know that we will need Charcount+1 trans at most to store the data
1981 (one row per char at worst case) So we preallocate both structures
1982 assuming worst case.
1984 We then construct the trie using only the .next slots of the entry
1987 We use the .check field of the first entry of the node temporarily to
1988 make compression both faster and easier by keeping track of how many non
1989 zero fields are in the node.
1991 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1994 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1995 number representing the first entry of the node, and state as a
1996 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1997 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1998 are 2 entrys per node. eg:
2006 The table is internally in the right hand, idx form. However as we also
2007 have to deal with the states array which is indexed by nodenum we have to
2008 use TRIE_NODENUM() to convert.
2011 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2012 "%*sCompiling trie using table compiler\n",
2013 (int)depth * 2 + 2, ""));
2015 trie->trans = (reg_trie_trans *)
2016 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2017 * trie->uniquecharcount + 1,
2018 sizeof(reg_trie_trans) );
2019 trie->states = (reg_trie_state *)
2020 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2021 sizeof(reg_trie_state) );
2022 next_alloc = trie->uniquecharcount + 1;
2025 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2027 regnode *noper = NEXTOPER( cur );
2028 const U8 *uc = (U8*)STRING( noper );
2029 const U8 *e = uc + STR_LEN( noper );
2031 U32 state = 1; /* required init */
2033 U16 charid = 0; /* sanity init */
2034 U32 accept_state = 0; /* sanity init */
2035 U8 *scan = (U8*)NULL; /* sanity init */
2037 STRLEN foldlen = 0; /* required init */
2038 U32 wordlen = 0; /* required init */
2040 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2042 if (OP(noper) == NOTHING) {
2043 regnode *noper_next= regnext(noper);
2044 if (noper_next != tail && OP(noper_next) == flags) {
2046 uc= (U8*)STRING(noper);
2047 e= uc + STR_LEN(noper);
2051 if ( OP(noper) != NOTHING ) {
2052 for ( ; uc < e ; uc += len ) {
2057 charid = trie->charmap[ uvc ];
2059 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2060 charid = svpp ? (U16)SvIV(*svpp) : 0;
2064 if ( !trie->trans[ state + charid ].next ) {
2065 trie->trans[ state + charid ].next = next_alloc;
2066 trie->trans[ state ].check++;
2067 prev_states[TRIE_NODENUM(next_alloc)]
2068 = TRIE_NODENUM(state);
2069 next_alloc += trie->uniquecharcount;
2071 state = trie->trans[ state + charid ].next;
2073 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2075 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2078 accept_state = TRIE_NODENUM( state );
2079 TRIE_HANDLE_WORD(accept_state);
2081 } /* end second pass */
2083 /* and now dump it out before we compress it */
2084 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2086 next_alloc, depth+1));
2090 * Inplace compress the table.*
2092 For sparse data sets the table constructed by the trie algorithm will
2093 be mostly 0/FAIL transitions or to put it another way mostly empty.
2094 (Note that leaf nodes will not contain any transitions.)
2096 This algorithm compresses the tables by eliminating most such
2097 transitions, at the cost of a modest bit of extra work during lookup:
2099 - Each states[] entry contains a .base field which indicates the
2100 index in the state[] array wheres its transition data is stored.
2102 - If .base is 0 there are no valid transitions from that node.
2104 - If .base is nonzero then charid is added to it to find an entry in
2107 -If trans[states[state].base+charid].check!=state then the
2108 transition is taken to be a 0/Fail transition. Thus if there are fail
2109 transitions at the front of the node then the .base offset will point
2110 somewhere inside the previous nodes data (or maybe even into a node
2111 even earlier), but the .check field determines if the transition is
2115 The following process inplace converts the table to the compressed
2116 table: We first do not compress the root node 1,and mark all its
2117 .check pointers as 1 and set its .base pointer as 1 as well. This
2118 allows us to do a DFA construction from the compressed table later,
2119 and ensures that any .base pointers we calculate later are greater
2122 - We set 'pos' to indicate the first entry of the second node.
2124 - We then iterate over the columns of the node, finding the first and
2125 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2126 and set the .check pointers accordingly, and advance pos
2127 appropriately and repreat for the next node. Note that when we copy
2128 the next pointers we have to convert them from the original
2129 NODEIDX form to NODENUM form as the former is not valid post
2132 - If a node has no transitions used we mark its base as 0 and do not
2133 advance the pos pointer.
2135 - If a node only has one transition we use a second pointer into the
2136 structure to fill in allocated fail transitions from other states.
2137 This pointer is independent of the main pointer and scans forward
2138 looking for null transitions that are allocated to a state. When it
2139 finds one it writes the single transition into the "hole". If the
2140 pointer doesnt find one the single transition is appended as normal.
2142 - Once compressed we can Renew/realloc the structures to release the
2145 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2146 specifically Fig 3.47 and the associated pseudocode.
2150 const U32 laststate = TRIE_NODENUM( next_alloc );
2153 trie->statecount = laststate;
2155 for ( state = 1 ; state < laststate ; state++ ) {
2157 const U32 stateidx = TRIE_NODEIDX( state );
2158 const U32 o_used = trie->trans[ stateidx ].check;
2159 U32 used = trie->trans[ stateidx ].check;
2160 trie->trans[ stateidx ].check = 0;
2162 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2163 if ( flag || trie->trans[ stateidx + charid ].next ) {
2164 if ( trie->trans[ stateidx + charid ].next ) {
2166 for ( ; zp < pos ; zp++ ) {
2167 if ( ! trie->trans[ zp ].next ) {
2171 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2172 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2173 trie->trans[ zp ].check = state;
2174 if ( ++zp > pos ) pos = zp;
2181 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2183 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2184 trie->trans[ pos ].check = state;
2189 trie->lasttrans = pos + 1;
2190 trie->states = (reg_trie_state *)
2191 PerlMemShared_realloc( trie->states, laststate
2192 * sizeof(reg_trie_state) );
2193 DEBUG_TRIE_COMPILE_MORE_r(
2194 PerlIO_printf( Perl_debug_log,
2195 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2196 (int)depth * 2 + 2,"",
2197 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2200 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2203 } /* end table compress */
2205 DEBUG_TRIE_COMPILE_MORE_r(
2206 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2207 (int)depth * 2 + 2, "",
2208 (UV)trie->statecount,
2209 (UV)trie->lasttrans)
2211 /* resize the trans array to remove unused space */
2212 trie->trans = (reg_trie_trans *)
2213 PerlMemShared_realloc( trie->trans, trie->lasttrans
2214 * sizeof(reg_trie_trans) );
2216 { /* Modify the program and insert the new TRIE node */
2217 U8 nodetype =(U8)(flags & 0xFF);
2221 regnode *optimize = NULL;
2222 #ifdef RE_TRACK_PATTERN_OFFSETS
2225 U32 mjd_nodelen = 0;
2226 #endif /* RE_TRACK_PATTERN_OFFSETS */
2227 #endif /* DEBUGGING */
2229 This means we convert either the first branch or the first Exact,
2230 depending on whether the thing following (in 'last') is a branch
2231 or not and whther first is the startbranch (ie is it a sub part of
2232 the alternation or is it the whole thing.)
2233 Assuming its a sub part we convert the EXACT otherwise we convert
2234 the whole branch sequence, including the first.
2236 /* Find the node we are going to overwrite */
2237 if ( first != startbranch || OP( last ) == BRANCH ) {
2238 /* branch sub-chain */
2239 NEXT_OFF( first ) = (U16)(last - first);
2240 #ifdef RE_TRACK_PATTERN_OFFSETS
2242 mjd_offset= Node_Offset((convert));
2243 mjd_nodelen= Node_Length((convert));
2246 /* whole branch chain */
2248 #ifdef RE_TRACK_PATTERN_OFFSETS
2251 const regnode *nop = NEXTOPER( convert );
2252 mjd_offset= Node_Offset((nop));
2253 mjd_nodelen= Node_Length((nop));
2257 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2258 (int)depth * 2 + 2, "",
2259 (UV)mjd_offset, (UV)mjd_nodelen)
2262 /* But first we check to see if there is a common prefix we can
2263 split out as an EXACT and put in front of the TRIE node. */
2264 trie->startstate= 1;
2265 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2267 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2271 const U32 base = trie->states[ state ].trans.base;
2273 if ( trie->states[state].wordnum )
2276 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2277 if ( ( base + ofs >= trie->uniquecharcount ) &&
2278 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2279 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2281 if ( ++count > 1 ) {
2282 SV **tmp = av_fetch( revcharmap, ofs, 0);
2283 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2284 if ( state == 1 ) break;
2286 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2288 PerlIO_printf(Perl_debug_log,
2289 "%*sNew Start State=%"UVuf" Class: [",
2290 (int)depth * 2 + 2, "",
2293 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2294 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2296 TRIE_BITMAP_SET(trie,*ch);
2298 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2300 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2304 TRIE_BITMAP_SET(trie,*ch);
2306 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2307 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2313 SV **tmp = av_fetch( revcharmap, idx, 0);
2315 char *ch = SvPV( *tmp, len );
2317 SV *sv=sv_newmortal();
2318 PerlIO_printf( Perl_debug_log,
2319 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2320 (int)depth * 2 + 2, "",
2322 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2323 PL_colors[0], PL_colors[1],
2324 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2325 PERL_PV_ESCAPE_FIRSTCHAR
2330 OP( convert ) = nodetype;
2331 str=STRING(convert);
2334 STR_LEN(convert) += len;
2340 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2345 trie->prefixlen = (state-1);
2347 regnode *n = convert+NODE_SZ_STR(convert);
2348 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2349 trie->startstate = state;
2350 trie->minlen -= (state - 1);
2351 trie->maxlen -= (state - 1);
2353 /* At least the UNICOS C compiler choked on this
2354 * being argument to DEBUG_r(), so let's just have
2357 #ifdef PERL_EXT_RE_BUILD
2363 regnode *fix = convert;
2364 U32 word = trie->wordcount;
2366 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2367 while( ++fix < n ) {
2368 Set_Node_Offset_Length(fix, 0, 0);
2371 SV ** const tmp = av_fetch( trie_words, word, 0 );
2373 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2374 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2376 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2384 NEXT_OFF(convert) = (U16)(tail - convert);
2385 DEBUG_r(optimize= n);
2391 if ( trie->maxlen ) {
2392 NEXT_OFF( convert ) = (U16)(tail - convert);
2393 ARG_SET( convert, data_slot );
2394 /* Store the offset to the first unabsorbed branch in
2395 jump[0], which is otherwise unused by the jump logic.
2396 We use this when dumping a trie and during optimisation. */
2398 trie->jump[0] = (U16)(nextbranch - convert);
2400 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2401 * and there is a bitmap
2402 * and the first "jump target" node we found leaves enough room
2403 * then convert the TRIE node into a TRIEC node, with the bitmap
2404 * embedded inline in the opcode - this is hypothetically faster.
2406 if ( !trie->states[trie->startstate].wordnum
2408 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2410 OP( convert ) = TRIEC;
2411 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2412 PerlMemShared_free(trie->bitmap);
2415 OP( convert ) = TRIE;
2417 /* store the type in the flags */
2418 convert->flags = nodetype;
2422 + regarglen[ OP( convert ) ];
2424 /* XXX We really should free up the resource in trie now,
2425 as we won't use them - (which resources?) dmq */
2427 /* needed for dumping*/
2428 DEBUG_r(if (optimize) {
2429 regnode *opt = convert;
2431 while ( ++opt < optimize) {
2432 Set_Node_Offset_Length(opt,0,0);
2435 Try to clean up some of the debris left after the
2438 while( optimize < jumper ) {
2439 mjd_nodelen += Node_Length((optimize));
2440 OP( optimize ) = OPTIMIZED;
2441 Set_Node_Offset_Length(optimize,0,0);
2444 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2446 } /* end node insert */
2448 /* Finish populating the prev field of the wordinfo array. Walk back
2449 * from each accept state until we find another accept state, and if
2450 * so, point the first word's .prev field at the second word. If the
2451 * second already has a .prev field set, stop now. This will be the
2452 * case either if we've already processed that word's accept state,
2453 * or that state had multiple words, and the overspill words were
2454 * already linked up earlier.
2461 for (word=1; word <= trie->wordcount; word++) {
2463 if (trie->wordinfo[word].prev)
2465 state = trie->wordinfo[word].accept;
2467 state = prev_states[state];
2470 prev = trie->states[state].wordnum;
2474 trie->wordinfo[word].prev = prev;
2476 Safefree(prev_states);
2480 /* and now dump out the compressed format */
2481 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2483 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2485 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2486 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2488 SvREFCNT_dec_NN(revcharmap);
2492 : trie->startstate>1
2498 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2500 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2502 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2503 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2506 We find the fail state for each state in the trie, this state is the longest proper
2507 suffix of the current state's 'word' that is also a proper prefix of another word in our
2508 trie. State 1 represents the word '' and is thus the default fail state. This allows
2509 the DFA not to have to restart after its tried and failed a word at a given point, it
2510 simply continues as though it had been matching the other word in the first place.
2512 'abcdgu'=~/abcdefg|cdgu/
2513 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2514 fail, which would bring us to the state representing 'd' in the second word where we would
2515 try 'g' and succeed, proceeding to match 'cdgu'.
2517 /* add a fail transition */
2518 const U32 trie_offset = ARG(source);
2519 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2521 const U32 ucharcount = trie->uniquecharcount;
2522 const U32 numstates = trie->statecount;
2523 const U32 ubound = trie->lasttrans + ucharcount;
2527 U32 base = trie->states[ 1 ].trans.base;
2530 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2531 GET_RE_DEBUG_FLAGS_DECL;
2533 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2535 PERL_UNUSED_ARG(depth);
2539 ARG_SET( stclass, data_slot );
2540 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2541 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2542 aho->trie=trie_offset;
2543 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2544 Copy( trie->states, aho->states, numstates, reg_trie_state );
2545 Newxz( q, numstates, U32);
2546 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2549 /* initialize fail[0..1] to be 1 so that we always have
2550 a valid final fail state */
2551 fail[ 0 ] = fail[ 1 ] = 1;
2553 for ( charid = 0; charid < ucharcount ; charid++ ) {
2554 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2556 q[ q_write ] = newstate;
2557 /* set to point at the root */
2558 fail[ q[ q_write++ ] ]=1;
2561 while ( q_read < q_write) {
2562 const U32 cur = q[ q_read++ % numstates ];
2563 base = trie->states[ cur ].trans.base;
2565 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2566 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2568 U32 fail_state = cur;
2571 fail_state = fail[ fail_state ];
2572 fail_base = aho->states[ fail_state ].trans.base;
2573 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2575 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2576 fail[ ch_state ] = fail_state;
2577 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2579 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2581 q[ q_write++ % numstates] = ch_state;
2585 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2586 when we fail in state 1, this allows us to use the
2587 charclass scan to find a valid start char. This is based on the principle
2588 that theres a good chance the string being searched contains lots of stuff
2589 that cant be a start char.
2591 fail[ 0 ] = fail[ 1 ] = 0;
2592 DEBUG_TRIE_COMPILE_r({
2593 PerlIO_printf(Perl_debug_log,
2594 "%*sStclass Failtable (%"UVuf" states): 0",
2595 (int)(depth * 2), "", (UV)numstates
2597 for( q_read=1; q_read<numstates; q_read++ ) {
2598 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2600 PerlIO_printf(Perl_debug_log, "\n");
2603 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2608 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2609 * These need to be revisited when a newer toolchain becomes available.
2611 #if defined(__sparc64__) && defined(__GNUC__)
2612 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2613 # undef SPARC64_GCC_WORKAROUND
2614 # define SPARC64_GCC_WORKAROUND 1
2618 #define DEBUG_PEEP(str,scan,depth) \
2619 DEBUG_OPTIMISE_r({if (scan){ \
2620 SV * const mysv=sv_newmortal(); \
2621 regnode *Next = regnext(scan); \
2622 regprop(RExC_rx, mysv, scan); \
2623 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2624 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2625 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2629 /* The below joins as many adjacent EXACTish nodes as possible into a single
2630 * one. The regop may be changed if the node(s) contain certain sequences that
2631 * require special handling. The joining is only done if:
2632 * 1) there is room in the current conglomerated node to entirely contain the
2634 * 2) they are the exact same node type
2636 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2637 * these get optimized out
2639 * If a node is to match under /i (folded), the number of characters it matches
2640 * can be different than its character length if it contains a multi-character
2641 * fold. *min_subtract is set to the total delta of the input nodes.
2643 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2644 * and contains LATIN SMALL LETTER SHARP S
2646 * This is as good a place as any to discuss the design of handling these
2647 * multi-character fold sequences. It's been wrong in Perl for a very long
2648 * time. There are three code points in Unicode whose multi-character folds
2649 * were long ago discovered to mess things up. The previous designs for
2650 * dealing with these involved assigning a special node for them. This
2651 * approach doesn't work, as evidenced by this example:
2652 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2653 * Both these fold to "sss", but if the pattern is parsed to create a node that
2654 * would match just the \xDF, it won't be able to handle the case where a
2655 * successful match would have to cross the node's boundary. The new approach
2656 * that hopefully generally solves the problem generates an EXACTFU_SS node
2659 * It turns out that there are problems with all multi-character folds, and not
2660 * just these three. Now the code is general, for all such cases, but the
2661 * three still have some special handling. The approach taken is:
2662 * 1) This routine examines each EXACTFish node that could contain multi-
2663 * character fold sequences. It returns in *min_subtract how much to
2664 * subtract from the the actual length of the string to get a real minimum
2665 * match length; it is 0 if there are no multi-char folds. This delta is
2666 * used by the caller to adjust the min length of the match, and the delta
2667 * between min and max, so that the optimizer doesn't reject these
2668 * possibilities based on size constraints.
2669 * 2) Certain of these sequences require special handling by the trie code,
2670 * so, if found, this code changes the joined node type to special ops:
2671 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2672 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2673 * is used for an EXACTFU node that contains at least one "ss" sequence in
2674 * it. For non-UTF-8 patterns and strings, this is the only case where
2675 * there is a possible fold length change. That means that a regular
2676 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2677 * with length changes, and so can be processed faster. regexec.c takes
2678 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2679 * pre-folded by regcomp.c. This saves effort in regex matching.
2680 * However, the pre-folding isn't done for non-UTF8 patterns because the
2681 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2682 * down by forcing the pattern into UTF8 unless necessary. Also what
2683 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2684 * possibilities for the non-UTF8 patterns are quite simple, except for
2685 * the sharp s. All the ones that don't involve a UTF-8 target string are
2686 * members of a fold-pair, and arrays are set up for all of them so that
2687 * the other member of the pair can be found quickly. Code elsewhere in
2688 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2689 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2690 * described in the next item.
2691 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2692 * 'ss' or not is not knowable at compile time. It will match iff the
2693 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2694 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2695 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2696 * described in item 3). An assumption that the optimizer part of
2697 * regexec.c (probably unwittingly) makes is that a character in the
2698 * pattern corresponds to at most a single character in the target string.
2699 * (And I do mean character, and not byte here, unlike other parts of the
2700 * documentation that have never been updated to account for multibyte
2701 * Unicode.) This assumption is wrong only in this case, as all other
2702 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2703 * virtue of having this file pre-fold UTF-8 patterns. I'm
2704 * reluctant to try to change this assumption, so instead the code punts.
2705 * This routine examines EXACTF nodes for the sharp s, and returns a
2706 * boolean indicating whether or not the node is an EXACTF node that
2707 * contains a sharp s. When it is true, the caller sets a flag that later
2708 * causes the optimizer in this file to not set values for the floating
2709 * and fixed string lengths, and thus avoids the optimizer code in
2710 * regexec.c that makes the invalid assumption. Thus, there is no
2711 * optimization based on string lengths for EXACTF nodes that contain the
2712 * sharp s. This only happens for /id rules (which means the pattern
2716 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2717 if (PL_regkind[OP(scan)] == EXACT) \
2718 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2721 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2722 /* Merge several consecutive EXACTish nodes into one. */
2723 regnode *n = regnext(scan);
2725 regnode *next = scan + NODE_SZ_STR(scan);
2729 regnode *stop = scan;
2730 GET_RE_DEBUG_FLAGS_DECL;
2732 PERL_UNUSED_ARG(depth);
2735 PERL_ARGS_ASSERT_JOIN_EXACT;
2736 #ifndef EXPERIMENTAL_INPLACESCAN
2737 PERL_UNUSED_ARG(flags);
2738 PERL_UNUSED_ARG(val);
2740 DEBUG_PEEP("join",scan,depth);
2742 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2743 * EXACT ones that are mergeable to the current one. */
2745 && (PL_regkind[OP(n)] == NOTHING
2746 || (stringok && OP(n) == OP(scan)))
2748 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2751 if (OP(n) == TAIL || n > next)
2753 if (PL_regkind[OP(n)] == NOTHING) {
2754 DEBUG_PEEP("skip:",n,depth);
2755 NEXT_OFF(scan) += NEXT_OFF(n);
2756 next = n + NODE_STEP_REGNODE;
2763 else if (stringok) {
2764 const unsigned int oldl = STR_LEN(scan);
2765 regnode * const nnext = regnext(n);
2767 /* XXX I (khw) kind of doubt that this works on platforms where
2768 * U8_MAX is above 255 because of lots of other assumptions */
2769 /* Don't join if the sum can't fit into a single node */
2770 if (oldl + STR_LEN(n) > U8_MAX)
2773 DEBUG_PEEP("merg",n,depth);
2776 NEXT_OFF(scan) += NEXT_OFF(n);
2777 STR_LEN(scan) += STR_LEN(n);
2778 next = n + NODE_SZ_STR(n);
2779 /* Now we can overwrite *n : */
2780 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2788 #ifdef EXPERIMENTAL_INPLACESCAN
2789 if (flags && !NEXT_OFF(n)) {
2790 DEBUG_PEEP("atch", val, depth);
2791 if (reg_off_by_arg[OP(n)]) {
2792 ARG_SET(n, val - n);
2795 NEXT_OFF(n) = val - n;
2803 *has_exactf_sharp_s = FALSE;
2805 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2806 * can now analyze for sequences of problematic code points. (Prior to
2807 * this final joining, sequences could have been split over boundaries, and
2808 * hence missed). The sequences only happen in folding, hence for any
2809 * non-EXACT EXACTish node */
2810 if (OP(scan) != EXACT) {
2811 const U8 * const s0 = (U8*) STRING(scan);
2813 const U8 * const s_end = s0 + STR_LEN(scan);
2815 /* One pass is made over the node's string looking for all the
2816 * possibilities. to avoid some tests in the loop, there are two main
2817 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2821 /* Examine the string for a multi-character fold sequence. UTF-8
2822 * patterns have all characters pre-folded by the time this code is
2824 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2825 length sequence we are looking for is 2 */
2828 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2829 if (! len) { /* Not a multi-char fold: get next char */
2834 /* Nodes with 'ss' require special handling, except for EXACTFL
2835 * and EXACTFA for which there is no multi-char fold to this */
2836 if (len == 2 && *s == 's' && *(s+1) == 's'
2837 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2840 OP(scan) = EXACTFU_SS;
2843 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2844 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2845 COMBINING_DIAERESIS_UTF8
2846 COMBINING_ACUTE_ACCENT_UTF8,
2848 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2849 COMBINING_DIAERESIS_UTF8
2850 COMBINING_ACUTE_ACCENT_UTF8,
2855 /* These two folds require special handling by trie's, so
2856 * change the node type to indicate this. If EXACTFA and
2857 * EXACTFL were ever to be handled by trie's, this would
2858 * have to be changed. If this node has already been
2859 * changed to EXACTFU_SS in this loop, leave it as is. (I
2860 * (khw) think it doesn't matter in regexec.c for UTF
2861 * patterns, but no need to change it */
2862 if (OP(scan) == EXACTFU) {
2863 OP(scan) = EXACTFU_TRICKYFOLD;
2867 else { /* Here is a generic multi-char fold. */
2868 const U8* multi_end = s + len;
2870 /* Count how many characters in it. In the case of /l and
2871 * /aa, no folds which contain ASCII code points are
2872 * allowed, so check for those, and skip if found. (In
2873 * EXACTFL, no folds are allowed to any Latin1 code point,
2874 * not just ASCII. But there aren't any of these
2875 * currently, nor ever likely, so don't take the time to
2876 * test for them. The code that generates the
2877 * is_MULTI_foo() macros croaks should one actually get put
2878 * into Unicode .) */
2879 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2880 count = utf8_length(s, multi_end);
2884 while (s < multi_end) {
2887 goto next_iteration;
2897 /* The delta is how long the sequence is minus 1 (1 is how long
2898 * the character that folds to the sequence is) */
2899 *min_subtract += count - 1;
2903 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2905 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2906 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2907 * nodes can't have multi-char folds to this range (and there are
2908 * no existing ones in the upper latin1 range). In the EXACTF
2909 * case we look also for the sharp s, which can be in the final
2910 * position. Otherwise we can stop looking 1 byte earlier because
2911 * have to find at least two characters for a multi-fold */
2912 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2914 /* The below is perhaps overboard, but this allows us to save a
2915 * test each time through the loop at the expense of a mask. This
2916 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2917 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2918 * are 64. This uses an exclusive 'or' to find that bit and then
2919 * inverts it to form a mask, with just a single 0, in the bit
2920 * position where 'S' and 's' differ. */
2921 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2922 const U8 s_masked = 's' & S_or_s_mask;
2925 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2926 if (! len) { /* Not a multi-char fold. */
2927 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2929 *has_exactf_sharp_s = TRUE;
2936 && ((*s & S_or_s_mask) == s_masked)
2937 && ((*(s+1) & S_or_s_mask) == s_masked))
2940 /* EXACTF nodes need to know that the minimum length
2941 * changed so that a sharp s in the string can match this
2942 * ss in the pattern, but they remain EXACTF nodes, as they
2943 * won't match this unless the target string is is UTF-8,
2944 * which we don't know until runtime */
2945 if (OP(scan) != EXACTF) {
2946 OP(scan) = EXACTFU_SS;
2950 *min_subtract += len - 1;
2957 /* Allow dumping but overwriting the collection of skipped
2958 * ops and/or strings with fake optimized ops */
2959 n = scan + NODE_SZ_STR(scan);
2967 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2971 /* REx optimizer. Converts nodes into quicker variants "in place".
2972 Finds fixed substrings. */
2974 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2975 to the position after last scanned or to NULL. */
2977 #define INIT_AND_WITHP \
2978 assert(!and_withp); \
2979 Newx(and_withp,1,struct regnode_charclass_class); \
2980 SAVEFREEPV(and_withp)
2982 /* this is a chain of data about sub patterns we are processing that
2983 need to be handled separately/specially in study_chunk. Its so
2984 we can simulate recursion without losing state. */
2986 typedef struct scan_frame {
2987 regnode *last; /* last node to process in this frame */
2988 regnode *next; /* next node to process when last is reached */
2989 struct scan_frame *prev; /*previous frame*/
2990 I32 stop; /* what stopparen do we use */
2994 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2997 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2998 I32 *minlenp, I32 *deltap,
3003 struct regnode_charclass_class *and_withp,
3004 U32 flags, U32 depth)
3005 /* scanp: Start here (read-write). */
3006 /* deltap: Write maxlen-minlen here. */
3007 /* last: Stop before this one. */
3008 /* data: string data about the pattern */
3009 /* stopparen: treat close N as END */
3010 /* recursed: which subroutines have we recursed into */
3011 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3014 I32 min = 0; /* There must be at least this number of characters to match */
3016 regnode *scan = *scanp, *next;
3018 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3019 int is_inf_internal = 0; /* The studied chunk is infinite */
3020 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3021 scan_data_t data_fake;
3022 SV *re_trie_maxbuff = NULL;
3023 regnode *first_non_open = scan;
3024 I32 stopmin = I32_MAX;
3025 scan_frame *frame = NULL;
3026 GET_RE_DEBUG_FLAGS_DECL;
3028 PERL_ARGS_ASSERT_STUDY_CHUNK;
3031 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3035 while (first_non_open && OP(first_non_open) == OPEN)
3036 first_non_open=regnext(first_non_open);
3041 while ( scan && OP(scan) != END && scan < last ){
3042 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3043 node length to get a real minimum (because
3044 the folded version may be shorter) */
3045 bool has_exactf_sharp_s = FALSE;
3046 /* Peephole optimizer: */
3047 DEBUG_STUDYDATA("Peep:", data,depth);
3048 DEBUG_PEEP("Peep",scan,depth);
3050 /* Its not clear to khw or hv why this is done here, and not in the
3051 * clauses that deal with EXACT nodes. khw's guess is that it's
3052 * because of a previous design */
3053 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3055 /* Follow the next-chain of the current node and optimize
3056 away all the NOTHINGs from it. */
3057 if (OP(scan) != CURLYX) {
3058 const int max = (reg_off_by_arg[OP(scan)]
3060 /* I32 may be smaller than U16 on CRAYs! */
3061 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3062 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3066 /* Skip NOTHING and LONGJMP. */
3067 while ((n = regnext(n))
3068 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3069 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3070 && off + noff < max)
3072 if (reg_off_by_arg[OP(scan)])
3075 NEXT_OFF(scan) = off;
3080 /* The principal pseudo-switch. Cannot be a switch, since we
3081 look into several different things. */
3082 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3083 || OP(scan) == IFTHEN) {
3084 next = regnext(scan);
3086 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3088 if (OP(next) == code || code == IFTHEN) {
3089 /* NOTE - There is similar code to this block below for handling
3090 TRIE nodes on a re-study. If you change stuff here check there
3092 I32 max1 = 0, min1 = I32_MAX, num = 0;
3093 struct regnode_charclass_class accum;
3094 regnode * const startbranch=scan;
3096 if (flags & SCF_DO_SUBSTR)
3097 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3098 if (flags & SCF_DO_STCLASS)
3099 cl_init_zero(pRExC_state, &accum);
3101 while (OP(scan) == code) {
3102 I32 deltanext, minnext, f = 0, fake;
3103 struct regnode_charclass_class this_class;
3106 data_fake.flags = 0;
3108 data_fake.whilem_c = data->whilem_c;
3109 data_fake.last_closep = data->last_closep;
3112 data_fake.last_closep = &fake;
3114 data_fake.pos_delta = delta;
3115 next = regnext(scan);
3116 scan = NEXTOPER(scan);
3118 scan = NEXTOPER(scan);
3119 if (flags & SCF_DO_STCLASS) {
3120 cl_init(pRExC_state, &this_class);
3121 data_fake.start_class = &this_class;
3122 f = SCF_DO_STCLASS_AND;
3124 if (flags & SCF_WHILEM_VISITED_POS)
3125 f |= SCF_WHILEM_VISITED_POS;
3127 /* we suppose the run is continuous, last=next...*/
3128 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3130 stopparen, recursed, NULL, f,depth+1);
3133 if (deltanext == I32_MAX) {
3134 is_inf = is_inf_internal = 1;
3136 } else if (max1 < minnext + deltanext)
3137 max1 = minnext + deltanext;
3139 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3141 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3142 if ( stopmin > minnext)
3143 stopmin = min + min1;
3144 flags &= ~SCF_DO_SUBSTR;
3146 data->flags |= SCF_SEEN_ACCEPT;
3149 if (data_fake.flags & SF_HAS_EVAL)
3150 data->flags |= SF_HAS_EVAL;
3151 data->whilem_c = data_fake.whilem_c;
3153 if (flags & SCF_DO_STCLASS)
3154 cl_or(pRExC_state, &accum, &this_class);
3156 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3158 if (flags & SCF_DO_SUBSTR) {
3159 data->pos_min += min1;
3160 if (data->pos_delta >= I32_MAX - (max1 - min1))
3161 data->pos_delta = I32_MAX;
3163 data->pos_delta += max1 - min1;
3164 if (max1 != min1 || is_inf)
3165 data->longest = &(data->longest_float);
3168 if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
3171 delta += max1 - min1;
3172 if (flags & SCF_DO_STCLASS_OR) {
3173 cl_or(pRExC_state, data->start_class, &accum);
3175 cl_and(data->start_class, and_withp);
3176 flags &= ~SCF_DO_STCLASS;
3179 else if (flags & SCF_DO_STCLASS_AND) {
3181 cl_and(data->start_class, &accum);
3182 flags &= ~SCF_DO_STCLASS;
3185 /* Switch to OR mode: cache the old value of
3186 * data->start_class */
3188 StructCopy(data->start_class, and_withp,
3189 struct regnode_charclass_class);
3190 flags &= ~SCF_DO_STCLASS_AND;
3191 StructCopy(&accum, data->start_class,
3192 struct regnode_charclass_class);
3193 flags |= SCF_DO_STCLASS_OR;
3194 SET_SSC_EOS(data->start_class);
3198 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3201 Assuming this was/is a branch we are dealing with: 'scan' now
3202 points at the item that follows the branch sequence, whatever
3203 it is. We now start at the beginning of the sequence and look
3210 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3212 If we can find such a subsequence we need to turn the first
3213 element into a trie and then add the subsequent branch exact
3214 strings to the trie.
3218 1. patterns where the whole set of branches can be converted.
3220 2. patterns where only a subset can be converted.
3222 In case 1 we can replace the whole set with a single regop
3223 for the trie. In case 2 we need to keep the start and end
3226 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3227 becomes BRANCH TRIE; BRANCH X;
3229 There is an additional case, that being where there is a
3230 common prefix, which gets split out into an EXACT like node
3231 preceding the TRIE node.
3233 If x(1..n)==tail then we can do a simple trie, if not we make
3234 a "jump" trie, such that when we match the appropriate word
3235 we "jump" to the appropriate tail node. Essentially we turn
3236 a nested if into a case structure of sorts.
3241 if (!re_trie_maxbuff) {
3242 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3243 if (!SvIOK(re_trie_maxbuff))
3244 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3246 if ( SvIV(re_trie_maxbuff)>=0 ) {
3248 regnode *first = (regnode *)NULL;
3249 regnode *last = (regnode *)NULL;
3250 regnode *tail = scan;
3255 SV * const mysv = sv_newmortal(); /* for dumping */
3257 /* var tail is used because there may be a TAIL
3258 regop in the way. Ie, the exacts will point to the
3259 thing following the TAIL, but the last branch will
3260 point at the TAIL. So we advance tail. If we
3261 have nested (?:) we may have to move through several
3265 while ( OP( tail ) == TAIL ) {
3266 /* this is the TAIL generated by (?:) */
3267 tail = regnext( tail );
3271 DEBUG_TRIE_COMPILE_r({
3272 regprop(RExC_rx, mysv, tail );
3273 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3274 (int)depth * 2 + 2, "",
3275 "Looking for TRIE'able sequences. Tail node is: ",
3276 SvPV_nolen_const( mysv )
3282 Step through the branches
3283 cur represents each branch,
3284 noper is the first thing to be matched as part of that branch
3285 noper_next is the regnext() of that node.
3287 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3288 via a "jump trie" but we also support building with NOJUMPTRIE,
3289 which restricts the trie logic to structures like /FOO|BAR/.
3291 If noper is a trieable nodetype then the branch is a possible optimization
3292 target. If we are building under NOJUMPTRIE then we require that noper_next
3293 is the same as scan (our current position in the regex program).
3295 Once we have two or more consecutive such branches we can create a
3296 trie of the EXACT's contents and stitch it in place into the program.
3298 If the sequence represents all of the branches in the alternation we
3299 replace the entire thing with a single TRIE node.
3301 Otherwise when it is a subsequence we need to stitch it in place and
3302 replace only the relevant branches. This means the first branch has
3303 to remain as it is used by the alternation logic, and its next pointer,
3304 and needs to be repointed at the item on the branch chain following
3305 the last branch we have optimized away.
3307 This could be either a BRANCH, in which case the subsequence is internal,
3308 or it could be the item following the branch sequence in which case the
3309 subsequence is at the end (which does not necessarily mean the first node
3310 is the start of the alternation).
3312 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3315 ----------------+-----------
3319 EXACTFU_SS | EXACTFU
3320 EXACTFU_TRICKYFOLD | EXACTFU
3325 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3326 ( EXACT == (X) ) ? EXACT : \
3327 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3330 /* dont use tail as the end marker for this traverse */
3331 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3332 regnode * const noper = NEXTOPER( cur );
3333 U8 noper_type = OP( noper );
3334 U8 noper_trietype = TRIE_TYPE( noper_type );
3335 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3336 regnode * const noper_next = regnext( noper );
3337 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3338 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3341 DEBUG_TRIE_COMPILE_r({
3342 regprop(RExC_rx, mysv, cur);
3343 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3344 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3346 regprop(RExC_rx, mysv, noper);
3347 PerlIO_printf( Perl_debug_log, " -> %s",
3348 SvPV_nolen_const(mysv));
3351 regprop(RExC_rx, mysv, noper_next );
3352 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3353 SvPV_nolen_const(mysv));
3355 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3356 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3357 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3361 /* Is noper a trieable nodetype that can be merged with the
3362 * current trie (if there is one)? */
3366 ( noper_trietype == NOTHING)
3367 || ( trietype == NOTHING )
3368 || ( trietype == noper_trietype )
3371 && noper_next == tail
3375 /* Handle mergable triable node
3376 * Either we are the first node in a new trieable sequence,
3377 * in which case we do some bookkeeping, otherwise we update
3378 * the end pointer. */
3381 if ( noper_trietype == NOTHING ) {
3382 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3383 regnode * const noper_next = regnext( noper );
3384 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3385 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3388 if ( noper_next_trietype ) {
3389 trietype = noper_next_trietype;
3390 } else if (noper_next_type) {
3391 /* a NOTHING regop is 1 regop wide. We need at least two
3392 * for a trie so we can't merge this in */
3396 trietype = noper_trietype;
3399 if ( trietype == NOTHING )
3400 trietype = noper_trietype;
3405 } /* end handle mergable triable node */
3407 /* handle unmergable node -
3408 * noper may either be a triable node which can not be tried
3409 * together with the current trie, or a non triable node */
3411 /* If last is set and trietype is not NOTHING then we have found
3412 * at least two triable branch sequences in a row of a similar
3413 * trietype so we can turn them into a trie. If/when we
3414 * allow NOTHING to start a trie sequence this condition will be
3415 * required, and it isn't expensive so we leave it in for now. */
3416 if ( trietype && trietype != NOTHING )
3417 make_trie( pRExC_state,
3418 startbranch, first, cur, tail, count,
3419 trietype, depth+1 );
3420 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3424 && noper_next == tail
3427 /* noper is triable, so we can start a new trie sequence */
3430 trietype = noper_trietype;
3432 /* if we already saw a first but the current node is not triable then we have
3433 * to reset the first information. */
3438 } /* end handle unmergable node */
3439 } /* loop over branches */
3440 DEBUG_TRIE_COMPILE_r({
3441 regprop(RExC_rx, mysv, cur);
3442 PerlIO_printf( Perl_debug_log,
3443 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3444 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3447 if ( last && trietype ) {
3448 if ( trietype != NOTHING ) {
3449 /* the last branch of the sequence was part of a trie,
3450 * so we have to construct it here outside of the loop
3452 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3453 #ifdef TRIE_STUDY_OPT
3454 if ( ((made == MADE_EXACT_TRIE &&
3455 startbranch == first)
3456 || ( first_non_open == first )) &&
3458 flags |= SCF_TRIE_RESTUDY;
3459 if ( startbranch == first
3462 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3467 /* at this point we know whatever we have is a NOTHING sequence/branch
3468 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3470 if ( startbranch == first ) {
3472 /* the entire thing is a NOTHING sequence, something like this:
3473 * (?:|) So we can turn it into a plain NOTHING op. */
3474 DEBUG_TRIE_COMPILE_r({
3475 regprop(RExC_rx, mysv, cur);
3476 PerlIO_printf( Perl_debug_log,
3477 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3478 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3481 OP(startbranch)= NOTHING;
3482 NEXT_OFF(startbranch)= tail - startbranch;
3483 for ( opt= startbranch + 1; opt < tail ; opt++ )
3487 } /* end if ( last) */
3488 } /* TRIE_MAXBUF is non zero */
3493 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3494 scan = NEXTOPER(NEXTOPER(scan));
3495 } else /* single branch is optimized. */
3496 scan = NEXTOPER(scan);
3498 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3499 scan_frame *newframe = NULL;
3504 if (OP(scan) != SUSPEND) {
3505 /* set the pointer */
3506 if (OP(scan) == GOSUB) {
3508 RExC_recurse[ARG2L(scan)] = scan;
3509 start = RExC_open_parens[paren-1];
3510 end = RExC_close_parens[paren-1];
3513 start = RExC_rxi->program + 1;
3517 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3518 SAVEFREEPV(recursed);
3520 if (!PAREN_TEST(recursed,paren+1)) {
3521 PAREN_SET(recursed,paren+1);
3522 Newx(newframe,1,scan_frame);
3524 if (flags & SCF_DO_SUBSTR) {
3525 SCAN_COMMIT(pRExC_state,data,minlenp);
3526 data->longest = &(data->longest_float);
3528 is_inf = is_inf_internal = 1;
3529 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3530 cl_anything(pRExC_state, data->start_class);
3531 flags &= ~SCF_DO_STCLASS;
3534 Newx(newframe,1,scan_frame);
3537 end = regnext(scan);
3542 SAVEFREEPV(newframe);
3543 newframe->next = regnext(scan);
3544 newframe->last = last;
3545 newframe->stop = stopparen;
3546 newframe->prev = frame;
3556 else if (OP(scan) == EXACT) {
3557 I32 l = STR_LEN(scan);
3560 const U8 * const s = (U8*)STRING(scan);
3561 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3562 l = utf8_length(s, s + l);
3564 uc = *((U8*)STRING(scan));
3567 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3568 /* The code below prefers earlier match for fixed
3569 offset, later match for variable offset. */
3570 if (data->last_end == -1) { /* Update the start info. */
3571 data->last_start_min = data->pos_min;
3572 data->last_start_max = is_inf
3573 ? I32_MAX : data->pos_min + data->pos_delta;
3575 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3577 SvUTF8_on(data->last_found);
3579 SV * const sv = data->last_found;
3580 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3581 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3582 if (mg && mg->mg_len >= 0)
3583 mg->mg_len += utf8_length((U8*)STRING(scan),
3584 (U8*)STRING(scan)+STR_LEN(scan));
3586 data->last_end = data->pos_min + l;
3587 data->pos_min += l; /* As in the first entry. */
3588 data->flags &= ~SF_BEFORE_EOL;
3590 if (flags & SCF_DO_STCLASS_AND) {
3591 /* Check whether it is compatible with what we know already! */
3595 /* If compatible, we or it in below. It is compatible if is
3596 * in the bitmp and either 1) its bit or its fold is set, or 2)
3597 * it's for a locale. Even if there isn't unicode semantics
3598 * here, at runtime there may be because of matching against a
3599 * utf8 string, so accept a possible false positive for
3600 * latin1-range folds */
3602 (!(data->start_class->flags & ANYOF_LOCALE)
3603 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3604 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3605 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3610 ANYOF_CLASS_ZERO(data->start_class);
3611 ANYOF_BITMAP_ZERO(data->start_class);
3613 ANYOF_BITMAP_SET(data->start_class, uc);
3614 else if (uc >= 0x100) {
3617 /* Some Unicode code points fold to the Latin1 range; as
3618 * XXX temporary code, instead of figuring out if this is
3619 * one, just assume it is and set all the start class bits
3620 * that could be some such above 255 code point's fold
3621 * which will generate fals positives. As the code
3622 * elsewhere that does compute the fold settles down, it
3623 * can be extracted out and re-used here */
3624 for (i = 0; i < 256; i++){
3625 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3626 ANYOF_BITMAP_SET(data->start_class, i);
3630 CLEAR_SSC_EOS(data->start_class);
3632 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3634 else if (flags & SCF_DO_STCLASS_OR) {
3635 /* false positive possible if the class is case-folded */
3637 ANYOF_BITMAP_SET(data->start_class, uc);
3639 data->start_class->flags |= ANYOF_UNICODE_ALL;
3640 CLEAR_SSC_EOS(data->start_class);
3641 cl_and(data->start_class, and_withp);
3643 flags &= ~SCF_DO_STCLASS;
3645 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3646 I32 l = STR_LEN(scan);
3647 UV uc = *((U8*)STRING(scan));
3649 /* Search for fixed substrings supports EXACT only. */
3650 if (flags & SCF_DO_SUBSTR) {
3652 SCAN_COMMIT(pRExC_state, data, minlenp);
3655 const U8 * const s = (U8 *)STRING(scan);
3656 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3657 l = utf8_length(s, s + l);
3659 if (has_exactf_sharp_s) {
3660 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3662 min += l - min_subtract;
3664 delta += min_subtract;
3665 if (flags & SCF_DO_SUBSTR) {
3666 data->pos_min += l - min_subtract;
3667 if (data->pos_min < 0) {
3670 data->pos_delta += min_subtract;
3672 data->longest = &(data->longest_float);
3675 if (flags & SCF_DO_STCLASS_AND) {
3676 /* Check whether it is compatible with what we know already! */
3679 (!(data->start_class->flags & ANYOF_LOCALE)
3680 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3681 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3685 ANYOF_CLASS_ZERO(data->start_class);
3686 ANYOF_BITMAP_ZERO(data->start_class);
3688 ANYOF_BITMAP_SET(data->start_class, uc);
3689 CLEAR_SSC_EOS(data->start_class);
3690 if (OP(scan) == EXACTFL) {
3691 /* XXX This set is probably no longer necessary, and
3692 * probably wrong as LOCALE now is on in the initial
3694 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3698 /* Also set the other member of the fold pair. In case
3699 * that unicode semantics is called for at runtime, use
3700 * the full latin1 fold. (Can't do this for locale,
3701 * because not known until runtime) */
3702 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3704 /* All other (EXACTFL handled above) folds except under
3705 * /iaa that include s, S, and sharp_s also may include
3707 if (OP(scan) != EXACTFA) {
3708 if (uc == 's' || uc == 'S') {
3709 ANYOF_BITMAP_SET(data->start_class,
3710 LATIN_SMALL_LETTER_SHARP_S);
3712 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3713 ANYOF_BITMAP_SET(data->start_class, 's');
3714 ANYOF_BITMAP_SET(data->start_class, 'S');
3719 else if (uc >= 0x100) {
3721 for (i = 0; i < 256; i++){
3722 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3723 ANYOF_BITMAP_SET(data->start_class, i);
3728 else if (flags & SCF_DO_STCLASS_OR) {
3729 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3730 /* false positive possible if the class is case-folded.
3731 Assume that the locale settings are the same... */
3733 ANYOF_BITMAP_SET(data->start_class, uc);
3734 if (OP(scan) != EXACTFL) {
3736 /* And set the other member of the fold pair, but
3737 * can't do that in locale because not known until
3739 ANYOF_BITMAP_SET(data->start_class,
3740 PL_fold_latin1[uc]);
3742 /* All folds except under /iaa that include s, S,
3743 * and sharp_s also may include the others */
3744 if (OP(scan) != EXACTFA) {
3745 if (uc == 's' || uc == 'S') {
3746 ANYOF_BITMAP_SET(data->start_class,
3747 LATIN_SMALL_LETTER_SHARP_S);
3749 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3750 ANYOF_BITMAP_SET(data->start_class, 's');
3751 ANYOF_BITMAP_SET(data->start_class, 'S');
3756 CLEAR_SSC_EOS(data->start_class);
3758 cl_and(data->start_class, and_withp);
3760 flags &= ~SCF_DO_STCLASS;
3762 else if (REGNODE_VARIES(OP(scan))) {
3763 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3764 I32 f = flags, pos_before = 0;
3765 regnode * const oscan = scan;
3766 struct regnode_charclass_class this_class;
3767 struct regnode_charclass_class *oclass = NULL;
3768 I32 next_is_eval = 0;
3770 switch (PL_regkind[OP(scan)]) {
3771 case WHILEM: /* End of (?:...)* . */
3772 scan = NEXTOPER(scan);
3775 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3776 next = NEXTOPER(scan);
3777 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3779 maxcount = REG_INFTY;
3780 next = regnext(scan);
3781 scan = NEXTOPER(scan);
3785 if (flags & SCF_DO_SUBSTR)
3790 if (flags & SCF_DO_STCLASS) {
3792 maxcount = REG_INFTY;
3793 next = regnext(scan);
3794 scan = NEXTOPER(scan);
3797 is_inf = is_inf_internal = 1;
3798 scan = regnext(scan);
3799 if (flags & SCF_DO_SUBSTR) {
3800 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3801 data->longest = &(data->longest_float);
3803 goto optimize_curly_tail;
3805 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3806 && (scan->flags == stopparen))
3811 mincount = ARG1(scan);
3812 maxcount = ARG2(scan);
3814 next = regnext(scan);
3815 if (OP(scan) == CURLYX) {
3816 I32 lp = (data ? *(data->last_closep) : 0);
3817 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3819 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3820 next_is_eval = (OP(scan) == EVAL);
3822 if (flags & SCF_DO_SUBSTR) {
3823 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3824 pos_before = data->pos_min;
3828 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3830 data->flags |= SF_IS_INF;
3832 if (flags & SCF_DO_STCLASS) {
3833 cl_init(pRExC_state, &this_class);
3834 oclass = data->start_class;
3835 data->start_class = &this_class;
3836 f |= SCF_DO_STCLASS_AND;
3837 f &= ~SCF_DO_STCLASS_OR;
3839 /* Exclude from super-linear cache processing any {n,m}
3840 regops for which the combination of input pos and regex
3841 pos is not enough information to determine if a match
3844 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3845 regex pos at the \s*, the prospects for a match depend not
3846 only on the input position but also on how many (bar\s*)
3847 repeats into the {4,8} we are. */
3848 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3849 f &= ~SCF_WHILEM_VISITED_POS;
3851 /* This will finish on WHILEM, setting scan, or on NULL: */
3852 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3853 last, data, stopparen, recursed, NULL,
3855 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3857 if (flags & SCF_DO_STCLASS)
3858 data->start_class = oclass;
3859 if (mincount == 0 || minnext == 0) {
3860 if (flags & SCF_DO_STCLASS_OR) {
3861 cl_or(pRExC_state, data->start_class, &this_class);
3863 else if (flags & SCF_DO_STCLASS_AND) {
3864 /* Switch to OR mode: cache the old value of
3865 * data->start_class */
3867 StructCopy(data->start_class, and_withp,
3868 struct regnode_charclass_class);
3869 flags &= ~SCF_DO_STCLASS_AND;
3870 StructCopy(&this_class, data->start_class,
3871 struct regnode_charclass_class);
3872 flags |= SCF_DO_STCLASS_OR;
3873 SET_SSC_EOS(data->start_class);
3875 } else { /* Non-zero len */
3876 if (flags & SCF_DO_STCLASS_OR) {
3877 cl_or(pRExC_state, data->start_class, &this_class);
3878 cl_and(data->start_class, and_withp);
3880 else if (flags & SCF_DO_STCLASS_AND)
3881 cl_and(data->start_class, &this_class);
3882 flags &= ~SCF_DO_STCLASS;
3884 if (!scan) /* It was not CURLYX, but CURLY. */
3886 if ( /* ? quantifier ok, except for (?{ ... }) */
3887 (next_is_eval || !(mincount == 0 && maxcount == 1))
3888 && (minnext == 0) && (deltanext == 0)
3889 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3890 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3892 /* Fatal warnings may leak the regexp without this: */
3893 SAVEFREESV(RExC_rx_sv);
3894 ckWARNreg(RExC_parse,
3895 "Quantifier unexpected on zero-length expression");
3896 (void)ReREFCNT_inc(RExC_rx_sv);
3899 min += minnext * mincount;
3900 is_inf_internal |= deltanext == I32_MAX
3901 || (maxcount == REG_INFTY && minnext + deltanext > 0);
3902 is_inf |= is_inf_internal;
3906 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3908 /* Try powerful optimization CURLYX => CURLYN. */
3909 if ( OP(oscan) == CURLYX && data
3910 && data->flags & SF_IN_PAR
3911 && !(data->flags & SF_HAS_EVAL)
3912 && !deltanext && minnext == 1 ) {
3913 /* Try to optimize to CURLYN. */
3914 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3915 regnode * const nxt1 = nxt;
3922 if (!REGNODE_SIMPLE(OP(nxt))
3923 && !(PL_regkind[OP(nxt)] == EXACT
3924 && STR_LEN(nxt) == 1))
3930 if (OP(nxt) != CLOSE)
3932 if (RExC_open_parens) {
3933 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3934 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3936 /* Now we know that nxt2 is the only contents: */
3937 oscan->flags = (U8)ARG(nxt);
3939 OP(nxt1) = NOTHING; /* was OPEN. */
3942 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3943 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3944 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3945 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3946 OP(nxt + 1) = OPTIMIZED; /* was count. */
3947 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3952 /* Try optimization CURLYX => CURLYM. */
3953 if ( OP(oscan) == CURLYX && data
3954 && !(data->flags & SF_HAS_PAR)
3955 && !(data->flags & SF_HAS_EVAL)
3956 && !deltanext /* atom is fixed width */
3957 && minnext != 0 /* CURLYM can't handle zero width */
3958 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3960 /* XXXX How to optimize if data == 0? */
3961 /* Optimize to a simpler form. */
3962 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3966 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3967 && (OP(nxt2) != WHILEM))
3969 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3970 /* Need to optimize away parenths. */
3971 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3972 /* Set the parenth number. */
3973 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3975 oscan->flags = (U8)ARG(nxt);
3976 if (RExC_open_parens) {
3977 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3978 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3980 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3981 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3984 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3985 OP(nxt + 1) = OPTIMIZED; /* was count. */
3986 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3987 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3990 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3991 regnode *nnxt = regnext(nxt1);
3993 if (reg_off_by_arg[OP(nxt1)])
3994 ARG_SET(nxt1, nxt2 - nxt1);
3995 else if (nxt2 - nxt1 < U16_MAX)
3996 NEXT_OFF(nxt1) = nxt2 - nxt1;
3998 OP(nxt) = NOTHING; /* Cannot beautify */
4003 /* Optimize again: */
4004 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4005 NULL, stopparen, recursed, NULL, 0,depth+1);
4010 else if ((OP(oscan) == CURLYX)
4011 && (flags & SCF_WHILEM_VISITED_POS)
4012 /* See the comment on a similar expression above.
4013 However, this time it's not a subexpression
4014 we care about, but the expression itself. */
4015 && (maxcount == REG_INFTY)
4016 && data && ++data->whilem_c < 16) {
4017 /* This stays as CURLYX, we can put the count/of pair. */
4018 /* Find WHILEM (as in regexec.c) */
4019 regnode *nxt = oscan + NEXT_OFF(oscan);
4021 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4023 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4024 | (RExC_whilem_seen << 4)); /* On WHILEM */
4026 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4028 if (flags & SCF_DO_SUBSTR) {
4029 SV *last_str = NULL;
4030 int counted = mincount != 0;
4032 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4033 #if defined(SPARC64_GCC_WORKAROUND)
4036 const char *s = NULL;
4039 if (pos_before >= data->last_start_min)
4042 b = data->last_start_min;
4045 s = SvPV_const(data->last_found, l);
4046 old = b - data->last_start_min;
4049 I32 b = pos_before >= data->last_start_min
4050 ? pos_before : data->last_start_min;
4052 const char * const s = SvPV_const(data->last_found, l);
4053 I32 old = b - data->last_start_min;
4057 old = utf8_hop((U8*)s, old) - (U8*)s;
4059 /* Get the added string: */
4060 last_str = newSVpvn_utf8(s + old, l, UTF);
4061 if (deltanext == 0 && pos_before == b) {
4062 /* What was added is a constant string */
4064 SvGROW(last_str, (mincount * l) + 1);
4065 repeatcpy(SvPVX(last_str) + l,
4066 SvPVX_const(last_str), l, mincount - 1);
4067 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4068 /* Add additional parts. */
4069 SvCUR_set(data->last_found,
4070 SvCUR(data->last_found) - l);
4071 sv_catsv(data->last_found, last_str);
4073 SV * sv = data->last_found;
4075 SvUTF8(sv) && SvMAGICAL(sv) ?
4076 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4077 if (mg && mg->mg_len >= 0)
4078 mg->mg_len += CHR_SVLEN(last_str) - l;
4080 data->last_end += l * (mincount - 1);
4083 /* start offset must point into the last copy */
4084 data->last_start_min += minnext * (mincount - 1);
4085 data->last_start_max += is_inf ? I32_MAX
4086 : (maxcount - 1) * (minnext + data->pos_delta);
4089 /* It is counted once already... */
4090 data->pos_min += minnext * (mincount - counted);
4092 PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
4093 counted, deltanext, I32_MAX, minnext, maxcount, mincount);
4094 if (deltanext != I32_MAX)
4095 PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
4097 if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
4098 data->pos_delta = I32_MAX;
4100 data->pos_delta += - counted * deltanext +
4101 (minnext + deltanext) * maxcount - minnext * mincount;
4102 if (mincount != maxcount) {
4103 /* Cannot extend fixed substrings found inside
4105 SCAN_COMMIT(pRExC_state,data,minlenp);
4106 if (mincount && last_str) {
4107 SV * const sv = data->last_found;
4108 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4109 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4113 sv_setsv(sv, last_str);
4114 data->last_end = data->pos_min;
4115 data->last_start_min =
4116 data->pos_min - CHR_SVLEN(last_str);
4117 data->last_start_max = is_inf
4119 : data->pos_min + data->pos_delta
4120 - CHR_SVLEN(last_str);
4122 data->longest = &(data->longest_float);
4124 SvREFCNT_dec(last_str);
4126 if (data && (fl & SF_HAS_EVAL))
4127 data->flags |= SF_HAS_EVAL;
4128 optimize_curly_tail:
4129 if (OP(oscan) != CURLYX) {
4130 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4132 NEXT_OFF(oscan) += NEXT_OFF(next);
4135 default: /* REF, and CLUMP only? */
4136 if (flags & SCF_DO_SUBSTR) {
4137 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4138 data->longest = &(data->longest_float);
4140 is_inf = is_inf_internal = 1;
4141 if (flags & SCF_DO_STCLASS_OR)
4142 cl_anything(pRExC_state, data->start_class);
4143 flags &= ~SCF_DO_STCLASS;
4147 else if (OP(scan) == LNBREAK) {
4148 if (flags & SCF_DO_STCLASS) {
4150 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4151 if (flags & SCF_DO_STCLASS_AND) {
4152 for (value = 0; value < 256; value++)
4153 if (!is_VERTWS_cp(value))
4154 ANYOF_BITMAP_CLEAR(data->start_class, value);
4157 for (value = 0; value < 256; value++)
4158 if (is_VERTWS_cp(value))
4159 ANYOF_BITMAP_SET(data->start_class, value);
4161 if (flags & SCF_DO_STCLASS_OR)
4162 cl_and(data->start_class, and_withp);
4163 flags &= ~SCF_DO_STCLASS;
4166 delta++; /* Because of the 2 char string cr-lf */
4167 if (flags & SCF_DO_SUBSTR) {
4168 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4170 data->pos_delta += 1;
4171 data->longest = &(data->longest_float);
4174 else if (REGNODE_SIMPLE(OP(scan))) {
4177 if (flags & SCF_DO_SUBSTR) {
4178 SCAN_COMMIT(pRExC_state,data,minlenp);
4182 if (flags & SCF_DO_STCLASS) {
4184 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4186 /* Some of the logic below assumes that switching
4187 locale on will only add false positives. */
4188 switch (PL_regkind[OP(scan)]) {
4194 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4197 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4198 cl_anything(pRExC_state, data->start_class);
4201 if (OP(scan) == SANY)
4203 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4204 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4205 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4206 cl_anything(pRExC_state, data->start_class);
4208 if (flags & SCF_DO_STCLASS_AND || !value)
4209 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4212 if (flags & SCF_DO_STCLASS_AND)
4213 cl_and(data->start_class,
4214 (struct regnode_charclass_class*)scan);
4216 cl_or(pRExC_state, data->start_class,
4217 (struct regnode_charclass_class*)scan);
4225 classnum = FLAGS(scan);
4226 if (flags & SCF_DO_STCLASS_AND) {
4227 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4228 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4229 for (value = 0; value < loop_max; value++) {
4230 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4231 ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4237 if (data->start_class->flags & ANYOF_LOCALE) {
4238 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4242 /* Even if under locale, set the bits for non-locale
4243 * in case it isn't a true locale-node. This will
4244 * create false positives if it truly is locale */
4245 for (value = 0; value < loop_max; value++) {
4246 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4247 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4259 classnum = FLAGS(scan);
4260 if (flags & SCF_DO_STCLASS_AND) {
4261 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4262 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4263 for (value = 0; value < loop_max; value++) {
4264 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4265 ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4271 if (data->start_class->flags & ANYOF_LOCALE) {
4272 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4276 /* Even if under locale, set the bits for non-locale in
4277 * case it isn't a true locale-node. This will create
4278 * false positives if it truly is locale */
4279 for (value = 0; value < loop_max; value++) {
4280 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4281 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4284 if (PL_regkind[OP(scan)] == NPOSIXD) {
4285 data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4291 if (flags & SCF_DO_STCLASS_OR)
4292 cl_and(data->start_class, and_withp);
4293 flags &= ~SCF_DO_STCLASS;
4296 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4297 data->flags |= (OP(scan) == MEOL
4300 SCAN_COMMIT(pRExC_state, data, minlenp);
4303 else if ( PL_regkind[OP(scan)] == BRANCHJ
4304 /* Lookbehind, or need to calculate parens/evals/stclass: */
4305 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4306 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4307 if ( OP(scan) == UNLESSM &&
4309 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4310 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4313 regnode *upto= regnext(scan);
4315 SV * const mysv_val=sv_newmortal();
4316 DEBUG_STUDYDATA("OPFAIL",data,depth);
4318 /*DEBUG_PARSE_MSG("opfail");*/
4319 regprop(RExC_rx, mysv_val, upto);
4320 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4321 SvPV_nolen_const(mysv_val),
4322 (IV)REG_NODE_NUM(upto),
4327 NEXT_OFF(scan) = upto - scan;
4328 for (opt= scan + 1; opt < upto ; opt++)
4329 OP(opt) = OPTIMIZED;
4333 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4334 || OP(scan) == UNLESSM )
4336 /* Negative Lookahead/lookbehind
4337 In this case we can't do fixed string optimisation.
4340 I32 deltanext, minnext, fake = 0;
4342 struct regnode_charclass_class intrnl;
4345 data_fake.flags = 0;
4347 data_fake.whilem_c = data->whilem_c;
4348 data_fake.last_closep = data->last_closep;
4351 data_fake.last_closep = &fake;
4352 data_fake.pos_delta = delta;
4353 if ( flags & SCF_DO_STCLASS && !scan->flags
4354 && OP(scan) == IFMATCH ) { /* Lookahead */
4355 cl_init(pRExC_state, &intrnl);
4356 data_fake.start_class = &intrnl;
4357 f |= SCF_DO_STCLASS_AND;
4359 if (flags & SCF_WHILEM_VISITED_POS)
4360 f |= SCF_WHILEM_VISITED_POS;
4361 next = regnext(scan);
4362 nscan = NEXTOPER(NEXTOPER(scan));
4363 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4364 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4367 FAIL("Variable length lookbehind not implemented");
4369 else if (minnext > (I32)U8_MAX) {
4370 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4372 scan->flags = (U8)minnext;
4375 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4377 if (data_fake.flags & SF_HAS_EVAL)
4378 data->flags |= SF_HAS_EVAL;
4379 data->whilem_c = data_fake.whilem_c;
4381 if (f & SCF_DO_STCLASS_AND) {
4382 if (flags & SCF_DO_STCLASS_OR) {
4383 /* OR before, AND after: ideally we would recurse with
4384 * data_fake to get the AND applied by study of the
4385 * remainder of the pattern, and then derecurse;
4386 * *** HACK *** for now just treat as "no information".
4387 * See [perl #56690].
4389 cl_init(pRExC_state, data->start_class);
4391 /* AND before and after: combine and continue */
4392 const int was = TEST_SSC_EOS(data->start_class);
4394 cl_and(data->start_class, &intrnl);
4396 SET_SSC_EOS(data->start_class);
4400 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4402 /* Positive Lookahead/lookbehind
4403 In this case we can do fixed string optimisation,
4404 but we must be careful about it. Note in the case of
4405 lookbehind the positions will be offset by the minimum
4406 length of the pattern, something we won't know about
4407 until after the recurse.
4409 I32 deltanext, fake = 0;
4411 struct regnode_charclass_class intrnl;
4413 /* We use SAVEFREEPV so that when the full compile
4414 is finished perl will clean up the allocated
4415 minlens when it's all done. This way we don't
4416 have to worry about freeing them when we know
4417 they wont be used, which would be a pain.
4420 Newx( minnextp, 1, I32 );
4421 SAVEFREEPV(minnextp);
4424 StructCopy(data, &data_fake, scan_data_t);
4425 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4428 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4429 data_fake.last_found=newSVsv(data->last_found);
4433 data_fake.last_closep = &fake;
4434 data_fake.flags = 0;
4435 data_fake.pos_delta = delta;
4437 data_fake.flags |= SF_IS_INF;
4438 if ( flags & SCF_DO_STCLASS && !scan->flags
4439 && OP(scan) == IFMATCH ) { /* Lookahead */
4440 cl_init(pRExC_state, &intrnl);
4441 data_fake.start_class = &intrnl;
4442 f |= SCF_DO_STCLASS_AND;
4444 if (flags & SCF_WHILEM_VISITED_POS)
4445 f |= SCF_WHILEM_VISITED_POS;
4446 next = regnext(scan);
4447 nscan = NEXTOPER(NEXTOPER(scan));
4449 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4450 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4453 FAIL("Variable length lookbehind not implemented");
4455 else if (*minnextp > (I32)U8_MAX) {
4456 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4458 scan->flags = (U8)*minnextp;
4463 if (f & SCF_DO_STCLASS_AND) {
4464 const int was = TEST_SSC_EOS(data.start_class);
4466 cl_and(data->start_class, &intrnl);
4468 SET_SSC_EOS(data->start_class);
4471 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4473 if (data_fake.flags & SF_HAS_EVAL)
4474 data->flags |= SF_HAS_EVAL;
4475 data->whilem_c = data_fake.whilem_c;
4476 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4477 if (RExC_rx->minlen<*minnextp)
4478 RExC_rx->minlen=*minnextp;
4479 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4480 SvREFCNT_dec_NN(data_fake.last_found);
4482 if ( data_fake.minlen_fixed != minlenp )
4484 data->offset_fixed= data_fake.offset_fixed;
4485 data->minlen_fixed= data_fake.minlen_fixed;
4486 data->lookbehind_fixed+= scan->flags;
4488 if ( data_fake.minlen_float != minlenp )
4490 data->minlen_float= data_fake.minlen_float;
4491 data->offset_float_min=data_fake.offset_float_min;
4492 data->offset_float_max=data_fake.offset_float_max;
4493 data->lookbehind_float+= scan->flags;
4500 else if (OP(scan) == OPEN) {
4501 if (stopparen != (I32)ARG(scan))
4504 else if (OP(scan) == CLOSE) {
4505 if (stopparen == (I32)ARG(scan)) {
4508 if ((I32)ARG(scan) == is_par) {
4509 next = regnext(scan);
4511 if ( next && (OP(next) != WHILEM) && next < last)
4512 is_par = 0; /* Disable optimization */
4515 *(data->last_closep) = ARG(scan);
4517 else if (OP(scan) == EVAL) {
4519 data->flags |= SF_HAS_EVAL;
4521 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4522 if (flags & SCF_DO_SUBSTR) {
4523 SCAN_COMMIT(pRExC_state,data,minlenp);
4524 flags &= ~SCF_DO_SUBSTR;
4526 if (data && OP(scan)==ACCEPT) {
4527 data->flags |= SCF_SEEN_ACCEPT;
4532 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4534 if (flags & SCF_DO_SUBSTR) {
4535 SCAN_COMMIT(pRExC_state,data,minlenp);
4536 data->longest = &(data->longest_float);
4538 is_inf = is_inf_internal = 1;
4539 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4540 cl_anything(pRExC_state, data->start_class);
4541 flags &= ~SCF_DO_STCLASS;
4543 else if (OP(scan) == GPOS) {
4544 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4545 !(delta || is_inf || (data && data->pos_delta)))
4547 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4548 RExC_rx->extflags |= RXf_ANCH_GPOS;
4549 if (RExC_rx->gofs < (U32)min)
4550 RExC_rx->gofs = min;
4552 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4556 #ifdef TRIE_STUDY_OPT
4557 #ifdef FULL_TRIE_STUDY
4558 else if (PL_regkind[OP(scan)] == TRIE) {
4559 /* NOTE - There is similar code to this block above for handling
4560 BRANCH nodes on the initial study. If you change stuff here
4562 regnode *trie_node= scan;
4563 regnode *tail= regnext(scan);
4564 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4565 I32 max1 = 0, min1 = I32_MAX;
4566 struct regnode_charclass_class accum;
4568 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4569 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4570 if (flags & SCF_DO_STCLASS)
4571 cl_init_zero(pRExC_state, &accum);
4577 const regnode *nextbranch= NULL;
4580 for ( word=1 ; word <= trie->wordcount ; word++)
4582 I32 deltanext=0, minnext=0, f = 0, fake;
4583 struct regnode_charclass_class this_class;
4585 data_fake.flags = 0;
4587 data_fake.whilem_c = data->whilem_c;
4588 data_fake.last_closep = data->last_closep;
4591 data_fake.last_closep = &fake;
4592 data_fake.pos_delta = delta;
4593 if (flags & SCF_DO_STCLASS) {
4594 cl_init(pRExC_state, &this_class);
4595 data_fake.start_class = &this_class;
4596 f = SCF_DO_STCLASS_AND;
4598 if (flags & SCF_WHILEM_VISITED_POS)
4599 f |= SCF_WHILEM_VISITED_POS;
4601 if (trie->jump[word]) {
4603 nextbranch = trie_node + trie->jump[0];
4604 scan= trie_node + trie->jump[word];
4605 /* We go from the jump point to the branch that follows
4606 it. Note this means we need the vestigal unused branches
4607 even though they arent otherwise used.
4609 minnext = study_chunk(pRExC_state, &scan, minlenp,
4610 &deltanext, (regnode *)nextbranch, &data_fake,
4611 stopparen, recursed, NULL, f,depth+1);
4613 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4614 nextbranch= regnext((regnode*)nextbranch);
4616 if (min1 > (I32)(minnext + trie->minlen))
4617 min1 = minnext + trie->minlen;
4618 if (deltanext == I32_MAX) {
4619 is_inf = is_inf_internal = 1;
4621 } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4622 max1 = minnext + deltanext + trie->maxlen;
4624 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4626 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4627 if ( stopmin > min + min1)
4628 stopmin = min + min1;
4629 flags &= ~SCF_DO_SUBSTR;
4631 data->flags |= SCF_SEEN_ACCEPT;
4634 if (data_fake.flags & SF_HAS_EVAL)
4635 data->flags |= SF_HAS_EVAL;
4636 data->whilem_c = data_fake.whilem_c;
4638 if (flags & SCF_DO_STCLASS)
4639 cl_or(pRExC_state, &accum, &this_class);
4642 if (flags & SCF_DO_SUBSTR) {
4643 data->pos_min += min1;
4644 data->pos_delta += max1 - min1;
4645 if (max1 != min1 || is_inf)
4646 data->longest = &(data->longest_float);
4649 delta += max1 - min1;
4650 if (flags & SCF_DO_STCLASS_OR) {
4651 cl_or(pRExC_state, data->start_class, &accum);
4653 cl_and(data->start_class, and_withp);
4654 flags &= ~SCF_DO_STCLASS;
4657 else if (flags & SCF_DO_STCLASS_AND) {
4659 cl_and(data->start_class, &accum);
4660 flags &= ~SCF_DO_STCLASS;
4663 /* Switch to OR mode: cache the old value of
4664 * data->start_class */
4666 StructCopy(data->start_class, and_withp,
4667 struct regnode_charclass_class);
4668 flags &= ~SCF_DO_STCLASS_AND;
4669 StructCopy(&accum, data->start_class,
4670 struct regnode_charclass_class);
4671 flags |= SCF_DO_STCLASS_OR;
4672 SET_SSC_EOS(data->start_class);
4679 else if (PL_regkind[OP(scan)] == TRIE) {
4680 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4683 min += trie->minlen;
4684 delta += (trie->maxlen - trie->minlen);
4685 flags &= ~SCF_DO_STCLASS; /* xxx */
4686 if (flags & SCF_DO_SUBSTR) {
4687 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4688 data->pos_min += trie->minlen;
4689 data->pos_delta += (trie->maxlen - trie->minlen);
4690 if (trie->maxlen != trie->minlen)
4691 data->longest = &(data->longest_float);
4693 if (trie->jump) /* no more substrings -- for now /grr*/
4694 flags &= ~SCF_DO_SUBSTR;
4696 #endif /* old or new */
4697 #endif /* TRIE_STUDY_OPT */
4699 /* Else: zero-length, ignore. */
4700 scan = regnext(scan);
4705 stopparen = frame->stop;
4706 frame = frame->prev;
4707 goto fake_study_recurse;
4712 DEBUG_STUDYDATA("pre-fin:",data,depth);
4715 *deltap = is_inf_internal ? I32_MAX : delta;
4716 if (flags & SCF_DO_SUBSTR && is_inf)
4717 data->pos_delta = I32_MAX - data->pos_min;
4718 if (is_par > (I32)U8_MAX)
4720 if (is_par && pars==1 && data) {
4721 data->flags |= SF_IN_PAR;
4722 data->flags &= ~SF_HAS_PAR;
4724 else if (pars && data) {
4725 data->flags |= SF_HAS_PAR;
4726 data->flags &= ~SF_IN_PAR;
4728 if (flags & SCF_DO_STCLASS_OR)
4729 cl_and(data->start_class, and_withp);
4730 if (flags & SCF_TRIE_RESTUDY)
4731 data->flags |= SCF_TRIE_RESTUDY;
4733 DEBUG_STUDYDATA("post-fin:",data,depth);
4735 return min < stopmin ? min : stopmin;
4739 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4741 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4743 PERL_ARGS_ASSERT_ADD_DATA;
4745 Renewc(RExC_rxi->data,
4746 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4747 char, struct reg_data);
4749 Renew(RExC_rxi->data->what, count + n, U8);
4751 Newx(RExC_rxi->data->what, n, U8);
4752 RExC_rxi->data->count = count + n;
4753 Copy(s, RExC_rxi->data->what + count, n, U8);
4757 /*XXX: todo make this not included in a non debugging perl */
4758 #ifndef PERL_IN_XSUB_RE
4760 Perl_reginitcolors(pTHX)
4763 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4765 char *t = savepv(s);
4769 t = strchr(t, '\t');
4775 PL_colors[i] = t = (char *)"";
4780 PL_colors[i++] = (char *)"";
4787 #ifdef TRIE_STUDY_OPT
4788 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4791 (data.flags & SCF_TRIE_RESTUDY) \
4799 #define CHECK_RESTUDY_GOTO_butfirst
4803 * pregcomp - compile a regular expression into internal code
4805 * Decides which engine's compiler to call based on the hint currently in
4809 #ifndef PERL_IN_XSUB_RE
4811 /* return the currently in-scope regex engine (or the default if none) */
4813 regexp_engine const *
4814 Perl_current_re_engine(pTHX)
4818 if (IN_PERL_COMPILETIME) {
4819 HV * const table = GvHV(PL_hintgv);
4823 return &PL_core_reg_engine;
4824 ptr = hv_fetchs(table, "regcomp", FALSE);
4825 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4826 return &PL_core_reg_engine;
4827 return INT2PTR(regexp_engine*,SvIV(*ptr));
4831 if (!PL_curcop->cop_hints_hash)
4832 return &PL_core_reg_engine;
4833 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4834 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4835 return &PL_core_reg_engine;
4836 return INT2PTR(regexp_engine*,SvIV(ptr));
4842 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4845 regexp_engine const *eng = current_re_engine();
4846 GET_RE_DEBUG_FLAGS_DECL;
4848 PERL_ARGS_ASSERT_PREGCOMP;
4850 /* Dispatch a request to compile a regexp to correct regexp engine. */
4852 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4855 return CALLREGCOMP_ENG(eng, pattern, flags);
4859 /* public(ish) entry point for the perl core's own regex compiling code.
4860 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4861 * pattern rather than a list of OPs, and uses the internal engine rather
4862 * than the current one */
4865 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4867 SV *pat = pattern; /* defeat constness! */
4868 PERL_ARGS_ASSERT_RE_COMPILE;
4869 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4870 #ifdef PERL_IN_XSUB_RE
4873 &PL_core_reg_engine,
4875 NULL, NULL, rx_flags, 0);
4878 /* see if there are any run-time code blocks in the pattern.
4879 * False positives are allowed */
4882 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4883 U32 pm_flags, char *pat, STRLEN plen)
4888 /* avoid infinitely recursing when we recompile the pattern parcelled up
4889 * as qr'...'. A single constant qr// string can't have have any
4890 * run-time component in it, and thus, no runtime code. (A non-qr
4891 * string, however, can, e.g. $x =~ '(?{})') */
4892 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4895 for (s = 0; s < plen; s++) {
4896 if (n < pRExC_state->num_code_blocks
4897 && s == pRExC_state->code_blocks[n].start)
4899 s = pRExC_state->code_blocks[n].end;
4903 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4905 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
4907 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
4914 /* Handle run-time code blocks. We will already have compiled any direct
4915 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4916 * copy of it, but with any literal code blocks blanked out and
4917 * appropriate chars escaped; then feed it into
4919 * eval "qr'modified_pattern'"
4923 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4927 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4929 * After eval_sv()-ing that, grab any new code blocks from the returned qr
4930 * and merge them with any code blocks of the original regexp.
4932 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4933 * instead, just save the qr and return FALSE; this tells our caller that
4934 * the original pattern needs upgrading to utf8.
4938 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4939 char *pat, STRLEN plen)
4943 GET_RE_DEBUG_FLAGS_DECL;
4945 if (pRExC_state->runtime_code_qr) {
4946 /* this is the second time we've been called; this should
4947 * only happen if the main pattern got upgraded to utf8
4948 * during compilation; re-use the qr we compiled first time
4949 * round (which should be utf8 too)
4951 qr = pRExC_state->runtime_code_qr;
4952 pRExC_state->runtime_code_qr = NULL;
4953 assert(RExC_utf8 && SvUTF8(qr));
4959 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4963 /* determine how many extra chars we need for ' and \ escaping */
4964 for (s = 0; s < plen; s++) {
4965 if (pat[s] == '\'' || pat[s] == '\\')
4969 Newx(newpat, newlen, char);
4971 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4973 for (s = 0; s < plen; s++) {
4974 if (n < pRExC_state->num_code_blocks
4975 && s == pRExC_state->code_blocks[n].start)
4977 /* blank out literal code block */
4978 assert(pat[s] == '(');
4979 while (s <= pRExC_state->code_blocks[n].end) {
4987 if (pat[s] == '\'' || pat[s] == '\\')
4992 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4996 PerlIO_printf(Perl_debug_log,
4997 "%sre-parsing pattern for runtime code:%s %s\n",
4998 PL_colors[4],PL_colors[5],newpat);
5001 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5007 PUSHSTACKi(PERLSI_REQUIRE);
5008 /* this causes the toker to collapse \\ into \ when parsing
5009 * qr''; normally only q'' does this. It also alters hints
5011 PL_reg_state.re_reparsing = TRUE;
5012 eval_sv(sv, G_SCALAR);
5013 SvREFCNT_dec_NN(sv);
5018 SV * const errsv = ERRSV;
5019 if (SvTRUE_NN(errsv))
5021 Safefree(pRExC_state->code_blocks);
5022 /* use croak_sv ? */
5023 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5026 assert(SvROK(qr_ref));
5028 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5029 /* the leaving below frees the tmp qr_ref.
5030 * Give qr a life of its own */
5038 if (!RExC_utf8 && SvUTF8(qr)) {
5039 /* first time through; the pattern got upgraded; save the
5040 * qr for the next time through */
5041 assert(!pRExC_state->runtime_code_qr);
5042 pRExC_state->runtime_code_qr = qr;
5047 /* extract any code blocks within the returned qr// */
5050 /* merge the main (r1) and run-time (r2) code blocks into one */
5052 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5053 struct reg_code_block *new_block, *dst;
5054 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5057 if (!r2->num_code_blocks) /* we guessed wrong */
5059 SvREFCNT_dec_NN(qr);
5064 r1->num_code_blocks + r2->num_code_blocks,
5065 struct reg_code_block);
5068 while ( i1 < r1->num_code_blocks
5069 || i2 < r2->num_code_blocks)
5071 struct reg_code_block *src;
5074 if (i1 == r1->num_code_blocks) {
5075 src = &r2->code_blocks[i2++];
5078 else if (i2 == r2->num_code_blocks)
5079 src = &r1->code_blocks[i1++];
5080 else if ( r1->code_blocks[i1].start
5081 < r2->code_blocks[i2].start)
5083 src = &r1->code_blocks[i1++];
5084 assert(src->end < r2->code_blocks[i2].start);
5087 assert( r1->code_blocks[i1].start
5088 > r2->code_blocks[i2].start);
5089 src = &r2->code_blocks[i2++];
5091 assert(src->end < r1->code_blocks[i1].start);
5094 assert(pat[src->start] == '(');
5095 assert(pat[src->end] == ')');
5096 dst->start = src->start;
5097 dst->end = src->end;
5098 dst->block = src->block;
5099 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5103 r1->num_code_blocks += r2->num_code_blocks;
5104 Safefree(r1->code_blocks);
5105 r1->code_blocks = new_block;
5108 SvREFCNT_dec_NN(qr);
5114 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5116 /* This is the common code for setting up the floating and fixed length
5117 * string data extracted from Perlre_op_compile() below. Returns a boolean
5118 * as to whether succeeded or not */
5122 if (! (longest_length
5123 || (eol /* Can't have SEOL and MULTI */
5124 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5126 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5127 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5132 /* copy the information about the longest from the reg_scan_data
5133 over to the program. */
5134 if (SvUTF8(sv_longest)) {
5135 *rx_utf8 = sv_longest;
5138 *rx_substr = sv_longest;
5141 /* end_shift is how many chars that must be matched that
5142 follow this item. We calculate it ahead of time as once the
5143 lookbehind offset is added in we lose the ability to correctly
5145 ml = minlen ? *(minlen) : (I32)longest_length;
5146 *rx_end_shift = ml - offset
5147 - longest_length + (SvTAIL(sv_longest) != 0)
5150 t = (eol/* Can't have SEOL and MULTI */
5151 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5152 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5158 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5159 * regular expression into internal code.
5160 * The pattern may be passed either as:
5161 * a list of SVs (patternp plus pat_count)
5162 * a list of OPs (expr)
5163 * If both are passed, the SV list is used, but the OP list indicates
5164 * which SVs are actually pre-compiled code blocks
5166 * The SVs in the list have magic and qr overloading applied to them (and
5167 * the list may be modified in-place with replacement SVs in the latter
5170 * If the pattern hasn't changed from old_re, then old_re will be
5173 * eng is the current engine. If that engine has an op_comp method, then
5174 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5175 * do the initial concatenation of arguments and pass on to the external
5178 * If is_bare_re is not null, set it to a boolean indicating whether the
5179 * arg list reduced (after overloading) to a single bare regex which has
5180 * been returned (i.e. /$qr/).
5182 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5184 * pm_flags contains the PMf_* flags, typically based on those from the
5185 * pm_flags field of the related PMOP. Currently we're only interested in
5186 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5188 * We can't allocate space until we know how big the compiled form will be,
5189 * but we can't compile it (and thus know how big it is) until we've got a
5190 * place to put the code. So we cheat: we compile it twice, once with code
5191 * generation turned off and size counting turned on, and once "for real".
5192 * This also means that we don't allocate space until we are sure that the
5193 * thing really will compile successfully, and we never have to move the
5194 * code and thus invalidate pointers into it. (Note that it has to be in
5195 * one piece because free() must be able to free it all.) [NB: not true in perl]
5197 * Beware that the optimization-preparation code in here knows about some
5198 * of the structure of the compiled regexp. [I'll say.]
5202 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5203 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5204 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5209 regexp_internal *ri;
5218 SV * VOL code_blocksv = NULL;
5220 /* these are all flags - maybe they should be turned
5221 * into a single int with different bit masks */
5222 I32 sawlookahead = 0;
5225 bool used_setjump = FALSE;
5226 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5227 bool code_is_utf8 = 0;
5228 bool VOL recompile = 0;
5229 bool runtime_code = 0;
5233 RExC_state_t RExC_state;
5234 RExC_state_t * const pRExC_state = &RExC_state;
5235 #ifdef TRIE_STUDY_OPT
5237 RExC_state_t copyRExC_state;
5239 GET_RE_DEBUG_FLAGS_DECL;
5241 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5243 DEBUG_r(if (!PL_colorset) reginitcolors());
5245 #ifndef PERL_IN_XSUB_RE
5246 /* Initialize these here instead of as-needed, as is quick and avoids
5247 * having to test them each time otherwise */
5248 if (! PL_AboveLatin1) {
5249 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5250 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5251 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5253 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5254 = _new_invlist_C_array(L1PosixAlnum_invlist);
5255 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5256 = _new_invlist_C_array(PosixAlnum_invlist);
5258 PL_L1Posix_ptrs[_CC_ALPHA]
5259 = _new_invlist_C_array(L1PosixAlpha_invlist);
5260 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5262 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5263 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5265 /* Cased is the same as Alpha in the ASCII range */
5266 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5267 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5269 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5270 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5272 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5273 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5275 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5276 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5278 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5279 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5281 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5282 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5284 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5285 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5287 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5288 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5289 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5290 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5292 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5293 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5295 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5297 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5298 PL_L1Posix_ptrs[_CC_WORDCHAR]
5299 = _new_invlist_C_array(L1PosixWord_invlist);
5301 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5302 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5304 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5308 pRExC_state->code_blocks = NULL;
5309 pRExC_state->num_code_blocks = 0;
5312 *is_bare_re = FALSE;
5314 if (expr && (expr->op_type == OP_LIST ||
5315 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5317 /* is the source UTF8, and how many code blocks are there? */
5321 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5322 if (o->op_type == OP_CONST) {
5323 /* skip if we have SVs as well as OPs. In this case,
5324 * a) we decide utf8 based on SVs not OPs;
5325 * b) the current pad may not match that which the ops
5326 * were compiled in, so, so on threaded builds,
5327 * cSVOPo_sv would look in the wrong pad */
5328 if (!pat_count && SvUTF8(cSVOPo_sv))
5331 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5332 /* count of DO blocks */
5336 pRExC_state->num_code_blocks = ncode;
5337 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5342 /* handle a list of SVs */
5346 /* apply magic and RE overloading to each arg */
5347 for (svp = patternp; svp < patternp + pat_count; svp++) {
5350 if (SvROK(rx) && SvAMAGIC(rx)) {
5351 SV *sv = AMG_CALLunary(rx, regexp_amg);
5355 if (SvTYPE(sv) != SVt_REGEXP)
5356 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5362 if (pat_count > 1) {
5363 /* concat multiple args and find any code block indexes */
5368 STRLEN orig_patlen = 0;
5370 if (pRExC_state->num_code_blocks) {
5371 o = cLISTOPx(expr)->op_first;
5372 assert( o->op_type == OP_PUSHMARK
5373 || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5374 || o->op_type == OP_PADRANGE);
5378 pat = newSVpvn("", 0);
5381 /* determine if the pattern is going to be utf8 (needed
5382 * in advance to align code block indices correctly).
5383 * XXX This could fail to be detected for an arg with
5384 * overloading but not concat overloading; but the main effect
5385 * in this obscure case is to need a 'use re eval' for a
5386 * literal code block */
5387 for (svp = patternp; svp < patternp + pat_count; svp++) {
5394 for (svp = patternp; svp < patternp + pat_count; svp++) {
5395 SV *sv, *msv = *svp;
5398 /* we make the assumption here that each op in the list of
5399 * op_siblings maps to one SV pushed onto the stack,
5400 * except for code blocks, with have both an OP_NULL and
5402 * This allows us to match up the list of SVs against the
5403 * list of OPs to find the next code block.
5405 * Note that PUSHMARK PADSV PADSV ..
5407 * PADRANGE NULL NULL ..
5408 * so the alignment still works. */
5410 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5411 assert(n < pRExC_state->num_code_blocks);
5412 pRExC_state->code_blocks[n].start = SvCUR(pat);
5413 pRExC_state->code_blocks[n].block = o;
5414 pRExC_state->code_blocks[n].src_regex = NULL;
5417 o = o->op_sibling; /* skip CONST */
5423 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5424 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5427 /* overloading involved: all bets are off over literal
5428 * code. Pretend we haven't seen it */
5429 pRExC_state->num_code_blocks -= n;
5435 while (SvAMAGIC(msv)
5436 && (sv = AMG_CALLunary(msv, string_amg))
5440 && SvRV(msv) == SvRV(sv))
5445 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5447 orig_patlen = SvCUR(pat);
5448 sv_catsv_nomg(pat, msv);
5451 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5454 /* extract any code blocks within any embedded qr//'s */
5455 if (rx && SvTYPE(rx) == SVt_REGEXP
5456 && RX_ENGINE((REGEXP*)rx)->op_comp)
5459 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5460 if (ri->num_code_blocks) {
5462 /* the presence of an embedded qr// with code means
5463 * we should always recompile: the text of the
5464 * qr// may not have changed, but it may be a
5465 * different closure than last time */
5467 Renew(pRExC_state->code_blocks,
5468 pRExC_state->num_code_blocks + ri->num_code_blocks,
5469 struct reg_code_block);
5470 pRExC_state->num_code_blocks += ri->num_code_blocks;
5471 for (i=0; i < ri->num_code_blocks; i++) {
5472 struct reg_code_block *src, *dst;
5473 STRLEN offset = orig_patlen
5474 + ReANY((REGEXP *)rx)->pre_prefix;
5475 assert(n < pRExC_state->num_code_blocks);
5476 src = &ri->code_blocks[i];
5477 dst = &pRExC_state->code_blocks[n];
5478 dst->start = src->start + offset;
5479 dst->end = src->end + offset;
5480 dst->block = src->block;
5481 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5495 while (SvAMAGIC(pat)
5496 && (sv = AMG_CALLunary(pat, string_amg))
5504 /* handle bare regex: foo =~ $re */
5509 if (SvTYPE(re) == SVt_REGEXP) {
5513 Safefree(pRExC_state->code_blocks);
5519 /* not a list of SVs, so must be a list of OPs */
5521 if (expr->op_type == OP_LIST) {
5526 pat = newSVpvn("", 0);
5531 /* given a list of CONSTs and DO blocks in expr, append all
5532 * the CONSTs to pat, and record the start and end of each
5533 * code block in code_blocks[] (each DO{} op is followed by an
5534 * OP_CONST containing the corresponding literal '(?{...})
5537 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5538 if (o->op_type == OP_CONST) {
5539 sv_catsv(pat, cSVOPo_sv);
5541 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5545 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5546 assert(i+1 < pRExC_state->num_code_blocks);
5547 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5548 pRExC_state->code_blocks[i].block = o;
5549 pRExC_state->code_blocks[i].src_regex = NULL;
5555 assert(expr->op_type == OP_CONST);
5556 pat = cSVOPx_sv(expr);
5560 exp = SvPV_nomg(pat, plen);
5562 if (!eng->op_comp) {
5563 if ((SvUTF8(pat) && IN_BYTES)
5564 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5566 /* make a temporary copy; either to convert to bytes,
5567 * or to avoid repeating get-magic / overloaded stringify */
5568 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5569 (IN_BYTES ? 0 : SvUTF8(pat)));
5571 Safefree(pRExC_state->code_blocks);
5572 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5575 /* ignore the utf8ness if the pattern is 0 length */
5576 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5577 RExC_uni_semantics = 0;
5578 RExC_contains_locale = 0;
5579 pRExC_state->runtime_code_qr = NULL;
5581 /****************** LONG JUMP TARGET HERE***********************/
5582 /* Longjmp back to here if have to switch in midstream to utf8 */
5583 if (! RExC_orig_utf8) {
5584 JMPENV_PUSH(jump_ret);
5585 used_setjump = TRUE;
5588 if (jump_ret == 0) { /* First time through */
5592 SV *dsv= sv_newmortal();
5593 RE_PV_QUOTED_DECL(s, RExC_utf8,
5594 dsv, exp, plen, 60);
5595 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5596 PL_colors[4],PL_colors[5],s);
5599 else { /* longjumped back */
5602 STRLEN s = 0, d = 0;
5605 /* If the cause for the longjmp was other than changing to utf8, pop
5606 * our own setjmp, and longjmp to the correct handler */
5607 if (jump_ret != UTF8_LONGJMP) {
5609 JMPENV_JUMP(jump_ret);
5614 /* It's possible to write a regexp in ascii that represents Unicode
5615 codepoints outside of the byte range, such as via \x{100}. If we
5616 detect such a sequence we have to convert the entire pattern to utf8
5617 and then recompile, as our sizing calculation will have been based
5618 on 1 byte == 1 character, but we will need to use utf8 to encode
5619 at least some part of the pattern, and therefore must convert the whole
5622 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5623 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5625 /* upgrade pattern to UTF8, and if there are code blocks,
5626 * recalculate the indices.
5627 * This is essentially an unrolled Perl_bytes_to_utf8() */
5629 src = (U8*)SvPV_nomg(pat, plen);
5630 Newx(dst, plen * 2 + 1, U8);
5633 const UV uv = NATIVE_TO_ASCII(src[s]);
5634 if (UNI_IS_INVARIANT(uv))
5635 dst[d] = (U8)UTF_TO_NATIVE(uv);
5637 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5638 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5640 if (n < pRExC_state->num_code_blocks) {
5641 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5642 pRExC_state->code_blocks[n].start = d;
5643 assert(dst[d] == '(');
5646 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5647 pRExC_state->code_blocks[n].end = d;
5648 assert(dst[d] == ')');
5661 RExC_orig_utf8 = RExC_utf8 = 1;
5664 /* return old regex if pattern hasn't changed */
5668 && !!RX_UTF8(old_re) == !!RExC_utf8
5669 && RX_PRECOMP(old_re)
5670 && RX_PRELEN(old_re) == plen
5671 && memEQ(RX_PRECOMP(old_re), exp, plen))
5673 /* with runtime code, always recompile */
5674 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5676 if (!runtime_code) {
5680 Safefree(pRExC_state->code_blocks);
5684 else if ((pm_flags & PMf_USE_RE_EVAL)
5685 /* this second condition covers the non-regex literal case,
5686 * i.e. $foo =~ '(?{})'. */
5687 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5688 && (PL_hints & HINT_RE_EVAL))
5690 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5693 #ifdef TRIE_STUDY_OPT
5697 rx_flags = orig_rx_flags;
5699 if (initial_charset == REGEX_LOCALE_CHARSET) {
5700 RExC_contains_locale = 1;
5702 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5704 /* Set to use unicode semantics if the pattern is in utf8 and has the
5705 * 'depends' charset specified, as it means unicode when utf8 */
5706 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5710 RExC_flags = rx_flags;
5711 RExC_pm_flags = pm_flags;
5714 if (TAINTING_get && TAINT_get)
5715 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5717 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5718 /* whoops, we have a non-utf8 pattern, whilst run-time code
5719 * got compiled as utf8. Try again with a utf8 pattern */
5720 JMPENV_JUMP(UTF8_LONGJMP);
5723 assert(!pRExC_state->runtime_code_qr);
5728 RExC_in_lookbehind = 0;
5729 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5731 RExC_override_recoding = 0;
5732 RExC_in_multi_char_class = 0;
5734 /* First pass: determine size, legality. */
5742 RExC_emit = &PL_regdummy;
5743 RExC_whilem_seen = 0;
5744 RExC_open_parens = NULL;
5745 RExC_close_parens = NULL;
5747 RExC_paren_names = NULL;
5749 RExC_paren_name_list = NULL;
5751 RExC_recurse = NULL;
5752 RExC_recurse_count = 0;
5753 pRExC_state->code_index = 0;
5755 #if 0 /* REGC() is (currently) a NOP at the first pass.
5756 * Clever compilers notice this and complain. --jhi */
5757 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5760 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5762 RExC_lastparse=NULL;
5764 /* reg may croak on us, not giving us a chance to free
5765 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5766 need it to survive as long as the regexp (qr/(?{})/).
5767 We must check that code_blocksv is not already set, because we may
5768 have longjmped back. */
5769 if (pRExC_state->code_blocks && !code_blocksv) {
5770 code_blocksv = newSV_type(SVt_PV);
5771 SAVEFREESV(code_blocksv);
5772 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5773 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5775 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5776 if (flags & RESTART_UTF8)
5777 JMPENV_JUMP(UTF8_LONGJMP);
5778 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
5781 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5783 /* Here, finished first pass. Get rid of any added setjmp */
5789 PerlIO_printf(Perl_debug_log,
5790 "Required size %"IVdf" nodes\n"
5791 "Starting second pass (creation)\n",
5794 RExC_lastparse=NULL;
5797 /* The first pass could have found things that force Unicode semantics */
5798 if ((RExC_utf8 || RExC_uni_semantics)
5799 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5801 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5804 /* Small enough for pointer-storage convention?
5805 If extralen==0, this means that we will not need long jumps. */
5806 if (RExC_size >= 0x10000L && RExC_extralen)
5807 RExC_size += RExC_extralen;
5810 if (RExC_whilem_seen > 15)
5811 RExC_whilem_seen = 15;
5813 /* Allocate space and zero-initialize. Note, the two step process
5814 of zeroing when in debug mode, thus anything assigned has to
5815 happen after that */
5816 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5818 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5819 char, regexp_internal);
5820 if ( r == NULL || ri == NULL )
5821 FAIL("Regexp out of space");
5823 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5824 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5826 /* bulk initialize base fields with 0. */
5827 Zero(ri, sizeof(regexp_internal), char);
5830 /* non-zero initialization begins here */
5833 r->extflags = rx_flags;
5834 if (pm_flags & PMf_IS_QR) {
5835 ri->code_blocks = pRExC_state->code_blocks;
5836 ri->num_code_blocks = pRExC_state->num_code_blocks;
5841 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5842 if (pRExC_state->code_blocks[n].src_regex)
5843 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5844 SAVEFREEPV(pRExC_state->code_blocks);
5848 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5849 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5851 /* The caret is output if there are any defaults: if not all the STD
5852 * flags are set, or if no character set specifier is needed */
5854 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5856 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5857 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5858 >> RXf_PMf_STD_PMMOD_SHIFT);
5859 const char *fptr = STD_PAT_MODS; /*"msix"*/
5861 /* Allocate for the worst case, which is all the std flags are turned
5862 * on. If more precision is desired, we could do a population count of
5863 * the flags set. This could be done with a small lookup table, or by
5864 * shifting, masking and adding, or even, when available, assembly
5865 * language for a machine-language population count.
5866 * We never output a minus, as all those are defaults, so are
5867 * covered by the caret */
5868 const STRLEN wraplen = plen + has_p + has_runon
5869 + has_default /* If needs a caret */
5871 /* If needs a character set specifier */
5872 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5873 + (sizeof(STD_PAT_MODS) - 1)
5874 + (sizeof("(?:)") - 1);
5876 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5877 r->xpv_len_u.xpvlenu_pv = p;
5879 SvFLAGS(rx) |= SVf_UTF8;
5882 /* If a default, cover it using the caret */
5884 *p++= DEFAULT_PAT_MOD;
5888 const char* const name = get_regex_charset_name(r->extflags, &len);
5889 Copy(name, p, len, char);
5893 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5896 while((ch = *fptr++)) {
5904 Copy(RExC_precomp, p, plen, char);
5905 assert ((RX_WRAPPED(rx) - p) < 16);
5906 r->pre_prefix = p - RX_WRAPPED(rx);
5912 SvCUR_set(rx, p - RX_WRAPPED(rx));
5916 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5918 if (RExC_seen & REG_SEEN_RECURSE) {
5919 Newxz(RExC_open_parens, RExC_npar,regnode *);
5920 SAVEFREEPV(RExC_open_parens);
5921 Newxz(RExC_close_parens,RExC_npar,regnode *);
5922 SAVEFREEPV(RExC_close_parens);
5925 /* Useful during FAIL. */
5926 #ifdef RE_TRACK_PATTERN_OFFSETS
5927 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5928 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5929 "%s %"UVuf" bytes for offset annotations.\n",
5930 ri->u.offsets ? "Got" : "Couldn't get",
5931 (UV)((2*RExC_size+1) * sizeof(U32))));
5933 SetProgLen(ri,RExC_size);
5938 /* Second pass: emit code. */
5939 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5940 RExC_pm_flags = pm_flags;
5945 RExC_emit_start = ri->program;
5946 RExC_emit = ri->program;
5947 RExC_emit_bound = ri->program + RExC_size + 1;
5948 pRExC_state->code_index = 0;
5950 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5951 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5953 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
5955 /* XXXX To minimize changes to RE engine we always allocate
5956 3-units-long substrs field. */
5957 Newx(r->substrs, 1, struct reg_substr_data);
5958 if (RExC_recurse_count) {
5959 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5960 SAVEFREEPV(RExC_recurse);
5964 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5965 Zero(r->substrs, 1, struct reg_substr_data);
5967 #ifdef TRIE_STUDY_OPT
5969 StructCopy(&zero_scan_data, &data, scan_data_t);
5970 copyRExC_state = RExC_state;
5973 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5975 RExC_state = copyRExC_state;
5976 if (seen & REG_TOP_LEVEL_BRANCHES)
5977 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5979 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5980 StructCopy(&zero_scan_data, &data, scan_data_t);
5983 StructCopy(&zero_scan_data, &data, scan_data_t);
5986 /* Dig out information for optimizations. */
5987 r->extflags = RExC_flags; /* was pm_op */
5988 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5991 SvUTF8_on(rx); /* Unicode in it? */
5992 ri->regstclass = NULL;
5993 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5994 r->intflags |= PREGf_NAUGHTY;
5995 scan = ri->program + 1; /* First BRANCH. */
5997 /* testing for BRANCH here tells us whether there is "must appear"
5998 data in the pattern. If there is then we can use it for optimisations */
5999 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6001 STRLEN longest_float_length, longest_fixed_length;
6002 struct regnode_charclass_class ch_class; /* pointed to by data */
6004 I32 last_close = 0; /* pointed to by data */
6005 regnode *first= scan;
6006 regnode *first_next= regnext(first);
6008 * Skip introductions and multiplicators >= 1
6009 * so that we can extract the 'meat' of the pattern that must
6010 * match in the large if() sequence following.
6011 * NOTE that EXACT is NOT covered here, as it is normally
6012 * picked up by the optimiser separately.
6014 * This is unfortunate as the optimiser isnt handling lookahead
6015 * properly currently.
6018 while ((OP(first) == OPEN && (sawopen = 1)) ||
6019 /* An OR of *one* alternative - should not happen now. */
6020 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6021 /* for now we can't handle lookbehind IFMATCH*/
6022 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6023 (OP(first) == PLUS) ||
6024 (OP(first) == MINMOD) ||
6025 /* An {n,m} with n>0 */
6026 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6027 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6030 * the only op that could be a regnode is PLUS, all the rest
6031 * will be regnode_1 or regnode_2.
6034 if (OP(first) == PLUS)
6037 first += regarglen[OP(first)];
6039 first = NEXTOPER(first);
6040 first_next= regnext(first);
6043 /* Starting-point info. */
6045 DEBUG_PEEP("first:",first,0);
6046 /* Ignore EXACT as we deal with it later. */
6047 if (PL_regkind[OP(first)] == EXACT) {
6048 if (OP(first) == EXACT)
6049 NOOP; /* Empty, get anchored substr later. */
6051 ri->regstclass = first;
6054 else if (PL_regkind[OP(first)] == TRIE &&
6055 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6058 /* this can happen only on restudy */
6059 if ( OP(first) == TRIE ) {
6060 struct regnode_1 *trieop = (struct regnode_1 *)
6061 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6062 StructCopy(first,trieop,struct regnode_1);
6063 trie_op=(regnode *)trieop;
6065 struct regnode_charclass *trieop = (struct regnode_charclass *)
6066 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6067 StructCopy(first,trieop,struct regnode_charclass);
6068 trie_op=(regnode *)trieop;
6071 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6072 ri->regstclass = trie_op;
6075 else if (REGNODE_SIMPLE(OP(first)))
6076 ri->regstclass = first;
6077 else if (PL_regkind[OP(first)] == BOUND ||
6078 PL_regkind[OP(first)] == NBOUND)
6079 ri->regstclass = first;
6080 else if (PL_regkind[OP(first)] == BOL) {
6081 r->extflags |= (OP(first) == MBOL
6083 : (OP(first) == SBOL
6086 first = NEXTOPER(first);
6089 else if (OP(first) == GPOS) {
6090 r->extflags |= RXf_ANCH_GPOS;
6091 first = NEXTOPER(first);
6094 else if ((!sawopen || !RExC_sawback) &&
6095 (OP(first) == STAR &&
6096 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6097 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6099 /* turn .* into ^.* with an implied $*=1 */
6101 (OP(NEXTOPER(first)) == REG_ANY)
6104 r->extflags |= type;
6105 r->intflags |= PREGf_IMPLICIT;
6106 first = NEXTOPER(first);
6109 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6110 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6111 /* x+ must match at the 1st pos of run of x's */
6112 r->intflags |= PREGf_SKIP;
6114 /* Scan is after the zeroth branch, first is atomic matcher. */
6115 #ifdef TRIE_STUDY_OPT
6118 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6119 (IV)(first - scan + 1))
6123 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6124 (IV)(first - scan + 1))
6130 * If there's something expensive in the r.e., find the
6131 * longest literal string that must appear and make it the
6132 * regmust. Resolve ties in favor of later strings, since
6133 * the regstart check works with the beginning of the r.e.
6134 * and avoiding duplication strengthens checking. Not a
6135 * strong reason, but sufficient in the absence of others.
6136 * [Now we resolve ties in favor of the earlier string if
6137 * it happens that c_offset_min has been invalidated, since the
6138 * earlier string may buy us something the later one won't.]
6141 data.longest_fixed = newSVpvs("");
6142 data.longest_float = newSVpvs("");
6143 data.last_found = newSVpvs("");
6144 data.longest = &(data.longest_fixed);
6145 ENTER_with_name("study_chunk");
6146 SAVEFREESV(data.longest_fixed);
6147 SAVEFREESV(data.longest_float);
6148 SAVEFREESV(data.last_found);
6150 if (!ri->regstclass) {
6151 cl_init(pRExC_state, &ch_class);
6152 data.start_class = &ch_class;
6153 stclass_flag = SCF_DO_STCLASS_AND;
6154 } else /* XXXX Check for BOUND? */
6156 data.last_closep = &last_close;
6158 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6159 &data, -1, NULL, NULL,
6160 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6163 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6166 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6167 && data.last_start_min == 0 && data.last_end > 0
6168 && !RExC_seen_zerolen
6169 && !(RExC_seen & REG_SEEN_VERBARG)
6170 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6171 r->extflags |= RXf_CHECK_ALL;
6172 scan_commit(pRExC_state, &data,&minlen,0);
6174 longest_float_length = CHR_SVLEN(data.longest_float);
6176 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6177 && data.offset_fixed == data.offset_float_min
6178 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6179 && S_setup_longest (aTHX_ pRExC_state,
6183 &(r->float_end_shift),
6184 data.lookbehind_float,
6185 data.offset_float_min,
6187 longest_float_length,
6188 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6189 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6191 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6192 r->float_max_offset = data.offset_float_max;
6193 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6194 r->float_max_offset -= data.lookbehind_float;
6195 SvREFCNT_inc_simple_void_NN(data.longest_float);
6198 r->float_substr = r->float_utf8 = NULL;
6199 longest_float_length = 0;
6202 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6204 if (S_setup_longest (aTHX_ pRExC_state,
6206 &(r->anchored_utf8),
6207 &(r->anchored_substr),
6208 &(r->anchored_end_shift),
6209 data.lookbehind_fixed,
6212 longest_fixed_length,
6213 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6214 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6216 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6217 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6220 r->anchored_substr = r->anchored_utf8 = NULL;
6221 longest_fixed_length = 0;
6223 LEAVE_with_name("study_chunk");
6226 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6227 ri->regstclass = NULL;
6229 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6231 && ! TEST_SSC_EOS(data.start_class)
6232 && !cl_is_anything(data.start_class))
6234 const U32 n = add_data(pRExC_state, 1, "f");
6235 OP(data.start_class) = ANYOF_SYNTHETIC;
6237 Newx(RExC_rxi->data->data[n], 1,
6238 struct regnode_charclass_class);
6239 StructCopy(data.start_class,
6240 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6241 struct regnode_charclass_class);
6242 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6243 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6244 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6245 regprop(r, sv, (regnode*)data.start_class);
6246 PerlIO_printf(Perl_debug_log,
6247 "synthetic stclass \"%s\".\n",
6248 SvPVX_const(sv));});
6251 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6252 if (longest_fixed_length > longest_float_length) {
6253 r->check_end_shift = r->anchored_end_shift;
6254 r->check_substr = r->anchored_substr;
6255 r->check_utf8 = r->anchored_utf8;
6256 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6257 if (r->extflags & RXf_ANCH_SINGLE)
6258 r->extflags |= RXf_NOSCAN;
6261 r->check_end_shift = r->float_end_shift;
6262 r->check_substr = r->float_substr;
6263 r->check_utf8 = r->float_utf8;
6264 r->check_offset_min = r->float_min_offset;
6265 r->check_offset_max = r->float_max_offset;
6267 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6268 This should be changed ASAP! */
6269 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6270 r->extflags |= RXf_USE_INTUIT;
6271 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6272 r->extflags |= RXf_INTUIT_TAIL;
6274 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6275 if ( (STRLEN)minlen < longest_float_length )
6276 minlen= longest_float_length;
6277 if ( (STRLEN)minlen < longest_fixed_length )
6278 minlen= longest_fixed_length;
6282 /* Several toplevels. Best we can is to set minlen. */
6284 struct regnode_charclass_class ch_class;
6287 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6289 scan = ri->program + 1;
6290 cl_init(pRExC_state, &ch_class);
6291 data.start_class = &ch_class;
6292 data.last_closep = &last_close;
6295 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6296 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6298 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6300 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6301 = r->float_substr = r->float_utf8 = NULL;
6303 if (! TEST_SSC_EOS(data.start_class)
6304 && !cl_is_anything(data.start_class))
6306 const U32 n = add_data(pRExC_state, 1, "f");
6307 OP(data.start_class) = ANYOF_SYNTHETIC;
6309 Newx(RExC_rxi->data->data[n], 1,
6310 struct regnode_charclass_class);
6311 StructCopy(data.start_class,
6312 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6313 struct regnode_charclass_class);
6314 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6315 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6316 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6317 regprop(r, sv, (regnode*)data.start_class);
6318 PerlIO_printf(Perl_debug_log,
6319 "synthetic stclass \"%s\".\n",
6320 SvPVX_const(sv));});
6324 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6325 the "real" pattern. */
6327 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6328 (IV)minlen, (IV)r->minlen);
6330 r->minlenret = minlen;
6331 if (r->minlen < minlen)
6334 if (RExC_seen & REG_SEEN_GPOS)
6335 r->extflags |= RXf_GPOS_SEEN;
6336 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6337 r->extflags |= RXf_LOOKBEHIND_SEEN;
6338 if (pRExC_state->num_code_blocks)
6339 r->extflags |= RXf_EVAL_SEEN;
6340 if (RExC_seen & REG_SEEN_CANY)
6341 r->extflags |= RXf_CANY_SEEN;
6342 if (RExC_seen & REG_SEEN_VERBARG)
6344 r->intflags |= PREGf_VERBARG_SEEN;
6345 r->extflags |= RXf_MODIFIES_VARS;
6347 if (RExC_seen & REG_SEEN_CUTGROUP)
6348 r->intflags |= PREGf_CUTGROUP_SEEN;
6349 if (pm_flags & PMf_USE_RE_EVAL)
6350 r->intflags |= PREGf_USE_RE_EVAL;
6351 if (RExC_paren_names)
6352 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6354 RXp_PAREN_NAMES(r) = NULL;
6356 #ifdef STUPID_PATTERN_CHECKS
6357 if (RX_PRELEN(rx) == 0)
6358 r->extflags |= RXf_NULL;
6359 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6360 r->extflags |= RXf_WHITE;
6361 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6362 r->extflags |= RXf_START_ONLY;
6365 regnode *first = ri->program + 1;
6368 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6369 r->extflags |= RXf_NULL;
6370 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6371 r->extflags |= RXf_START_ONLY;
6372 else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6373 && OP(regnext(first)) == END)
6374 r->extflags |= RXf_WHITE;
6378 if (RExC_paren_names) {
6379 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6380 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6383 ri->name_list_idx = 0;
6385 if (RExC_recurse_count) {
6386 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6387 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6388 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6391 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6392 /* assume we don't need to swap parens around before we match */
6395 PerlIO_printf(Perl_debug_log,"Final program:\n");
6398 #ifdef RE_TRACK_PATTERN_OFFSETS
6399 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6400 const U32 len = ri->u.offsets[0];
6402 GET_RE_DEBUG_FLAGS_DECL;
6403 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6404 for (i = 1; i <= len; i++) {
6405 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6406 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6407 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6409 PerlIO_printf(Perl_debug_log, "\n");
6414 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6415 * by setting the regexp SV to readonly-only instead. If the
6416 * pattern's been recompiled, the USEDness should remain. */
6417 if (old_re && SvREADONLY(old_re))
6425 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6428 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6430 PERL_UNUSED_ARG(value);
6432 if (flags & RXapif_FETCH) {
6433 return reg_named_buff_fetch(rx, key, flags);
6434 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6435 Perl_croak_no_modify();
6437 } else if (flags & RXapif_EXISTS) {
6438 return reg_named_buff_exists(rx, key, flags)
6441 } else if (flags & RXapif_REGNAMES) {
6442 return reg_named_buff_all(rx, flags);
6443 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6444 return reg_named_buff_scalar(rx, flags);
6446 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6452 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6455 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6456 PERL_UNUSED_ARG(lastkey);
6458 if (flags & RXapif_FIRSTKEY)
6459 return reg_named_buff_firstkey(rx, flags);
6460 else if (flags & RXapif_NEXTKEY)
6461 return reg_named_buff_nextkey(rx, flags);
6463 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6469 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6472 AV *retarray = NULL;
6474 struct regexp *const rx = ReANY(r);
6476 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6478 if (flags & RXapif_ALL)
6481 if (rx && RXp_PAREN_NAMES(rx)) {
6482 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6485 SV* sv_dat=HeVAL(he_str);
6486 I32 *nums=(I32*)SvPVX(sv_dat);
6487 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6488 if ((I32)(rx->nparens) >= nums[i]
6489 && rx->offs[nums[i]].start != -1
6490 && rx->offs[nums[i]].end != -1)
6493 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6498 ret = newSVsv(&PL_sv_undef);
6501 av_push(retarray, ret);
6504 return newRV_noinc(MUTABLE_SV(retarray));
6511 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6514 struct regexp *const rx = ReANY(r);
6516 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6518 if (rx && RXp_PAREN_NAMES(rx)) {
6519 if (flags & RXapif_ALL) {
6520 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6522 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6524 SvREFCNT_dec_NN(sv);
6536 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6538 struct regexp *const rx = ReANY(r);
6540 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6542 if ( rx && RXp_PAREN_NAMES(rx) ) {
6543 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6545 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6552 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6554 struct regexp *const rx = ReANY(r);
6555 GET_RE_DEBUG_FLAGS_DECL;
6557 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6559 if (rx && RXp_PAREN_NAMES(rx)) {
6560 HV *hv = RXp_PAREN_NAMES(rx);
6562 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6565 SV* sv_dat = HeVAL(temphe);
6566 I32 *nums = (I32*)SvPVX(sv_dat);
6567 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6568 if ((I32)(rx->lastparen) >= nums[i] &&
6569 rx->offs[nums[i]].start != -1 &&
6570 rx->offs[nums[i]].end != -1)
6576 if (parno || flags & RXapif_ALL) {
6577 return newSVhek(HeKEY_hek(temphe));
6585 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6590 struct regexp *const rx = ReANY(r);
6592 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6594 if (rx && RXp_PAREN_NAMES(rx)) {
6595 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6596 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6597 } else if (flags & RXapif_ONE) {
6598 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6599 av = MUTABLE_AV(SvRV(ret));
6600 length = av_len(av);
6601 SvREFCNT_dec_NN(ret);
6602 return newSViv(length + 1);
6604 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6608 return &PL_sv_undef;
6612 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6614 struct regexp *const rx = ReANY(r);
6617 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6619 if (rx && RXp_PAREN_NAMES(rx)) {
6620 HV *hv= RXp_PAREN_NAMES(rx);
6622 (void)hv_iterinit(hv);
6623 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6626 SV* sv_dat = HeVAL(temphe);
6627 I32 *nums = (I32*)SvPVX(sv_dat);
6628 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6629 if ((I32)(rx->lastparen) >= nums[i] &&
6630 rx->offs[nums[i]].start != -1 &&
6631 rx->offs[nums[i]].end != -1)
6637 if (parno || flags & RXapif_ALL) {
6638 av_push(av, newSVhek(HeKEY_hek(temphe)));
6643 return newRV_noinc(MUTABLE_SV(av));
6647 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6650 struct regexp *const rx = ReANY(r);
6656 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6658 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6659 || n == RX_BUFF_IDX_CARET_FULLMATCH
6660 || n == RX_BUFF_IDX_CARET_POSTMATCH
6662 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6669 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6670 /* no need to distinguish between them any more */
6671 n = RX_BUFF_IDX_FULLMATCH;
6673 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6674 && rx->offs[0].start != -1)
6676 /* $`, ${^PREMATCH} */
6677 i = rx->offs[0].start;
6681 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6682 && rx->offs[0].end != -1)
6684 /* $', ${^POSTMATCH} */
6685 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6686 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6689 if ( 0 <= n && n <= (I32)rx->nparens &&
6690 (s1 = rx->offs[n].start) != -1 &&
6691 (t1 = rx->offs[n].end) != -1)
6693 /* $&, ${^MATCH}, $1 ... */
6695 s = rx->subbeg + s1 - rx->suboffset;
6700 assert(s >= rx->subbeg);
6701 assert(rx->sublen >= (s - rx->subbeg) + i );
6703 #if NO_TAINT_SUPPORT
6704 sv_setpvn(sv, s, i);
6706 const int oldtainted = TAINT_get;
6708 sv_setpvn(sv, s, i);
6709 TAINT_set(oldtainted);
6711 if ( (rx->extflags & RXf_CANY_SEEN)
6712 ? (RXp_MATCH_UTF8(rx)
6713 && (!i || is_utf8_string((U8*)s, i)))
6714 : (RXp_MATCH_UTF8(rx)) )
6721 if (RXp_MATCH_TAINTED(rx)) {
6722 if (SvTYPE(sv) >= SVt_PVMG) {
6723 MAGIC* const mg = SvMAGIC(sv);
6726 SvMAGIC_set(sv, mg->mg_moremagic);
6728 if ((mgt = SvMAGIC(sv))) {
6729 mg->mg_moremagic = mgt;
6730 SvMAGIC_set(sv, mg);
6741 sv_setsv(sv,&PL_sv_undef);
6747 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6748 SV const * const value)
6750 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6752 PERL_UNUSED_ARG(rx);
6753 PERL_UNUSED_ARG(paren);
6754 PERL_UNUSED_ARG(value);
6757 Perl_croak_no_modify();
6761 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6764 struct regexp *const rx = ReANY(r);
6768 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6770 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6772 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6773 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6777 case RX_BUFF_IDX_PREMATCH: /* $` */
6778 if (rx->offs[0].start != -1) {
6779 i = rx->offs[0].start;
6788 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6789 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6791 case RX_BUFF_IDX_POSTMATCH: /* $' */
6792 if (rx->offs[0].end != -1) {
6793 i = rx->sublen - rx->offs[0].end;
6795 s1 = rx->offs[0].end;
6802 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6803 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6807 /* $& / ${^MATCH}, $1, $2, ... */
6809 if (paren <= (I32)rx->nparens &&
6810 (s1 = rx->offs[paren].start) != -1 &&
6811 (t1 = rx->offs[paren].end) != -1)
6817 if (ckWARN(WARN_UNINITIALIZED))
6818 report_uninit((const SV *)sv);
6823 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6824 const char * const s = rx->subbeg - rx->suboffset + s1;
6829 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6836 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6838 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6839 PERL_UNUSED_ARG(rx);
6843 return newSVpvs("Regexp");
6846 /* Scans the name of a named buffer from the pattern.
6847 * If flags is REG_RSN_RETURN_NULL returns null.
6848 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6849 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6850 * to the parsed name as looked up in the RExC_paren_names hash.
6851 * If there is an error throws a vFAIL().. type exception.
6854 #define REG_RSN_RETURN_NULL 0
6855 #define REG_RSN_RETURN_NAME 1
6856 #define REG_RSN_RETURN_DATA 2
6859 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6861 char *name_start = RExC_parse;
6863 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6865 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6866 /* skip IDFIRST by using do...while */
6869 RExC_parse += UTF8SKIP(RExC_parse);
6870 } while (isWORDCHAR_utf8((U8*)RExC_parse));
6874 } while (isWORDCHAR(*RExC_parse));
6876 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6877 vFAIL("Group name must start with a non-digit word character");
6881 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6882 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6883 if ( flags == REG_RSN_RETURN_NAME)
6885 else if (flags==REG_RSN_RETURN_DATA) {
6888 if ( ! sv_name ) /* should not happen*/
6889 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6890 if (RExC_paren_names)
6891 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6893 sv_dat = HeVAL(he_str);
6895 vFAIL("Reference to nonexistent named group");
6899 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6900 (unsigned long) flags);
6902 assert(0); /* NOT REACHED */
6907 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6908 int rem=(int)(RExC_end - RExC_parse); \
6917 if (RExC_lastparse!=RExC_parse) \
6918 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6921 iscut ? "..." : "<" \
6924 PerlIO_printf(Perl_debug_log,"%16s",""); \
6927 num = RExC_size + 1; \
6929 num=REG_NODE_NUM(RExC_emit); \
6930 if (RExC_lastnum!=num) \
6931 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6933 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6934 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6935 (int)((depth*2)), "", \
6939 RExC_lastparse=RExC_parse; \
6944 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6945 DEBUG_PARSE_MSG((funcname)); \
6946 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6948 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6949 DEBUG_PARSE_MSG((funcname)); \
6950 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6953 /* This section of code defines the inversion list object and its methods. The
6954 * interfaces are highly subject to change, so as much as possible is static to
6955 * this file. An inversion list is here implemented as a malloc'd C UV array
6956 * with some added info that is placed as UVs at the beginning in a header
6957 * portion. An inversion list for Unicode is an array of code points, sorted
6958 * by ordinal number. The zeroth element is the first code point in the list.
6959 * The 1th element is the first element beyond that not in the list. In other
6960 * words, the first range is
6961 * invlist[0]..(invlist[1]-1)
6962 * The other ranges follow. Thus every element whose index is divisible by two
6963 * marks the beginning of a range that is in the list, and every element not
6964 * divisible by two marks the beginning of a range not in the list. A single
6965 * element inversion list that contains the single code point N generally
6966 * consists of two elements
6969 * (The exception is when N is the highest representable value on the
6970 * machine, in which case the list containing just it would be a single
6971 * element, itself. By extension, if the last range in the list extends to
6972 * infinity, then the first element of that range will be in the inversion list
6973 * at a position that is divisible by two, and is the final element in the
6975 * Taking the complement (inverting) an inversion list is quite simple, if the
6976 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6977 * This implementation reserves an element at the beginning of each inversion
6978 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
6979 * actual beginning of the list is either that element if 0, or the next one if
6982 * More about inversion lists can be found in "Unicode Demystified"
6983 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6984 * More will be coming when functionality is added later.
6986 * The inversion list data structure is currently implemented as an SV pointing
6987 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6988 * array of UV whose memory management is automatically handled by the existing
6989 * facilities for SV's.
6991 * Some of the methods should always be private to the implementation, and some
6992 * should eventually be made public */
6994 /* The header definitions are in F<inline_invlist.c> */
6995 #define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
6996 #define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
6998 #define INVLIST_INITIAL_LEN 10
7000 PERL_STATIC_INLINE UV*
7001 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7003 /* Returns a pointer to the first element in the inversion list's array.
7004 * This is called upon initialization of an inversion list. Where the
7005 * array begins depends on whether the list has the code point U+0000
7006 * in it or not. The other parameter tells it whether the code that
7007 * follows this call is about to put a 0 in the inversion list or not.
7008 * The first element is either the element with 0, if 0, or the next one,
7011 UV* zero = get_invlist_zero_addr(invlist);
7013 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7016 assert(! *_get_invlist_len_addr(invlist));
7018 /* 1^1 = 0; 1^0 = 1 */
7019 *zero = 1 ^ will_have_0;
7020 return zero + *zero;
7023 PERL_STATIC_INLINE UV*
7024 S_invlist_array(pTHX_ SV* const invlist)
7026 /* Returns the pointer to the inversion list's array. Every time the
7027 * length changes, this needs to be called in case malloc or realloc moved
7030 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7032 /* Must not be empty. If these fail, you probably didn't check for <len>
7033 * being non-zero before trying to get the array */
7034 assert(*_get_invlist_len_addr(invlist));
7035 assert(*get_invlist_zero_addr(invlist) == 0
7036 || *get_invlist_zero_addr(invlist) == 1);
7038 /* The array begins either at the element reserved for zero if the
7039 * list contains 0 (that element will be set to 0), or otherwise the next
7040 * element (in which case the reserved element will be set to 1). */
7041 return (UV *) (get_invlist_zero_addr(invlist)
7042 + *get_invlist_zero_addr(invlist));
7045 PERL_STATIC_INLINE void
7046 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7048 /* Sets the current number of elements stored in the inversion list */
7050 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7052 *_get_invlist_len_addr(invlist) = len;
7054 assert(len <= SvLEN(invlist));
7056 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7057 /* If the list contains U+0000, that element is part of the header,
7058 * and should not be counted as part of the array. It will contain
7059 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7061 * SvCUR_set(invlist,
7062 * TO_INTERNAL_SIZE(len
7063 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7064 * But, this is only valid if len is not 0. The consequences of not doing
7065 * this is that the memory allocation code may think that 1 more UV is
7066 * being used than actually is, and so might do an unnecessary grow. That
7067 * seems worth not bothering to make this the precise amount.
7069 * Note that when inverting, SvCUR shouldn't change */
7072 PERL_STATIC_INLINE IV*
7073 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7075 /* Return the address of the UV that is reserved to hold the cached index
7078 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7080 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7083 PERL_STATIC_INLINE IV
7084 S_invlist_previous_index(pTHX_ SV* const invlist)
7086 /* Returns cached index of previous search */
7088 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7090 return *get_invlist_previous_index_addr(invlist);
7093 PERL_STATIC_INLINE void
7094 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7096 /* Caches <index> for later retrieval */
7098 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7100 assert(index == 0 || index < (int) _invlist_len(invlist));
7102 *get_invlist_previous_index_addr(invlist) = index;
7105 PERL_STATIC_INLINE UV
7106 S_invlist_max(pTHX_ SV* const invlist)
7108 /* Returns the maximum number of elements storable in the inversion list's
7109 * array, without having to realloc() */
7111 PERL_ARGS_ASSERT_INVLIST_MAX;
7113 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7114 ? _invlist_len(invlist)
7115 : FROM_INTERNAL_SIZE(SvLEN(invlist));
7118 PERL_STATIC_INLINE UV*
7119 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7121 /* Return the address of the UV that is reserved to hold 0 if the inversion
7122 * list contains 0. This has to be the last element of the heading, as the
7123 * list proper starts with either it if 0, or the next element if not.
7124 * (But we force it to contain either 0 or 1) */
7126 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7128 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7131 #ifndef PERL_IN_XSUB_RE
7133 Perl__new_invlist(pTHX_ IV initial_size)
7136 /* Return a pointer to a newly constructed inversion list, with enough
7137 * space to store 'initial_size' elements. If that number is negative, a
7138 * system default is used instead */
7142 if (initial_size < 0) {
7143 initial_size = INVLIST_INITIAL_LEN;
7146 /* Allocate the initial space */
7147 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7148 invlist_set_len(new_list, 0);
7150 /* Force iterinit() to be used to get iteration to work */
7151 *get_invlist_iter_addr(new_list) = UV_MAX;
7153 /* This should force a segfault if a method doesn't initialize this
7155 *get_invlist_zero_addr(new_list) = UV_MAX;
7157 *get_invlist_previous_index_addr(new_list) = 0;
7158 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7159 #if HEADER_LENGTH != 5
7160 # error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7168 S__new_invlist_C_array(pTHX_ UV* list)
7170 /* Return a pointer to a newly constructed inversion list, initialized to
7171 * point to <list>, which has to be in the exact correct inversion list
7172 * form, including internal fields. Thus this is a dangerous routine that
7173 * should not be used in the wrong hands */
7175 SV* invlist = newSV_type(SVt_PV);
7177 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7179 SvPV_set(invlist, (char *) list);
7180 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7181 shouldn't touch it */
7182 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7184 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7185 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7188 /* Initialize the iteration pointer.
7189 * XXX This could be done at compile time in charclass_invlists.h, but I
7190 * (khw) am not confident that the suffixes for specifying the C constant
7191 * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured
7192 * to use 64 bits; might need a Configure probe */
7193 invlist_iterfinish(invlist);
7199 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7201 /* Grow the maximum size of an inversion list */
7203 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7205 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7208 PERL_STATIC_INLINE void
7209 S_invlist_trim(pTHX_ SV* const invlist)
7211 PERL_ARGS_ASSERT_INVLIST_TRIM;
7213 /* Change the length of the inversion list to how many entries it currently
7216 SvPV_shrink_to_cur((SV *) invlist);
7219 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7222 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7224 /* Subject to change or removal. Append the range from 'start' to 'end' at
7225 * the end of the inversion list. The range must be above any existing
7229 UV max = invlist_max(invlist);
7230 UV len = _invlist_len(invlist);
7232 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7234 if (len == 0) { /* Empty lists must be initialized */
7235 array = _invlist_array_init(invlist, start == 0);
7238 /* Here, the existing list is non-empty. The current max entry in the
7239 * list is generally the first value not in the set, except when the
7240 * set extends to the end of permissible values, in which case it is
7241 * the first entry in that final set, and so this call is an attempt to
7242 * append out-of-order */
7244 UV final_element = len - 1;
7245 array = invlist_array(invlist);
7246 if (array[final_element] > start
7247 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7249 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7250 array[final_element], start,
7251 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7254 /* Here, it is a legal append. If the new range begins with the first
7255 * value not in the set, it is extending the set, so the new first
7256 * value not in the set is one greater than the newly extended range.
7258 if (array[final_element] == start) {
7259 if (end != UV_MAX) {
7260 array[final_element] = end + 1;
7263 /* But if the end is the maximum representable on the machine,
7264 * just let the range that this would extend to have no end */
7265 invlist_set_len(invlist, len - 1);
7271 /* Here the new range doesn't extend any existing set. Add it */
7273 len += 2; /* Includes an element each for the start and end of range */
7275 /* If overflows the existing space, extend, which may cause the array to be
7278 invlist_extend(invlist, len);
7279 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7280 failure in invlist_array() */
7281 array = invlist_array(invlist);
7284 invlist_set_len(invlist, len);
7287 /* The next item on the list starts the range, the one after that is
7288 * one past the new range. */
7289 array[len - 2] = start;
7290 if (end != UV_MAX) {
7291 array[len - 1] = end + 1;
7294 /* But if the end is the maximum representable on the machine, just let
7295 * the range have no end */
7296 invlist_set_len(invlist, len - 1);
7300 #ifndef PERL_IN_XSUB_RE
7303 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7305 /* Searches the inversion list for the entry that contains the input code
7306 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7307 * return value is the index into the list's array of the range that
7312 IV high = _invlist_len(invlist);
7313 const IV highest_element = high - 1;
7316 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7318 /* If list is empty, return failure. */
7323 /* (We can't get the array unless we know the list is non-empty) */
7324 array = invlist_array(invlist);
7326 mid = invlist_previous_index(invlist);
7327 assert(mid >=0 && mid <= highest_element);
7329 /* <mid> contains the cache of the result of the previous call to this
7330 * function (0 the first time). See if this call is for the same result,
7331 * or if it is for mid-1. This is under the theory that calls to this
7332 * function will often be for related code points that are near each other.
7333 * And benchmarks show that caching gives better results. We also test
7334 * here if the code point is within the bounds of the list. These tests
7335 * replace others that would have had to be made anyway to make sure that
7336 * the array bounds were not exceeded, and these give us extra information
7337 * at the same time */
7338 if (cp >= array[mid]) {
7339 if (cp >= array[highest_element]) {
7340 return highest_element;
7343 /* Here, array[mid] <= cp < array[highest_element]. This means that
7344 * the final element is not the answer, so can exclude it; it also
7345 * means that <mid> is not the final element, so can refer to 'mid + 1'
7347 if (cp < array[mid + 1]) {
7353 else { /* cp < aray[mid] */
7354 if (cp < array[0]) { /* Fail if outside the array */
7358 if (cp >= array[mid - 1]) {
7363 /* Binary search. What we are looking for is <i> such that
7364 * array[i] <= cp < array[i+1]
7365 * The loop below converges on the i+1. Note that there may not be an
7366 * (i+1)th element in the array, and things work nonetheless */
7367 while (low < high) {
7368 mid = (low + high) / 2;
7369 assert(mid <= highest_element);
7370 if (array[mid] <= cp) { /* cp >= array[mid] */
7373 /* We could do this extra test to exit the loop early.
7374 if (cp < array[low]) {
7379 else { /* cp < array[mid] */
7386 invlist_set_previous_index(invlist, high);
7391 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7393 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7394 * but is used when the swash has an inversion list. This makes this much
7395 * faster, as it uses a binary search instead of a linear one. This is
7396 * intimately tied to that function, and perhaps should be in utf8.c,
7397 * except it is intimately tied to inversion lists as well. It assumes
7398 * that <swatch> is all 0's on input */
7401 const IV len = _invlist_len(invlist);
7405 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7407 if (len == 0) { /* Empty inversion list */
7411 array = invlist_array(invlist);
7413 /* Find which element it is */
7414 i = _invlist_search(invlist, start);
7416 /* We populate from <start> to <end> */
7417 while (current < end) {
7420 /* The inversion list gives the results for every possible code point
7421 * after the first one in the list. Only those ranges whose index is
7422 * even are ones that the inversion list matches. For the odd ones,
7423 * and if the initial code point is not in the list, we have to skip
7424 * forward to the next element */
7425 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7427 if (i >= len) { /* Finished if beyond the end of the array */
7431 if (current >= end) { /* Finished if beyond the end of what we
7433 if (LIKELY(end < UV_MAX)) {
7437 /* We get here when the upper bound is the maximum
7438 * representable on the machine, and we are looking for just
7439 * that code point. Have to special case it */
7441 goto join_end_of_list;
7444 assert(current >= start);
7446 /* The current range ends one below the next one, except don't go past
7449 upper = (i < len && array[i] < end) ? array[i] : end;
7451 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7452 * for each code point in it */
7453 for (; current < upper; current++) {
7454 const STRLEN offset = (STRLEN)(current - start);
7455 swatch[offset >> 3] |= 1 << (offset & 7);
7460 /* Quit if at the end of the list */
7463 /* But first, have to deal with the highest possible code point on
7464 * the platform. The previous code assumes that <end> is one
7465 * beyond where we want to populate, but that is impossible at the
7466 * platform's infinity, so have to handle it specially */
7467 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7469 const STRLEN offset = (STRLEN)(end - start);
7470 swatch[offset >> 3] |= 1 << (offset & 7);
7475 /* Advance to the next range, which will be for code points not in the
7484 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7486 /* Take the union of two inversion lists and point <output> to it. *output
7487 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7488 * the reference count to that list will be decremented. The first list,
7489 * <a>, may be NULL, in which case a copy of the second list is returned.
7490 * If <complement_b> is TRUE, the union is taken of the complement
7491 * (inversion) of <b> instead of b itself.
7493 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7494 * Richard Gillam, published by Addison-Wesley, and explained at some
7495 * length there. The preface says to incorporate its examples into your
7496 * code at your own risk.
7498 * The algorithm is like a merge sort.
7500 * XXX A potential performance improvement is to keep track as we go along
7501 * if only one of the inputs contributes to the result, meaning the other
7502 * is a subset of that one. In that case, we can skip the final copy and
7503 * return the larger of the input lists, but then outside code might need
7504 * to keep track of whether to free the input list or not */
7506 UV* array_a; /* a's array */
7508 UV len_a; /* length of a's array */
7511 SV* u; /* the resulting union */
7515 UV i_a = 0; /* current index into a's array */
7519 /* running count, as explained in the algorithm source book; items are
7520 * stopped accumulating and are output when the count changes to/from 0.
7521 * The count is incremented when we start a range that's in the set, and
7522 * decremented when we start a range that's not in the set. So its range
7523 * is 0 to 2. Only when the count is zero is something not in the set.
7527 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7530 /* If either one is empty, the union is the other one */
7531 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7538 *output = invlist_clone(b);
7540 _invlist_invert(*output);
7542 } /* else *output already = b; */
7545 else if ((len_b = _invlist_len(b)) == 0) {
7550 /* The complement of an empty list is a list that has everything in it,
7551 * so the union with <a> includes everything too */
7556 *output = _new_invlist(1);
7557 _append_range_to_invlist(*output, 0, UV_MAX);
7559 else if (*output != a) {
7560 *output = invlist_clone(a);
7562 /* else *output already = a; */
7566 /* Here both lists exist and are non-empty */
7567 array_a = invlist_array(a);
7568 array_b = invlist_array(b);
7570 /* If are to take the union of 'a' with the complement of b, set it
7571 * up so are looking at b's complement. */
7574 /* To complement, we invert: if the first element is 0, remove it. To
7575 * do this, we just pretend the array starts one later, and clear the
7576 * flag as we don't have to do anything else later */
7577 if (array_b[0] == 0) {
7580 complement_b = FALSE;
7584 /* But if the first element is not zero, we unshift a 0 before the
7585 * array. The data structure reserves a space for that 0 (which
7586 * should be a '1' right now), so physical shifting is unneeded,
7587 * but temporarily change that element to 0. Before exiting the
7588 * routine, we must restore the element to '1' */
7595 /* Size the union for the worst case: that the sets are completely
7597 u = _new_invlist(len_a + len_b);
7599 /* Will contain U+0000 if either component does */
7600 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7601 || (len_b > 0 && array_b[0] == 0));
7603 /* Go through each list item by item, stopping when exhausted one of
7605 while (i_a < len_a && i_b < len_b) {
7606 UV cp; /* The element to potentially add to the union's array */
7607 bool cp_in_set; /* is it in the the input list's set or not */
7609 /* We need to take one or the other of the two inputs for the union.
7610 * Since we are merging two sorted lists, we take the smaller of the
7611 * next items. In case of a tie, we take the one that is in its set
7612 * first. If we took one not in the set first, it would decrement the
7613 * count, possibly to 0 which would cause it to be output as ending the
7614 * range, and the next time through we would take the same number, and
7615 * output it again as beginning the next range. By doing it the
7616 * opposite way, there is no possibility that the count will be
7617 * momentarily decremented to 0, and thus the two adjoining ranges will
7618 * be seamlessly merged. (In a tie and both are in the set or both not
7619 * in the set, it doesn't matter which we take first.) */
7620 if (array_a[i_a] < array_b[i_b]
7621 || (array_a[i_a] == array_b[i_b]
7622 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7624 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7628 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7629 cp = array_b[i_b++];
7632 /* Here, have chosen which of the two inputs to look at. Only output
7633 * if the running count changes to/from 0, which marks the
7634 * beginning/end of a range in that's in the set */
7637 array_u[i_u++] = cp;
7644 array_u[i_u++] = cp;
7649 /* Here, we are finished going through at least one of the lists, which
7650 * means there is something remaining in at most one. We check if the list
7651 * that hasn't been exhausted is positioned such that we are in the middle
7652 * of a range in its set or not. (i_a and i_b point to the element beyond
7653 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7654 * is potentially more to output.
7655 * There are four cases:
7656 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7657 * in the union is entirely from the non-exhausted set.
7658 * 2) Both were in their sets, count is 2. Nothing further should
7659 * be output, as everything that remains will be in the exhausted
7660 * list's set, hence in the union; decrementing to 1 but not 0 insures
7662 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7663 * Nothing further should be output because the union includes
7664 * everything from the exhausted set. Not decrementing ensures that.
7665 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7666 * decrementing to 0 insures that we look at the remainder of the
7667 * non-exhausted set */
7668 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7669 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7674 /* The final length is what we've output so far, plus what else is about to
7675 * be output. (If 'count' is non-zero, then the input list we exhausted
7676 * has everything remaining up to the machine's limit in its set, and hence
7677 * in the union, so there will be no further output. */
7680 /* At most one of the subexpressions will be non-zero */
7681 len_u += (len_a - i_a) + (len_b - i_b);
7684 /* Set result to final length, which can change the pointer to array_u, so
7686 if (len_u != _invlist_len(u)) {
7687 invlist_set_len(u, len_u);
7689 array_u = invlist_array(u);
7692 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7693 * the other) ended with everything above it not in its set. That means
7694 * that the remaining part of the union is precisely the same as the
7695 * non-exhausted list, so can just copy it unchanged. (If both list were
7696 * exhausted at the same time, then the operations below will be both 0.)
7699 IV copy_count; /* At most one will have a non-zero copy count */
7700 if ((copy_count = len_a - i_a) > 0) {
7701 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7703 else if ((copy_count = len_b - i_b) > 0) {
7704 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7708 /* If we've changed b, restore it */
7713 /* We may be removing a reference to one of the inputs */
7714 if (a == *output || b == *output) {
7715 assert(! invlist_is_iterating(*output));
7716 SvREFCNT_dec_NN(*output);
7724 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7726 /* Take the intersection of two inversion lists and point <i> to it. *i
7727 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7728 * the reference count to that list will be decremented.
7729 * If <complement_b> is TRUE, the result will be the intersection of <a>
7730 * and the complement (or inversion) of <b> instead of <b> directly.
7732 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7733 * Richard Gillam, published by Addison-Wesley, and explained at some
7734 * length there. The preface says to incorporate its examples into your
7735 * code at your own risk. In fact, it had bugs
7737 * The algorithm is like a merge sort, and is essentially the same as the
7741 UV* array_a; /* a's array */
7743 UV len_a; /* length of a's array */
7746 SV* r; /* the resulting intersection */
7750 UV i_a = 0; /* current index into a's array */
7754 /* running count, as explained in the algorithm source book; items are
7755 * stopped accumulating and are output when the count changes to/from 2.
7756 * The count is incremented when we start a range that's in the set, and
7757 * decremented when we start a range that's not in the set. So its range
7758 * is 0 to 2. Only when the count is 2 is something in the intersection.
7762 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7765 /* Special case if either one is empty */
7766 len_a = _invlist_len(a);
7767 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7769 if (len_a != 0 && complement_b) {
7771 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7772 * be empty. Here, also we are using 'b's complement, which hence
7773 * must be every possible code point. Thus the intersection is
7776 *i = invlist_clone(a);
7782 /* else *i is already 'a' */
7786 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7787 * intersection must be empty */
7794 *i = _new_invlist(0);
7798 /* Here both lists exist and are non-empty */
7799 array_a = invlist_array(a);
7800 array_b = invlist_array(b);
7802 /* If are to take the intersection of 'a' with the complement of b, set it
7803 * up so are looking at b's complement. */
7806 /* To complement, we invert: if the first element is 0, remove it. To
7807 * do this, we just pretend the array starts one later, and clear the
7808 * flag as we don't have to do anything else later */
7809 if (array_b[0] == 0) {
7812 complement_b = FALSE;
7816 /* But if the first element is not zero, we unshift a 0 before the
7817 * array. The data structure reserves a space for that 0 (which
7818 * should be a '1' right now), so physical shifting is unneeded,
7819 * but temporarily change that element to 0. Before exiting the
7820 * routine, we must restore the element to '1' */
7827 /* Size the intersection for the worst case: that the intersection ends up
7828 * fragmenting everything to be completely disjoint */
7829 r= _new_invlist(len_a + len_b);
7831 /* Will contain U+0000 iff both components do */
7832 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7833 && len_b > 0 && array_b[0] == 0);
7835 /* Go through each list item by item, stopping when exhausted one of
7837 while (i_a < len_a && i_b < len_b) {
7838 UV cp; /* The element to potentially add to the intersection's
7840 bool cp_in_set; /* Is it in the input list's set or not */
7842 /* We need to take one or the other of the two inputs for the
7843 * intersection. Since we are merging two sorted lists, we take the
7844 * smaller of the next items. In case of a tie, we take the one that
7845 * is not in its set first (a difference from the union algorithm). If
7846 * we took one in the set first, it would increment the count, possibly
7847 * to 2 which would cause it to be output as starting a range in the
7848 * intersection, and the next time through we would take that same
7849 * number, and output it again as ending the set. By doing it the
7850 * opposite of this, there is no possibility that the count will be
7851 * momentarily incremented to 2. (In a tie and both are in the set or
7852 * both not in the set, it doesn't matter which we take first.) */
7853 if (array_a[i_a] < array_b[i_b]
7854 || (array_a[i_a] == array_b[i_b]
7855 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7857 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7861 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7865 /* Here, have chosen which of the two inputs to look at. Only output
7866 * if the running count changes to/from 2, which marks the
7867 * beginning/end of a range that's in the intersection */
7871 array_r[i_r++] = cp;
7876 array_r[i_r++] = cp;
7882 /* Here, we are finished going through at least one of the lists, which
7883 * means there is something remaining in at most one. We check if the list
7884 * that has been exhausted is positioned such that we are in the middle
7885 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7886 * the ones we care about.) There are four cases:
7887 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7888 * nothing left in the intersection.
7889 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7890 * above 2. What should be output is exactly that which is in the
7891 * non-exhausted set, as everything it has is also in the intersection
7892 * set, and everything it doesn't have can't be in the intersection
7893 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7894 * gets incremented to 2. Like the previous case, the intersection is
7895 * everything that remains in the non-exhausted set.
7896 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7897 * remains 1. And the intersection has nothing more. */
7898 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7899 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7904 /* The final length is what we've output so far plus what else is in the
7905 * intersection. At most one of the subexpressions below will be non-zero */
7908 len_r += (len_a - i_a) + (len_b - i_b);
7911 /* Set result to final length, which can change the pointer to array_r, so
7913 if (len_r != _invlist_len(r)) {
7914 invlist_set_len(r, len_r);
7916 array_r = invlist_array(r);
7919 /* Finish outputting any remaining */
7920 if (count >= 2) { /* At most one will have a non-zero copy count */
7922 if ((copy_count = len_a - i_a) > 0) {
7923 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7925 else if ((copy_count = len_b - i_b) > 0) {
7926 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7930 /* If we've changed b, restore it */
7935 /* We may be removing a reference to one of the inputs */
7936 if (a == *i || b == *i) {
7937 assert(! invlist_is_iterating(*i));
7938 SvREFCNT_dec_NN(*i);
7946 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7948 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7949 * set. A pointer to the inversion list is returned. This may actually be
7950 * a new list, in which case the passed in one has been destroyed. The
7951 * passed in inversion list can be NULL, in which case a new one is created
7952 * with just the one range in it */
7957 if (invlist == NULL) {
7958 invlist = _new_invlist(2);
7962 len = _invlist_len(invlist);
7965 /* If comes after the final entry actually in the list, can just append it
7968 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
7969 && start >= invlist_array(invlist)[len - 1]))
7971 _append_range_to_invlist(invlist, start, end);
7975 /* Here, can't just append things, create and return a new inversion list
7976 * which is the union of this range and the existing inversion list */
7977 range_invlist = _new_invlist(2);
7978 _append_range_to_invlist(range_invlist, start, end);
7980 _invlist_union(invlist, range_invlist, &invlist);
7982 /* The temporary can be freed */
7983 SvREFCNT_dec_NN(range_invlist);
7990 PERL_STATIC_INLINE SV*
7991 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7992 return _add_range_to_invlist(invlist, cp, cp);
7995 #ifndef PERL_IN_XSUB_RE
7997 Perl__invlist_invert(pTHX_ SV* const invlist)
7999 /* Complement the input inversion list. This adds a 0 if the list didn't
8000 * have a zero; removes it otherwise. As described above, the data
8001 * structure is set up so that this is very efficient */
8003 UV* len_pos = _get_invlist_len_addr(invlist);
8005 PERL_ARGS_ASSERT__INVLIST_INVERT;
8007 assert(! invlist_is_iterating(invlist));
8009 /* The inverse of matching nothing is matching everything */
8010 if (*len_pos == 0) {
8011 _append_range_to_invlist(invlist, 0, UV_MAX);
8015 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
8016 * zero element was a 0, so it is being removed, so the length decrements
8017 * by 1; and vice-versa. SvCUR is unaffected */
8018 if (*get_invlist_zero_addr(invlist) ^= 1) {
8027 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8029 /* Complement the input inversion list (which must be a Unicode property,
8030 * all of which don't match above the Unicode maximum code point.) And
8031 * Perl has chosen to not have the inversion match above that either. This
8032 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8038 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8040 _invlist_invert(invlist);
8042 len = _invlist_len(invlist);
8044 if (len != 0) { /* If empty do nothing */
8045 array = invlist_array(invlist);
8046 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8047 /* Add 0x110000. First, grow if necessary */
8049 if (invlist_max(invlist) < len) {
8050 invlist_extend(invlist, len);
8051 array = invlist_array(invlist);
8053 invlist_set_len(invlist, len);
8054 array[len - 1] = PERL_UNICODE_MAX + 1;
8056 else { /* Remove the 0x110000 */
8057 invlist_set_len(invlist, len - 1);
8065 PERL_STATIC_INLINE SV*
8066 S_invlist_clone(pTHX_ SV* const invlist)
8069 /* Return a new inversion list that is a copy of the input one, which is
8072 /* Need to allocate extra space to accommodate Perl's addition of a
8073 * trailing NUL to SvPV's, since it thinks they are always strings */
8074 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8075 STRLEN length = SvCUR(invlist);
8077 PERL_ARGS_ASSERT_INVLIST_CLONE;
8079 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8080 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8085 PERL_STATIC_INLINE UV*
8086 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8088 /* Return the address of the UV that contains the current iteration
8091 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8093 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8096 PERL_STATIC_INLINE UV*
8097 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8099 /* Return the address of the UV that contains the version id. */
8101 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8103 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8106 PERL_STATIC_INLINE void
8107 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8109 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8111 *get_invlist_iter_addr(invlist) = 0;
8114 PERL_STATIC_INLINE void
8115 S_invlist_iterfinish(pTHX_ SV* invlist)
8117 /* Terminate iterator for invlist. This is to catch development errors.
8118 * Any iteration that is interrupted before completed should call this
8119 * function. Functions that add code points anywhere else but to the end
8120 * of an inversion list assert that they are not in the middle of an
8121 * iteration. If they were, the addition would make the iteration
8122 * problematical: if the iteration hadn't reached the place where things
8123 * were being added, it would be ok */
8125 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8127 *get_invlist_iter_addr(invlist) = UV_MAX;
8131 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8133 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8134 * This call sets in <*start> and <*end>, the next range in <invlist>.
8135 * Returns <TRUE> if successful and the next call will return the next
8136 * range; <FALSE> if was already at the end of the list. If the latter,
8137 * <*start> and <*end> are unchanged, and the next call to this function
8138 * will start over at the beginning of the list */
8140 UV* pos = get_invlist_iter_addr(invlist);
8141 UV len = _invlist_len(invlist);
8144 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8147 *pos = UV_MAX; /* Force iterinit() to be required next time */
8151 array = invlist_array(invlist);
8153 *start = array[(*pos)++];
8159 *end = array[(*pos)++] - 1;
8165 PERL_STATIC_INLINE bool
8166 S_invlist_is_iterating(pTHX_ SV* const invlist)
8168 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8170 return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8173 PERL_STATIC_INLINE UV
8174 S_invlist_highest(pTHX_ SV* const invlist)
8176 /* Returns the highest code point that matches an inversion list. This API
8177 * has an ambiguity, as it returns 0 under either the highest is actually
8178 * 0, or if the list is empty. If this distinction matters to you, check
8179 * for emptiness before calling this function */
8181 UV len = _invlist_len(invlist);
8184 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8190 array = invlist_array(invlist);
8192 /* The last element in the array in the inversion list always starts a
8193 * range that goes to infinity. That range may be for code points that are
8194 * matched in the inversion list, or it may be for ones that aren't
8195 * matched. In the latter case, the highest code point in the set is one
8196 * less than the beginning of this range; otherwise it is the final element
8197 * of this range: infinity */
8198 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8200 : array[len - 1] - 1;
8203 #ifndef PERL_IN_XSUB_RE
8205 Perl__invlist_contents(pTHX_ SV* const invlist)
8207 /* Get the contents of an inversion list into a string SV so that they can
8208 * be printed out. It uses the format traditionally done for debug tracing
8212 SV* output = newSVpvs("\n");
8214 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8216 assert(! invlist_is_iterating(invlist));
8218 invlist_iterinit(invlist);
8219 while (invlist_iternext(invlist, &start, &end)) {
8220 if (end == UV_MAX) {
8221 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8223 else if (end != start) {
8224 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8228 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8236 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8238 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8240 /* Dumps out the ranges in an inversion list. The string 'header'
8241 * if present is output on a line before the first range */
8245 PERL_ARGS_ASSERT__INVLIST_DUMP;
8247 if (header && strlen(header)) {
8248 PerlIO_printf(Perl_debug_log, "%s\n", header);
8250 if (invlist_is_iterating(invlist)) {
8251 PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8255 invlist_iterinit(invlist);
8256 while (invlist_iternext(invlist, &start, &end)) {
8257 if (end == UV_MAX) {
8258 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8260 else if (end != start) {
8261 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8265 PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8273 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8275 /* Return a boolean as to if the two passed in inversion lists are
8276 * identical. The final argument, if TRUE, says to take the complement of
8277 * the second inversion list before doing the comparison */
8279 UV* array_a = invlist_array(a);
8280 UV* array_b = invlist_array(b);
8281 UV len_a = _invlist_len(a);
8282 UV len_b = _invlist_len(b);
8284 UV i = 0; /* current index into the arrays */
8285 bool retval = TRUE; /* Assume are identical until proven otherwise */
8287 PERL_ARGS_ASSERT__INVLISTEQ;
8289 /* If are to compare 'a' with the complement of b, set it
8290 * up so are looking at b's complement. */
8293 /* The complement of nothing is everything, so <a> would have to have
8294 * just one element, starting at zero (ending at infinity) */
8296 return (len_a == 1 && array_a[0] == 0);
8298 else if (array_b[0] == 0) {
8300 /* Otherwise, to complement, we invert. Here, the first element is
8301 * 0, just remove it. To do this, we just pretend the array starts
8302 * one later, and clear the flag as we don't have to do anything
8307 complement_b = FALSE;
8311 /* But if the first element is not zero, we unshift a 0 before the
8312 * array. The data structure reserves a space for that 0 (which
8313 * should be a '1' right now), so physical shifting is unneeded,
8314 * but temporarily change that element to 0. Before exiting the
8315 * routine, we must restore the element to '1' */
8322 /* Make sure that the lengths are the same, as well as the final element
8323 * before looping through the remainder. (Thus we test the length, final,
8324 * and first elements right off the bat) */
8325 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8328 else for (i = 0; i < len_a - 1; i++) {
8329 if (array_a[i] != array_b[i]) {
8342 #undef HEADER_LENGTH
8343 #undef INVLIST_INITIAL_LENGTH
8344 #undef TO_INTERNAL_SIZE
8345 #undef FROM_INTERNAL_SIZE
8346 #undef INVLIST_LEN_OFFSET
8347 #undef INVLIST_ZERO_OFFSET
8348 #undef INVLIST_ITER_OFFSET
8349 #undef INVLIST_VERSION_ID
8350 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8352 /* End of inversion list object */
8355 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8357 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8358 * constructs, and updates RExC_flags with them. On input, RExC_parse
8359 * should point to the first flag; it is updated on output to point to the
8360 * final ')' or ':'. There needs to be at least one flag, or this will
8363 /* for (?g), (?gc), and (?o) warnings; warning
8364 about (?c) will warn about (?g) -- japhy */
8366 #define WASTED_O 0x01
8367 #define WASTED_G 0x02
8368 #define WASTED_C 0x04
8369 #define WASTED_GC (0x02|0x04)
8370 I32 wastedflags = 0x00;
8371 U32 posflags = 0, negflags = 0;
8372 U32 *flagsp = &posflags;
8373 char has_charset_modifier = '\0';
8375 bool has_use_defaults = FALSE;
8376 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8378 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8380 /* '^' as an initial flag sets certain defaults */
8381 if (UCHARAT(RExC_parse) == '^') {
8383 has_use_defaults = TRUE;
8384 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8385 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8386 ? REGEX_UNICODE_CHARSET
8387 : REGEX_DEPENDS_CHARSET);
8390 cs = get_regex_charset(RExC_flags);
8391 if (cs == REGEX_DEPENDS_CHARSET
8392 && (RExC_utf8 || RExC_uni_semantics))
8394 cs = REGEX_UNICODE_CHARSET;
8397 while (*RExC_parse) {
8398 /* && strchr("iogcmsx", *RExC_parse) */
8399 /* (?g), (?gc) and (?o) are useless here
8400 and must be globally applied -- japhy */
8401 switch (*RExC_parse) {
8403 /* Code for the imsx flags */
8404 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8406 case LOCALE_PAT_MOD:
8407 if (has_charset_modifier) {
8408 goto excess_modifier;
8410 else if (flagsp == &negflags) {
8413 cs = REGEX_LOCALE_CHARSET;
8414 has_charset_modifier = LOCALE_PAT_MOD;
8415 RExC_contains_locale = 1;
8417 case UNICODE_PAT_MOD:
8418 if (has_charset_modifier) {
8419 goto excess_modifier;
8421 else if (flagsp == &negflags) {
8424 cs = REGEX_UNICODE_CHARSET;
8425 has_charset_modifier = UNICODE_PAT_MOD;
8427 case ASCII_RESTRICT_PAT_MOD:
8428 if (flagsp == &negflags) {
8431 if (has_charset_modifier) {
8432 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8433 goto excess_modifier;
8435 /* Doubled modifier implies more restricted */
8436 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8439 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8441 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8443 case DEPENDS_PAT_MOD:
8444 if (has_use_defaults) {
8445 goto fail_modifiers;
8447 else if (flagsp == &negflags) {
8450 else if (has_charset_modifier) {
8451 goto excess_modifier;
8454 /* The dual charset means unicode semantics if the
8455 * pattern (or target, not known until runtime) are
8456 * utf8, or something in the pattern indicates unicode
8458 cs = (RExC_utf8 || RExC_uni_semantics)
8459 ? REGEX_UNICODE_CHARSET
8460 : REGEX_DEPENDS_CHARSET;
8461 has_charset_modifier = DEPENDS_PAT_MOD;
8465 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8466 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8468 else if (has_charset_modifier == *(RExC_parse - 1)) {
8469 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8472 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8477 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8479 case ONCE_PAT_MOD: /* 'o' */
8480 case GLOBAL_PAT_MOD: /* 'g' */
8481 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8482 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8483 if (! (wastedflags & wflagbit) ) {
8484 wastedflags |= wflagbit;
8487 "Useless (%s%c) - %suse /%c modifier",
8488 flagsp == &negflags ? "?-" : "?",
8490 flagsp == &negflags ? "don't " : "",
8497 case CONTINUE_PAT_MOD: /* 'c' */
8498 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8499 if (! (wastedflags & WASTED_C) ) {
8500 wastedflags |= WASTED_GC;
8503 "Useless (%sc) - %suse /gc modifier",
8504 flagsp == &negflags ? "?-" : "?",
8505 flagsp == &negflags ? "don't " : ""
8510 case KEEPCOPY_PAT_MOD: /* 'p' */
8511 if (flagsp == &negflags) {
8513 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8515 *flagsp |= RXf_PMf_KEEPCOPY;
8519 /* A flag is a default iff it is following a minus, so
8520 * if there is a minus, it means will be trying to
8521 * re-specify a default which is an error */
8522 if (has_use_defaults || flagsp == &negflags) {
8523 goto fail_modifiers;
8526 wastedflags = 0; /* reset so (?g-c) warns twice */
8530 RExC_flags |= posflags;
8531 RExC_flags &= ~negflags;
8532 set_regex_charset(&RExC_flags, cs);
8538 vFAIL3("Sequence (%.*s...) not recognized",
8539 RExC_parse-seqstart, seqstart);
8548 - reg - regular expression, i.e. main body or parenthesized thing
8550 * Caller must absorb opening parenthesis.
8552 * Combining parenthesis handling with the base level of regular expression
8553 * is a trifle forced, but the need to tie the tails of the branches to what
8554 * follows makes it hard to avoid.
8556 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8558 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8560 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8563 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8564 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8565 needs to be restarted.
8566 Otherwise would only return NULL if regbranch() returns NULL, which
8569 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8570 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8573 regnode *ret; /* Will be the head of the group. */
8576 regnode *ender = NULL;
8579 U32 oregflags = RExC_flags;
8580 bool have_branch = 0;
8582 I32 freeze_paren = 0;
8583 I32 after_freeze = 0;
8585 char * parse_start = RExC_parse; /* MJD */
8586 char * const oregcomp_parse = RExC_parse;
8588 GET_RE_DEBUG_FLAGS_DECL;
8590 PERL_ARGS_ASSERT_REG;
8591 DEBUG_PARSE("reg ");
8593 *flagp = 0; /* Tentatively. */
8596 /* Make an OPEN node, if parenthesized. */
8598 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8599 char *start_verb = RExC_parse;
8600 STRLEN verb_len = 0;
8601 char *start_arg = NULL;
8602 unsigned char op = 0;
8604 int internal_argval = 0; /* internal_argval is only useful if !argok */
8605 while ( *RExC_parse && *RExC_parse != ')' ) {
8606 if ( *RExC_parse == ':' ) {
8607 start_arg = RExC_parse + 1;
8613 verb_len = RExC_parse - start_verb;
8616 while ( *RExC_parse && *RExC_parse != ')' )
8618 if ( *RExC_parse != ')' )
8619 vFAIL("Unterminated verb pattern argument");
8620 if ( RExC_parse == start_arg )
8623 if ( *RExC_parse != ')' )
8624 vFAIL("Unterminated verb pattern");
8627 switch ( *start_verb ) {
8628 case 'A': /* (*ACCEPT) */
8629 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8631 internal_argval = RExC_nestroot;
8634 case 'C': /* (*COMMIT) */
8635 if ( memEQs(start_verb,verb_len,"COMMIT") )
8638 case 'F': /* (*FAIL) */
8639 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8644 case ':': /* (*:NAME) */
8645 case 'M': /* (*MARK:NAME) */
8646 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8651 case 'P': /* (*PRUNE) */
8652 if ( memEQs(start_verb,verb_len,"PRUNE") )
8655 case 'S': /* (*SKIP) */
8656 if ( memEQs(start_verb,verb_len,"SKIP") )
8659 case 'T': /* (*THEN) */
8660 /* [19:06] <TimToady> :: is then */
8661 if ( memEQs(start_verb,verb_len,"THEN") ) {
8663 RExC_seen |= REG_SEEN_CUTGROUP;
8669 vFAIL3("Unknown verb pattern '%.*s'",
8670 verb_len, start_verb);
8673 if ( start_arg && internal_argval ) {
8674 vFAIL3("Verb pattern '%.*s' may not have an argument",
8675 verb_len, start_verb);
8676 } else if ( argok < 0 && !start_arg ) {
8677 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8678 verb_len, start_verb);
8680 ret = reganode(pRExC_state, op, internal_argval);
8681 if ( ! internal_argval && ! SIZE_ONLY ) {
8683 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8684 ARG(ret) = add_data( pRExC_state, 1, "S" );
8685 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8692 if (!internal_argval)
8693 RExC_seen |= REG_SEEN_VERBARG;
8694 } else if ( start_arg ) {
8695 vFAIL3("Verb pattern '%.*s' may not have an argument",
8696 verb_len, start_verb);
8698 ret = reg_node(pRExC_state, op);
8700 nextchar(pRExC_state);
8703 if (*RExC_parse == '?') { /* (?...) */
8704 bool is_logical = 0;
8705 const char * const seqstart = RExC_parse;
8708 paren = *RExC_parse++;
8709 ret = NULL; /* For look-ahead/behind. */
8712 case 'P': /* (?P...) variants for those used to PCRE/Python */
8713 paren = *RExC_parse++;
8714 if ( paren == '<') /* (?P<...>) named capture */
8716 else if (paren == '>') { /* (?P>name) named recursion */
8717 goto named_recursion;
8719 else if (paren == '=') { /* (?P=...) named backref */
8720 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8721 you change this make sure you change that */
8722 char* name_start = RExC_parse;
8724 SV *sv_dat = reg_scan_name(pRExC_state,
8725 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8726 if (RExC_parse == name_start || *RExC_parse != ')')
8727 vFAIL2("Sequence %.3s... not terminated",parse_start);
8730 num = add_data( pRExC_state, 1, "S" );
8731 RExC_rxi->data->data[num]=(void*)sv_dat;
8732 SvREFCNT_inc_simple_void(sv_dat);
8735 ret = reganode(pRExC_state,
8738 : (ASCII_FOLD_RESTRICTED)
8740 : (AT_LEAST_UNI_SEMANTICS)
8748 Set_Node_Offset(ret, parse_start+1);
8749 Set_Node_Cur_Length(ret); /* MJD */
8751 nextchar(pRExC_state);
8755 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8757 case '<': /* (?<...) */
8758 if (*RExC_parse == '!')
8760 else if (*RExC_parse != '=')
8766 case '\'': /* (?'...') */
8767 name_start= RExC_parse;
8768 svname = reg_scan_name(pRExC_state,
8769 SIZE_ONLY ? /* reverse test from the others */
8770 REG_RSN_RETURN_NAME :
8771 REG_RSN_RETURN_NULL);
8772 if (RExC_parse == name_start) {
8774 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8777 if (*RExC_parse != paren)
8778 vFAIL2("Sequence (?%c... not terminated",
8779 paren=='>' ? '<' : paren);
8783 if (!svname) /* shouldn't happen */
8785 "panic: reg_scan_name returned NULL");
8786 if (!RExC_paren_names) {
8787 RExC_paren_names= newHV();
8788 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8790 RExC_paren_name_list= newAV();
8791 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8794 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8796 sv_dat = HeVAL(he_str);
8798 /* croak baby croak */
8800 "panic: paren_name hash element allocation failed");
8801 } else if ( SvPOK(sv_dat) ) {
8802 /* (?|...) can mean we have dupes so scan to check
8803 its already been stored. Maybe a flag indicating
8804 we are inside such a construct would be useful,
8805 but the arrays are likely to be quite small, so
8806 for now we punt -- dmq */
8807 IV count = SvIV(sv_dat);
8808 I32 *pv = (I32*)SvPVX(sv_dat);
8810 for ( i = 0 ; i < count ; i++ ) {
8811 if ( pv[i] == RExC_npar ) {
8817 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8818 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8819 pv[count] = RExC_npar;
8820 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8823 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8824 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8826 SvIV_set(sv_dat, 1);
8829 /* Yes this does cause a memory leak in debugging Perls */
8830 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8831 SvREFCNT_dec_NN(svname);
8834 /*sv_dump(sv_dat);*/
8836 nextchar(pRExC_state);
8838 goto capturing_parens;
8840 RExC_seen |= REG_SEEN_LOOKBEHIND;
8841 RExC_in_lookbehind++;
8843 case '=': /* (?=...) */
8844 RExC_seen_zerolen++;
8846 case '!': /* (?!...) */
8847 RExC_seen_zerolen++;
8848 if (*RExC_parse == ')') {
8849 ret=reg_node(pRExC_state, OPFAIL);
8850 nextchar(pRExC_state);
8854 case '|': /* (?|...) */
8855 /* branch reset, behave like a (?:...) except that
8856 buffers in alternations share the same numbers */
8858 after_freeze = freeze_paren = RExC_npar;
8860 case ':': /* (?:...) */
8861 case '>': /* (?>...) */
8863 case '$': /* (?$...) */
8864 case '@': /* (?@...) */
8865 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8867 case '0' : /* (?0) */
8868 case 'R' : /* (?R) */
8869 if (*RExC_parse != ')')
8870 FAIL("Sequence (?R) not terminated");
8871 ret = reg_node(pRExC_state, GOSTART);
8872 *flagp |= POSTPONED;
8873 nextchar(pRExC_state);
8876 { /* named and numeric backreferences */
8878 case '&': /* (?&NAME) */
8879 parse_start = RExC_parse - 1;
8882 SV *sv_dat = reg_scan_name(pRExC_state,
8883 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8884 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8886 goto gen_recurse_regop;
8887 assert(0); /* NOT REACHED */
8889 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8891 vFAIL("Illegal pattern");
8893 goto parse_recursion;
8895 case '-': /* (?-1) */
8896 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8897 RExC_parse--; /* rewind to let it be handled later */
8901 case '1': case '2': case '3': case '4': /* (?1) */
8902 case '5': case '6': case '7': case '8': case '9':
8905 num = atoi(RExC_parse);
8906 parse_start = RExC_parse - 1; /* MJD */
8907 if (*RExC_parse == '-')
8909 while (isDIGIT(*RExC_parse))
8911 if (*RExC_parse!=')')
8912 vFAIL("Expecting close bracket");
8915 if ( paren == '-' ) {
8917 Diagram of capture buffer numbering.
8918 Top line is the normal capture buffer numbers
8919 Bottom line is the negative indexing as from
8923 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8927 num = RExC_npar + num;
8930 vFAIL("Reference to nonexistent group");
8932 } else if ( paren == '+' ) {
8933 num = RExC_npar + num - 1;
8936 ret = reganode(pRExC_state, GOSUB, num);
8938 if (num > (I32)RExC_rx->nparens) {
8940 vFAIL("Reference to nonexistent group");
8942 ARG2L_SET( ret, RExC_recurse_count++);
8944 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8945 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8949 RExC_seen |= REG_SEEN_RECURSE;
8950 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8951 Set_Node_Offset(ret, parse_start); /* MJD */
8953 *flagp |= POSTPONED;
8954 nextchar(pRExC_state);
8956 } /* named and numeric backreferences */
8957 assert(0); /* NOT REACHED */
8959 case '?': /* (??...) */
8961 if (*RExC_parse != '{') {
8963 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8966 *flagp |= POSTPONED;
8967 paren = *RExC_parse++;
8969 case '{': /* (?{...}) */
8972 struct reg_code_block *cb;
8974 RExC_seen_zerolen++;
8976 if ( !pRExC_state->num_code_blocks
8977 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8978 || pRExC_state->code_blocks[pRExC_state->code_index].start
8979 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8982 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8983 FAIL("panic: Sequence (?{...}): no code block found\n");
8984 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8986 /* this is a pre-compiled code block (?{...}) */
8987 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8988 RExC_parse = RExC_start + cb->end;
8991 if (cb->src_regex) {
8992 n = add_data(pRExC_state, 2, "rl");
8993 RExC_rxi->data->data[n] =
8994 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8995 RExC_rxi->data->data[n+1] = (void*)o;
8998 n = add_data(pRExC_state, 1,
8999 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9000 RExC_rxi->data->data[n] = (void*)o;
9003 pRExC_state->code_index++;
9004 nextchar(pRExC_state);
9008 ret = reg_node(pRExC_state, LOGICAL);
9009 eval = reganode(pRExC_state, EVAL, n);
9012 /* for later propagation into (??{}) return value */
9013 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9015 REGTAIL(pRExC_state, ret, eval);
9016 /* deal with the length of this later - MJD */
9019 ret = reganode(pRExC_state, EVAL, n);
9020 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9021 Set_Node_Offset(ret, parse_start);
9024 case '(': /* (?(?{...})...) and (?(?=...)...) */
9027 if (RExC_parse[0] == '?') { /* (?(?...)) */
9028 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9029 || RExC_parse[1] == '<'
9030 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9034 ret = reg_node(pRExC_state, LOGICAL);
9038 tail = reg(pRExC_state, 1, &flag, depth+1);
9039 if (flag & RESTART_UTF8) {
9040 *flagp = RESTART_UTF8;
9043 REGTAIL(pRExC_state, ret, tail);
9047 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9048 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9050 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9051 char *name_start= RExC_parse++;
9053 SV *sv_dat=reg_scan_name(pRExC_state,
9054 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9055 if (RExC_parse == name_start || *RExC_parse != ch)
9056 vFAIL2("Sequence (?(%c... not terminated",
9057 (ch == '>' ? '<' : ch));
9060 num = add_data( pRExC_state, 1, "S" );
9061 RExC_rxi->data->data[num]=(void*)sv_dat;
9062 SvREFCNT_inc_simple_void(sv_dat);
9064 ret = reganode(pRExC_state,NGROUPP,num);
9065 goto insert_if_check_paren;
9067 else if (RExC_parse[0] == 'D' &&
9068 RExC_parse[1] == 'E' &&
9069 RExC_parse[2] == 'F' &&
9070 RExC_parse[3] == 'I' &&
9071 RExC_parse[4] == 'N' &&
9072 RExC_parse[5] == 'E')
9074 ret = reganode(pRExC_state,DEFINEP,0);
9077 goto insert_if_check_paren;
9079 else if (RExC_parse[0] == 'R') {
9082 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9083 parno = atoi(RExC_parse++);
9084 while (isDIGIT(*RExC_parse))
9086 } else if (RExC_parse[0] == '&') {
9089 sv_dat = reg_scan_name(pRExC_state,
9090 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9091 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9093 ret = reganode(pRExC_state,INSUBP,parno);
9094 goto insert_if_check_paren;
9096 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9099 parno = atoi(RExC_parse++);
9101 while (isDIGIT(*RExC_parse))
9103 ret = reganode(pRExC_state, GROUPP, parno);
9105 insert_if_check_paren:
9106 if ((c = *nextchar(pRExC_state)) != ')')
9107 vFAIL("Switch condition not recognized");
9109 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9110 br = regbranch(pRExC_state, &flags, 1,depth+1);
9112 if (flags & RESTART_UTF8) {
9113 *flagp = RESTART_UTF8;
9116 FAIL2("panic: regbranch returned NULL, flags=%#X",
9119 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9120 c = *nextchar(pRExC_state);
9125 vFAIL("(?(DEFINE)....) does not allow branches");
9126 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9127 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9128 if (flags & RESTART_UTF8) {
9129 *flagp = RESTART_UTF8;
9132 FAIL2("panic: regbranch returned NULL, flags=%#X",
9135 REGTAIL(pRExC_state, ret, lastbr);
9138 c = *nextchar(pRExC_state);
9143 vFAIL("Switch (?(condition)... contains too many branches");
9144 ender = reg_node(pRExC_state, TAIL);
9145 REGTAIL(pRExC_state, br, ender);
9147 REGTAIL(pRExC_state, lastbr, ender);
9148 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9151 REGTAIL(pRExC_state, ret, ender);
9152 RExC_size++; /* XXX WHY do we need this?!!
9153 For large programs it seems to be required
9154 but I can't figure out why. -- dmq*/
9158 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9161 case '[': /* (?[ ... ]) */
9162 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9165 RExC_parse--; /* for vFAIL to print correctly */
9166 vFAIL("Sequence (? incomplete");
9168 default: /* e.g., (?i) */
9171 parse_lparen_question_flags(pRExC_state);
9172 if (UCHARAT(RExC_parse) != ':') {
9173 nextchar(pRExC_state);
9178 nextchar(pRExC_state);
9188 ret = reganode(pRExC_state, OPEN, parno);
9191 RExC_nestroot = parno;
9192 if (RExC_seen & REG_SEEN_RECURSE
9193 && !RExC_open_parens[parno-1])
9195 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9196 "Setting open paren #%"IVdf" to %d\n",
9197 (IV)parno, REG_NODE_NUM(ret)));
9198 RExC_open_parens[parno-1]= ret;
9201 Set_Node_Length(ret, 1); /* MJD */
9202 Set_Node_Offset(ret, RExC_parse); /* MJD */
9210 /* Pick up the branches, linking them together. */
9211 parse_start = RExC_parse; /* MJD */
9212 br = regbranch(pRExC_state, &flags, 1,depth+1);
9214 /* branch_len = (paren != 0); */
9217 if (flags & RESTART_UTF8) {
9218 *flagp = RESTART_UTF8;
9221 FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9223 if (*RExC_parse == '|') {
9224 if (!SIZE_ONLY && RExC_extralen) {
9225 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9228 reginsert(pRExC_state, BRANCH, br, depth+1);
9229 Set_Node_Length(br, paren != 0);
9230 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9234 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9236 else if (paren == ':') {
9237 *flagp |= flags&SIMPLE;
9239 if (is_open) { /* Starts with OPEN. */
9240 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9242 else if (paren != '?') /* Not Conditional */
9244 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9246 while (*RExC_parse == '|') {
9247 if (!SIZE_ONLY && RExC_extralen) {
9248 ender = reganode(pRExC_state, LONGJMP,0);
9249 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9252 RExC_extralen += 2; /* Account for LONGJMP. */
9253 nextchar(pRExC_state);
9255 if (RExC_npar > after_freeze)
9256 after_freeze = RExC_npar;
9257 RExC_npar = freeze_paren;
9259 br = regbranch(pRExC_state, &flags, 0, depth+1);
9262 if (flags & RESTART_UTF8) {
9263 *flagp = RESTART_UTF8;
9266 FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9268 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9270 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9273 if (have_branch || paren != ':') {
9274 /* Make a closing node, and hook it on the end. */
9277 ender = reg_node(pRExC_state, TAIL);
9280 ender = reganode(pRExC_state, CLOSE, parno);
9281 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9282 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9283 "Setting close paren #%"IVdf" to %d\n",
9284 (IV)parno, REG_NODE_NUM(ender)));
9285 RExC_close_parens[parno-1]= ender;
9286 if (RExC_nestroot == parno)
9289 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9290 Set_Node_Length(ender,1); /* MJD */
9296 *flagp &= ~HASWIDTH;
9299 ender = reg_node(pRExC_state, SUCCEED);
9302 ender = reg_node(pRExC_state, END);
9304 assert(!RExC_opend); /* there can only be one! */
9309 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9310 SV * const mysv_val1=sv_newmortal();
9311 SV * const mysv_val2=sv_newmortal();
9312 DEBUG_PARSE_MSG("lsbr");
9313 regprop(RExC_rx, mysv_val1, lastbr);
9314 regprop(RExC_rx, mysv_val2, ender);
9315 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9316 SvPV_nolen_const(mysv_val1),
9317 (IV)REG_NODE_NUM(lastbr),
9318 SvPV_nolen_const(mysv_val2),
9319 (IV)REG_NODE_NUM(ender),
9320 (IV)(ender - lastbr)
9323 REGTAIL(pRExC_state, lastbr, ender);
9325 if (have_branch && !SIZE_ONLY) {
9328 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9330 /* Hook the tails of the branches to the closing node. */
9331 for (br = ret; br; br = regnext(br)) {
9332 const U8 op = PL_regkind[OP(br)];
9334 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9335 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9338 else if (op == BRANCHJ) {
9339 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9340 /* for now we always disable this optimisation * /
9341 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9347 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9348 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9349 SV * const mysv_val1=sv_newmortal();
9350 SV * const mysv_val2=sv_newmortal();
9351 DEBUG_PARSE_MSG("NADA");
9352 regprop(RExC_rx, mysv_val1, ret);
9353 regprop(RExC_rx, mysv_val2, ender);
9354 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9355 SvPV_nolen_const(mysv_val1),
9356 (IV)REG_NODE_NUM(ret),
9357 SvPV_nolen_const(mysv_val2),
9358 (IV)REG_NODE_NUM(ender),
9363 if (OP(ender) == TAIL) {
9368 for ( opt= br + 1; opt < ender ; opt++ )
9370 NEXT_OFF(br)= ender - br;
9378 static const char parens[] = "=!<,>";
9380 if (paren && (p = strchr(parens, paren))) {
9381 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9382 int flag = (p - parens) > 1;
9385 node = SUSPEND, flag = 0;
9386 reginsert(pRExC_state, node,ret, depth+1);
9387 Set_Node_Cur_Length(ret);
9388 Set_Node_Offset(ret, parse_start + 1);
9390 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9394 /* Check for proper termination. */
9396 RExC_flags = oregflags;
9397 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9398 RExC_parse = oregcomp_parse;
9399 vFAIL("Unmatched (");
9402 else if (!paren && RExC_parse < RExC_end) {
9403 if (*RExC_parse == ')') {
9405 vFAIL("Unmatched )");
9408 FAIL("Junk on end of regexp"); /* "Can't happen". */
9409 assert(0); /* NOTREACHED */
9412 if (RExC_in_lookbehind) {
9413 RExC_in_lookbehind--;
9415 if (after_freeze > RExC_npar)
9416 RExC_npar = after_freeze;
9421 - regbranch - one alternative of an | operator
9423 * Implements the concatenation operator.
9425 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9429 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9433 regnode *chain = NULL;
9435 I32 flags = 0, c = 0;
9436 GET_RE_DEBUG_FLAGS_DECL;
9438 PERL_ARGS_ASSERT_REGBRANCH;
9440 DEBUG_PARSE("brnc");
9445 if (!SIZE_ONLY && RExC_extralen)
9446 ret = reganode(pRExC_state, BRANCHJ,0);
9448 ret = reg_node(pRExC_state, BRANCH);
9449 Set_Node_Length(ret, 1);
9453 if (!first && SIZE_ONLY)
9454 RExC_extralen += 1; /* BRANCHJ */
9456 *flagp = WORST; /* Tentatively. */
9459 nextchar(pRExC_state);
9460 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9462 latest = regpiece(pRExC_state, &flags,depth+1);
9463 if (latest == NULL) {
9464 if (flags & TRYAGAIN)
9466 if (flags & RESTART_UTF8) {
9467 *flagp = RESTART_UTF8;
9470 FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
9472 else if (ret == NULL)
9474 *flagp |= flags&(HASWIDTH|POSTPONED);
9475 if (chain == NULL) /* First piece. */
9476 *flagp |= flags&SPSTART;
9479 REGTAIL(pRExC_state, chain, latest);
9484 if (chain == NULL) { /* Loop ran zero times. */
9485 chain = reg_node(pRExC_state, NOTHING);
9490 *flagp |= flags&SIMPLE;
9497 - regpiece - something followed by possible [*+?]
9499 * Note that the branching code sequences used for ? and the general cases
9500 * of * and + are somewhat optimized: they use the same NOTHING node as
9501 * both the endmarker for their branch list and the body of the last branch.
9502 * It might seem that this node could be dispensed with entirely, but the
9503 * endmarker role is not redundant.
9505 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9507 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9511 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9518 const char * const origparse = RExC_parse;
9520 I32 max = REG_INFTY;
9521 #ifdef RE_TRACK_PATTERN_OFFSETS
9524 const char *maxpos = NULL;
9526 /* Save the original in case we change the emitted regop to a FAIL. */
9527 regnode * const orig_emit = RExC_emit;
9529 GET_RE_DEBUG_FLAGS_DECL;
9531 PERL_ARGS_ASSERT_REGPIECE;
9533 DEBUG_PARSE("piec");
9535 ret = regatom(pRExC_state, &flags,depth+1);
9537 if (flags & (TRYAGAIN|RESTART_UTF8))
9538 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9540 FAIL2("panic: regatom returned NULL, flags=%#X", flags);
9546 if (op == '{' && regcurly(RExC_parse, FALSE)) {
9548 #ifdef RE_TRACK_PATTERN_OFFSETS
9549 parse_start = RExC_parse; /* MJD */
9551 next = RExC_parse + 1;
9552 while (isDIGIT(*next) || *next == ',') {
9561 if (*next == '}') { /* got one */
9565 min = atoi(RExC_parse);
9569 maxpos = RExC_parse;
9571 if (!max && *maxpos != '0')
9572 max = REG_INFTY; /* meaning "infinity" */
9573 else if (max >= REG_INFTY)
9574 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9576 nextchar(pRExC_state);
9577 if (max < min) { /* If can't match, warn and optimize to fail
9580 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9582 /* We can't back off the size because we have to reserve
9583 * enough space for all the things we are about to throw
9584 * away, but we can shrink it by the ammount we are about
9586 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9589 RExC_emit = orig_emit;
9591 ret = reg_node(pRExC_state, OPFAIL);
9594 else if (max == 0) { /* replace {0} with a nothing node */
9596 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9599 RExC_emit = orig_emit;
9601 ret = reg_node(pRExC_state, NOTHING);
9606 if ((flags&SIMPLE)) {
9607 RExC_naughty += 2 + RExC_naughty / 2;
9608 reginsert(pRExC_state, CURLY, ret, depth+1);
9609 Set_Node_Offset(ret, parse_start+1); /* MJD */
9610 Set_Node_Cur_Length(ret);
9613 regnode * const w = reg_node(pRExC_state, WHILEM);
9616 REGTAIL(pRExC_state, ret, w);
9617 if (!SIZE_ONLY && RExC_extralen) {
9618 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9619 reginsert(pRExC_state, NOTHING,ret, depth+1);
9620 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9622 reginsert(pRExC_state, CURLYX,ret, depth+1);
9624 Set_Node_Offset(ret, parse_start+1);
9625 Set_Node_Length(ret,
9626 op == '{' ? (RExC_parse - parse_start) : 1);
9628 if (!SIZE_ONLY && RExC_extralen)
9629 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9630 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9632 RExC_whilem_seen++, RExC_extralen += 3;
9633 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9642 ARG1_SET(ret, (U16)min);
9643 ARG2_SET(ret, (U16)max);
9655 #if 0 /* Now runtime fix should be reliable. */
9657 /* if this is reinstated, don't forget to put this back into perldiag:
9659 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9661 (F) The part of the regexp subject to either the * or + quantifier
9662 could match an empty string. The {#} shows in the regular
9663 expression about where the problem was discovered.
9667 if (!(flags&HASWIDTH) && op != '?')
9668 vFAIL("Regexp *+ operand could be empty");
9671 #ifdef RE_TRACK_PATTERN_OFFSETS
9672 parse_start = RExC_parse;
9674 nextchar(pRExC_state);
9676 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9678 if (op == '*' && (flags&SIMPLE)) {
9679 reginsert(pRExC_state, STAR, ret, depth+1);
9683 else if (op == '*') {
9687 else if (op == '+' && (flags&SIMPLE)) {
9688 reginsert(pRExC_state, PLUS, ret, depth+1);
9692 else if (op == '+') {
9696 else if (op == '?') {
9701 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9702 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9703 ckWARN3reg(RExC_parse,
9704 "%.*s matches null string many times",
9705 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9707 (void)ReREFCNT_inc(RExC_rx_sv);
9710 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9711 nextchar(pRExC_state);
9712 reginsert(pRExC_state, MINMOD, ret, depth+1);
9713 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9715 #ifndef REG_ALLOW_MINMOD_SUSPEND
9718 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9720 nextchar(pRExC_state);
9721 ender = reg_node(pRExC_state, SUCCEED);
9722 REGTAIL(pRExC_state, ret, ender);
9723 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9725 ender = reg_node(pRExC_state, TAIL);
9726 REGTAIL(pRExC_state, ret, ender);
9730 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9732 vFAIL("Nested quantifiers");
9739 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9740 const bool strict /* Apply stricter parsing rules? */
9744 /* This is expected to be called by a parser routine that has recognized '\N'
9745 and needs to handle the rest. RExC_parse is expected to point at the first
9746 char following the N at the time of the call. On successful return,
9747 RExC_parse has been updated to point to just after the sequence identified
9748 by this routine, and <*flagp> has been updated.
9750 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9753 \N may begin either a named sequence, or if outside a character class, mean
9754 to match a non-newline. For non single-quoted regexes, the tokenizer has
9755 attempted to decide which, and in the case of a named sequence, converted it
9756 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9757 where c1... are the characters in the sequence. For single-quoted regexes,
9758 the tokenizer passes the \N sequence through unchanged; this code will not
9759 attempt to determine this nor expand those, instead raising a syntax error.
9760 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9761 or there is no '}', it signals that this \N occurrence means to match a
9764 Only the \N{U+...} form should occur in a character class, for the same
9765 reason that '.' inside a character class means to just match a period: it
9766 just doesn't make sense.
9768 The function raises an error (via vFAIL), and doesn't return for various
9769 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9770 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9771 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9772 only possible if node_p is non-NULL.
9775 If <valuep> is non-null, it means the caller can accept an input sequence
9776 consisting of a just a single code point; <*valuep> is set to that value
9777 if the input is such.
9779 If <node_p> is non-null it signifies that the caller can accept any other
9780 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9782 1) \N means not-a-NL: points to a newly created REG_ANY node;
9783 2) \N{}: points to a new NOTHING node;
9784 3) otherwise: points to a new EXACT node containing the resolved
9786 Note that FALSE is returned for single code point sequences if <valuep> is
9790 char * endbrace; /* '}' following the name */
9792 char *endchar; /* Points to '.' or '}' ending cur char in the input
9794 bool has_multiple_chars; /* true if the input stream contains a sequence of
9795 more than one character */
9797 GET_RE_DEBUG_FLAGS_DECL;
9799 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9803 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9805 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9806 * modifier. The other meaning does not */
9807 p = (RExC_flags & RXf_PMf_EXTENDED)
9808 ? regwhite( pRExC_state, RExC_parse )
9811 /* Disambiguate between \N meaning a named character versus \N meaning
9812 * [^\n]. The former is assumed when it can't be the latter. */
9813 if (*p != '{' || regcurly(p, FALSE)) {
9816 /* no bare \N in a charclass */
9817 if (in_char_class) {
9818 vFAIL("\\N in a character class must be a named character: \\N{...}");
9822 nextchar(pRExC_state);
9823 *node_p = reg_node(pRExC_state, REG_ANY);
9824 *flagp |= HASWIDTH|SIMPLE;
9827 Set_Node_Length(*node_p, 1); /* MJD */
9831 /* Here, we have decided it should be a named character or sequence */
9833 /* The test above made sure that the next real character is a '{', but
9834 * under the /x modifier, it could be separated by space (or a comment and
9835 * \n) and this is not allowed (for consistency with \x{...} and the
9836 * tokenizer handling of \N{NAME}). */
9837 if (*RExC_parse != '{') {
9838 vFAIL("Missing braces on \\N{}");
9841 RExC_parse++; /* Skip past the '{' */
9843 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9844 || ! (endbrace == RExC_parse /* nothing between the {} */
9845 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9846 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9848 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9849 vFAIL("\\N{NAME} must be resolved by the lexer");
9852 if (endbrace == RExC_parse) { /* empty: \N{} */
9855 *node_p = reg_node(pRExC_state,NOTHING);
9857 else if (in_char_class) {
9858 if (SIZE_ONLY && in_char_class) {
9860 RExC_parse++; /* Position after the "}" */
9861 vFAIL("Zero length \\N{}");
9864 ckWARNreg(RExC_parse,
9865 "Ignoring zero length \\N{} in character class");
9873 nextchar(pRExC_state);
9877 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9878 RExC_parse += 2; /* Skip past the 'U+' */
9880 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9882 /* Code points are separated by dots. If none, there is only one code
9883 * point, and is terminated by the brace */
9884 has_multiple_chars = (endchar < endbrace);
9886 if (valuep && (! has_multiple_chars || in_char_class)) {
9887 /* We only pay attention to the first char of
9888 multichar strings being returned in char classes. I kinda wonder
9889 if this makes sense as it does change the behaviour
9890 from earlier versions, OTOH that behaviour was broken
9891 as well. XXX Solution is to recharacterize as
9892 [rest-of-class]|multi1|multi2... */
9894 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9895 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9896 | PERL_SCAN_DISALLOW_PREFIX
9897 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9899 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9901 /* The tokenizer should have guaranteed validity, but it's possible to
9902 * bypass it by using single quoting, so check */
9903 if (length_of_hex == 0
9904 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9906 RExC_parse += length_of_hex; /* Includes all the valid */
9907 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9908 ? UTF8SKIP(RExC_parse)
9910 /* Guard against malformed utf8 */
9911 if (RExC_parse >= endchar) {
9912 RExC_parse = endchar;
9914 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9917 if (in_char_class && has_multiple_chars) {
9919 RExC_parse = endbrace;
9920 vFAIL("\\N{} in character class restricted to one character");
9923 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9927 RExC_parse = endbrace + 1;
9929 else if (! node_p || ! has_multiple_chars) {
9931 /* Here, the input is legal, but not according to the caller's
9932 * options. We fail without advancing the parse, so that the
9933 * caller can try again */
9939 /* What is done here is to convert this to a sub-pattern of the form
9940 * (?:\x{char1}\x{char2}...)
9941 * and then call reg recursively. That way, it retains its atomicness,
9942 * while not having to worry about special handling that some code
9943 * points may have. toke.c has converted the original Unicode values
9944 * to native, so that we can just pass on the hex values unchanged. We
9945 * do have to set a flag to keep recoding from happening in the
9948 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9950 char *orig_end = RExC_end;
9953 while (RExC_parse < endbrace) {
9955 /* Convert to notation the rest of the code understands */
9956 sv_catpv(substitute_parse, "\\x{");
9957 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9958 sv_catpv(substitute_parse, "}");
9960 /* Point to the beginning of the next character in the sequence. */
9961 RExC_parse = endchar + 1;
9962 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9964 sv_catpv(substitute_parse, ")");
9966 RExC_parse = SvPV(substitute_parse, len);
9968 /* Don't allow empty number */
9970 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9972 RExC_end = RExC_parse + len;
9974 /* The values are Unicode, and therefore not subject to recoding */
9975 RExC_override_recoding = 1;
9977 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
9978 if (flags & RESTART_UTF8) {
9979 *flagp = RESTART_UTF8;
9982 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
9985 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9987 RExC_parse = endbrace;
9988 RExC_end = orig_end;
9989 RExC_override_recoding = 0;
9991 nextchar(pRExC_state);
10001 * It returns the code point in utf8 for the value in *encp.
10002 * value: a code value in the source encoding
10003 * encp: a pointer to an Encode object
10005 * If the result from Encode is not a single character,
10006 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10009 S_reg_recode(pTHX_ const char value, SV **encp)
10012 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10013 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10014 const STRLEN newlen = SvCUR(sv);
10015 UV uv = UNICODE_REPLACEMENT;
10017 PERL_ARGS_ASSERT_REG_RECODE;
10021 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10024 if (!newlen || numlen != newlen) {
10025 uv = UNICODE_REPLACEMENT;
10031 PERL_STATIC_INLINE U8
10032 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10036 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10042 op = get_regex_charset(RExC_flags);
10043 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10044 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10045 been, so there is no hole */
10048 return op + EXACTF;
10051 PERL_STATIC_INLINE void
10052 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10054 /* This knows the details about sizing an EXACTish node, setting flags for
10055 * it (by setting <*flagp>, and potentially populating it with a single
10058 * If <len> (the length in bytes) is non-zero, this function assumes that
10059 * the node has already been populated, and just does the sizing. In this
10060 * case <code_point> should be the final code point that has already been
10061 * placed into the node. This value will be ignored except that under some
10062 * circumstances <*flagp> is set based on it.
10064 * If <len> is zero, the function assumes that the node is to contain only
10065 * the single character given by <code_point> and calculates what <len>
10066 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10067 * additionally will populate the node's STRING with <code_point>, if <len>
10068 * is 0. In both cases <*flagp> is appropriately set
10070 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
10071 * folded (the latter only when the rules indicate it can match 'ss') */
10073 bool len_passed_in = cBOOL(len != 0);
10074 U8 character[UTF8_MAXBYTES_CASE+1];
10076 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10078 if (! len_passed_in) {
10081 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10084 uvchr_to_utf8( character, code_point);
10085 len = UTF8SKIP(character);
10089 || code_point != LATIN_SMALL_LETTER_SHARP_S
10090 || ASCII_FOLD_RESTRICTED
10091 || ! AT_LEAST_UNI_SEMANTICS)
10093 *character = (U8) code_point;
10098 *(character + 1) = 's';
10104 RExC_size += STR_SZ(len);
10107 RExC_emit += STR_SZ(len);
10108 STR_LEN(node) = len;
10109 if (! len_passed_in) {
10110 Copy((char *) character, STRING(node), len, char);
10114 *flagp |= HASWIDTH;
10116 /* A single character node is SIMPLE, except for the special-cased SHARP S
10118 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10119 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10120 || ! FOLD || ! DEPENDS_SEMANTICS))
10127 - regatom - the lowest level
10129 Try to identify anything special at the start of the pattern. If there
10130 is, then handle it as required. This may involve generating a single regop,
10131 such as for an assertion; or it may involve recursing, such as to
10132 handle a () structure.
10134 If the string doesn't start with something special then we gobble up
10135 as much literal text as we can.
10137 Once we have been able to handle whatever type of thing started the
10138 sequence, we return.
10140 Note: we have to be careful with escapes, as they can be both literal
10141 and special, and in the case of \10 and friends, context determines which.
10143 A summary of the code structure is:
10145 switch (first_byte) {
10146 cases for each special:
10147 handle this special;
10150 switch (2nd byte) {
10151 cases for each unambiguous special:
10152 handle this special;
10154 cases for each ambigous special/literal:
10156 if (special) handle here
10158 default: // unambiguously literal:
10161 default: // is a literal char
10164 create EXACTish node for literal;
10165 while (more input and node isn't full) {
10166 switch (input_byte) {
10167 cases for each special;
10168 make sure parse pointer is set so that the next call to
10169 regatom will see this special first
10170 goto loopdone; // EXACTish node terminated by prev. char
10172 append char to EXACTISH node;
10174 get next input byte;
10178 return the generated node;
10180 Specifically there are two separate switches for handling
10181 escape sequences, with the one for handling literal escapes requiring
10182 a dummy entry for all of the special escapes that are actually handled
10185 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10187 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10189 Otherwise does not return NULL.
10193 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10196 regnode *ret = NULL;
10198 char *parse_start = RExC_parse;
10202 GET_RE_DEBUG_FLAGS_DECL;
10204 *flagp = WORST; /* Tentatively. */
10206 DEBUG_PARSE("atom");
10208 PERL_ARGS_ASSERT_REGATOM;
10211 switch ((U8)*RExC_parse) {
10213 RExC_seen_zerolen++;
10214 nextchar(pRExC_state);
10215 if (RExC_flags & RXf_PMf_MULTILINE)
10216 ret = reg_node(pRExC_state, MBOL);
10217 else if (RExC_flags & RXf_PMf_SINGLELINE)
10218 ret = reg_node(pRExC_state, SBOL);
10220 ret = reg_node(pRExC_state, BOL);
10221 Set_Node_Length(ret, 1); /* MJD */
10224 nextchar(pRExC_state);
10226 RExC_seen_zerolen++;
10227 if (RExC_flags & RXf_PMf_MULTILINE)
10228 ret = reg_node(pRExC_state, MEOL);
10229 else if (RExC_flags & RXf_PMf_SINGLELINE)
10230 ret = reg_node(pRExC_state, SEOL);
10232 ret = reg_node(pRExC_state, EOL);
10233 Set_Node_Length(ret, 1); /* MJD */
10236 nextchar(pRExC_state);
10237 if (RExC_flags & RXf_PMf_SINGLELINE)
10238 ret = reg_node(pRExC_state, SANY);
10240 ret = reg_node(pRExC_state, REG_ANY);
10241 *flagp |= HASWIDTH|SIMPLE;
10243 Set_Node_Length(ret, 1); /* MJD */
10247 char * const oregcomp_parse = ++RExC_parse;
10248 ret = regclass(pRExC_state, flagp,depth+1,
10249 FALSE, /* means parse the whole char class */
10250 TRUE, /* allow multi-char folds */
10251 FALSE, /* don't silence non-portable warnings. */
10253 if (*RExC_parse != ']') {
10254 RExC_parse = oregcomp_parse;
10255 vFAIL("Unmatched [");
10258 if (*flagp & RESTART_UTF8)
10260 FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10263 nextchar(pRExC_state);
10264 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10268 nextchar(pRExC_state);
10269 ret = reg(pRExC_state, 1, &flags,depth+1);
10271 if (flags & TRYAGAIN) {
10272 if (RExC_parse == RExC_end) {
10273 /* Make parent create an empty node if needed. */
10274 *flagp |= TRYAGAIN;
10279 if (flags & RESTART_UTF8) {
10280 *flagp = RESTART_UTF8;
10283 FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
10285 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10289 if (flags & TRYAGAIN) {
10290 *flagp |= TRYAGAIN;
10293 vFAIL("Internal urp");
10294 /* Supposed to be caught earlier. */
10297 if (!regcurly(RExC_parse, FALSE)) {
10306 vFAIL("Quantifier follows nothing");
10311 This switch handles escape sequences that resolve to some kind
10312 of special regop and not to literal text. Escape sequnces that
10313 resolve to literal text are handled below in the switch marked
10316 Every entry in this switch *must* have a corresponding entry
10317 in the literal escape switch. However, the opposite is not
10318 required, as the default for this switch is to jump to the
10319 literal text handling code.
10321 switch ((U8)*++RExC_parse) {
10323 /* Special Escapes */
10325 RExC_seen_zerolen++;
10326 ret = reg_node(pRExC_state, SBOL);
10328 goto finish_meta_pat;
10330 ret = reg_node(pRExC_state, GPOS);
10331 RExC_seen |= REG_SEEN_GPOS;
10333 goto finish_meta_pat;
10335 RExC_seen_zerolen++;
10336 ret = reg_node(pRExC_state, KEEPS);
10338 /* XXX:dmq : disabling in-place substitution seems to
10339 * be necessary here to avoid cases of memory corruption, as
10340 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10342 RExC_seen |= REG_SEEN_LOOKBEHIND;
10343 goto finish_meta_pat;
10345 ret = reg_node(pRExC_state, SEOL);
10347 RExC_seen_zerolen++; /* Do not optimize RE away */
10348 goto finish_meta_pat;
10350 ret = reg_node(pRExC_state, EOS);
10352 RExC_seen_zerolen++; /* Do not optimize RE away */
10353 goto finish_meta_pat;
10355 ret = reg_node(pRExC_state, CANY);
10356 RExC_seen |= REG_SEEN_CANY;
10357 *flagp |= HASWIDTH|SIMPLE;
10358 goto finish_meta_pat;
10360 ret = reg_node(pRExC_state, CLUMP);
10361 *flagp |= HASWIDTH;
10362 goto finish_meta_pat;
10368 arg = ANYOF_WORDCHAR;
10372 RExC_seen_zerolen++;
10373 RExC_seen |= REG_SEEN_LOOKBEHIND;
10374 op = BOUND + get_regex_charset(RExC_flags);
10375 if (op > BOUNDA) { /* /aa is same as /a */
10378 ret = reg_node(pRExC_state, op);
10379 FLAGS(ret) = get_regex_charset(RExC_flags);
10381 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10382 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10384 goto finish_meta_pat;
10386 RExC_seen_zerolen++;
10387 RExC_seen |= REG_SEEN_LOOKBEHIND;
10388 op = NBOUND + get_regex_charset(RExC_flags);
10389 if (op > NBOUNDA) { /* /aa is same as /a */
10392 ret = reg_node(pRExC_state, op);
10393 FLAGS(ret) = get_regex_charset(RExC_flags);
10395 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10396 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10398 goto finish_meta_pat;
10408 ret = reg_node(pRExC_state, LNBREAK);
10409 *flagp |= HASWIDTH|SIMPLE;
10410 goto finish_meta_pat;
10418 goto join_posix_op_known;
10424 arg = ANYOF_VERTWS;
10426 goto join_posix_op_known;
10436 op = POSIXD + get_regex_charset(RExC_flags);
10437 if (op > POSIXA) { /* /aa is same as /a */
10441 join_posix_op_known:
10444 op += NPOSIXD - POSIXD;
10447 ret = reg_node(pRExC_state, op);
10449 FLAGS(ret) = namedclass_to_classnum(arg);
10452 *flagp |= HASWIDTH|SIMPLE;
10456 nextchar(pRExC_state);
10457 Set_Node_Length(ret, 2); /* MJD */
10463 char* parse_start = RExC_parse - 2;
10468 ret = regclass(pRExC_state, flagp,depth+1,
10469 TRUE, /* means just parse this element */
10470 FALSE, /* don't allow multi-char folds */
10471 FALSE, /* don't silence non-portable warnings.
10472 It would be a bug if these returned
10475 /* regclass() can only return RESTART_UTF8 if multi-char folds
10478 FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10483 Set_Node_Offset(ret, parse_start + 2);
10484 Set_Node_Cur_Length(ret);
10485 nextchar(pRExC_state);
10489 /* Handle \N and \N{NAME} with multiple code points here and not
10490 * below because it can be multicharacter. join_exact() will join
10491 * them up later on. Also this makes sure that things like
10492 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10493 * The options to the grok function call causes it to fail if the
10494 * sequence is just a single code point. We then go treat it as
10495 * just another character in the current EXACT node, and hence it
10496 * gets uniform treatment with all the other characters. The
10497 * special treatment for quantifiers is not needed for such single
10498 * character sequences */
10500 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10501 FALSE /* not strict */ )) {
10502 if (*flagp & RESTART_UTF8)
10508 case 'k': /* Handle \k<NAME> and \k'NAME' */
10511 char ch= RExC_parse[1];
10512 if (ch != '<' && ch != '\'' && ch != '{') {
10514 vFAIL2("Sequence %.2s... not terminated",parse_start);
10516 /* this pretty much dupes the code for (?P=...) in reg(), if
10517 you change this make sure you change that */
10518 char* name_start = (RExC_parse += 2);
10520 SV *sv_dat = reg_scan_name(pRExC_state,
10521 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10522 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10523 if (RExC_parse == name_start || *RExC_parse != ch)
10524 vFAIL2("Sequence %.3s... not terminated",parse_start);
10527 num = add_data( pRExC_state, 1, "S" );
10528 RExC_rxi->data->data[num]=(void*)sv_dat;
10529 SvREFCNT_inc_simple_void(sv_dat);
10533 ret = reganode(pRExC_state,
10536 : (ASCII_FOLD_RESTRICTED)
10538 : (AT_LEAST_UNI_SEMANTICS)
10544 *flagp |= HASWIDTH;
10546 /* override incorrect value set in reganode MJD */
10547 Set_Node_Offset(ret, parse_start+1);
10548 Set_Node_Cur_Length(ret); /* MJD */
10549 nextchar(pRExC_state);
10555 case '1': case '2': case '3': case '4':
10556 case '5': case '6': case '7': case '8': case '9':
10559 bool isg = *RExC_parse == 'g';
10564 if (*RExC_parse == '{') {
10568 if (*RExC_parse == '-') {
10572 if (hasbrace && !isDIGIT(*RExC_parse)) {
10573 if (isrel) RExC_parse--;
10575 goto parse_named_seq;
10577 num = atoi(RExC_parse);
10578 if (isg && num == 0)
10579 vFAIL("Reference to invalid group 0");
10581 num = RExC_npar - num;
10583 vFAIL("Reference to nonexistent or unclosed group");
10585 if (!isg && num > 9 && num >= RExC_npar)
10586 /* Probably a character specified in octal, e.g. \35 */
10589 char * const parse_start = RExC_parse - 1; /* MJD */
10590 while (isDIGIT(*RExC_parse))
10592 if (parse_start == RExC_parse - 1)
10593 vFAIL("Unterminated \\g... pattern");
10595 if (*RExC_parse != '}')
10596 vFAIL("Unterminated \\g{...} pattern");
10600 if (num > (I32)RExC_rx->nparens)
10601 vFAIL("Reference to nonexistent group");
10604 ret = reganode(pRExC_state,
10607 : (ASCII_FOLD_RESTRICTED)
10609 : (AT_LEAST_UNI_SEMANTICS)
10615 *flagp |= HASWIDTH;
10617 /* override incorrect value set in reganode MJD */
10618 Set_Node_Offset(ret, parse_start+1);
10619 Set_Node_Cur_Length(ret); /* MJD */
10621 nextchar(pRExC_state);
10626 if (RExC_parse >= RExC_end)
10627 FAIL("Trailing \\");
10630 /* Do not generate "unrecognized" warnings here, we fall
10631 back into the quick-grab loop below */
10638 if (RExC_flags & RXf_PMf_EXTENDED) {
10639 if ( reg_skipcomment( pRExC_state ) )
10646 parse_start = RExC_parse - 1;
10655 #define MAX_NODE_STRING_SIZE 127
10656 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10658 U8 upper_parse = MAX_NODE_STRING_SIZE;
10661 bool next_is_quantifier;
10662 char * oldp = NULL;
10664 /* If a folding node contains only code points that don't
10665 * participate in folds, it can be changed into an EXACT node,
10666 * which allows the optimizer more things to look for */
10670 node_type = compute_EXACTish(pRExC_state);
10671 ret = reg_node(pRExC_state, node_type);
10673 /* In pass1, folded, we use a temporary buffer instead of the
10674 * actual node, as the node doesn't exist yet */
10675 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10681 /* We do the EXACTFish to EXACT node only if folding, and not if in
10682 * locale, as whether a character folds or not isn't known until
10684 maybe_exact = FOLD && ! LOC;
10686 /* XXX The node can hold up to 255 bytes, yet this only goes to
10687 * 127. I (khw) do not know why. Keeping it somewhat less than
10688 * 255 allows us to not have to worry about overflow due to
10689 * converting to utf8 and fold expansion, but that value is
10690 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10691 * split up by this limit into a single one using the real max of
10692 * 255. Even at 127, this breaks under rare circumstances. If
10693 * folding, we do not want to split a node at a character that is a
10694 * non-final in a multi-char fold, as an input string could just
10695 * happen to want to match across the node boundary. The join
10696 * would solve that problem if the join actually happens. But a
10697 * series of more than two nodes in a row each of 127 would cause
10698 * the first join to succeed to get to 254, but then there wouldn't
10699 * be room for the next one, which could at be one of those split
10700 * multi-char folds. I don't know of any fool-proof solution. One
10701 * could back off to end with only a code point that isn't such a
10702 * non-final, but it is possible for there not to be any in the
10704 for (p = RExC_parse - 1;
10705 len < upper_parse && p < RExC_end;
10710 if (RExC_flags & RXf_PMf_EXTENDED)
10711 p = regwhite( pRExC_state, p );
10722 /* Literal Escapes Switch
10724 This switch is meant to handle escape sequences that
10725 resolve to a literal character.
10727 Every escape sequence that represents something
10728 else, like an assertion or a char class, is handled
10729 in the switch marked 'Special Escapes' above in this
10730 routine, but also has an entry here as anything that
10731 isn't explicitly mentioned here will be treated as
10732 an unescaped equivalent literal.
10735 switch ((U8)*++p) {
10736 /* These are all the special escapes. */
10737 case 'A': /* Start assertion */
10738 case 'b': case 'B': /* Word-boundary assertion*/
10739 case 'C': /* Single char !DANGEROUS! */
10740 case 'd': case 'D': /* digit class */
10741 case 'g': case 'G': /* generic-backref, pos assertion */
10742 case 'h': case 'H': /* HORIZWS */
10743 case 'k': case 'K': /* named backref, keep marker */
10744 case 'p': case 'P': /* Unicode property */
10745 case 'R': /* LNBREAK */
10746 case 's': case 'S': /* space class */
10747 case 'v': case 'V': /* VERTWS */
10748 case 'w': case 'W': /* word class */
10749 case 'X': /* eXtended Unicode "combining character sequence" */
10750 case 'z': case 'Z': /* End of line/string assertion */
10754 /* Anything after here is an escape that resolves to a
10755 literal. (Except digits, which may or may not)
10761 case 'N': /* Handle a single-code point named character. */
10762 /* The options cause it to fail if a multiple code
10763 * point sequence. Handle those in the switch() above
10765 RExC_parse = p + 1;
10766 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10767 flagp, depth, FALSE,
10768 FALSE /* not strict */ ))
10770 if (*flagp & RESTART_UTF8)
10771 FAIL("panic: grok_bslash_N set RESTART_UTF8");
10772 RExC_parse = p = oldp;
10776 if (ender > 0xff) {
10793 ender = ASCII_TO_NATIVE('\033');
10797 ender = ASCII_TO_NATIVE('\007');
10803 const char* error_msg;
10805 bool valid = grok_bslash_o(&p,
10808 TRUE, /* out warnings */
10809 FALSE, /* not strict */
10810 TRUE, /* Output warnings
10815 RExC_parse = p; /* going to die anyway; point
10816 to exact spot of failure */
10820 if (PL_encoding && ender < 0x100) {
10821 goto recode_encoding;
10823 if (ender > 0xff) {
10830 UV result = UV_MAX; /* initialize to erroneous
10832 const char* error_msg;
10834 bool valid = grok_bslash_x(&p,
10837 TRUE, /* out warnings */
10838 FALSE, /* not strict */
10839 TRUE, /* Output warnings
10844 RExC_parse = p; /* going to die anyway; point
10845 to exact spot of failure */
10850 if (PL_encoding && ender < 0x100) {
10851 goto recode_encoding;
10853 if (ender > 0xff) {
10860 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10862 case '0': case '1': case '2': case '3':case '4':
10863 case '5': case '6': case '7':
10865 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10867 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10869 ender = grok_oct(p, &numlen, &flags, NULL);
10870 if (ender > 0xff) {
10874 if (SIZE_ONLY /* like \08, \178 */
10877 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10879 reg_warn_non_literal_string(
10881 form_short_octal_warning(p, numlen));
10884 else { /* Not to be treated as an octal constant, go
10889 if (PL_encoding && ender < 0x100)
10890 goto recode_encoding;
10893 if (! RExC_override_recoding) {
10894 SV* enc = PL_encoding;
10895 ender = reg_recode((const char)(U8)ender, &enc);
10896 if (!enc && SIZE_ONLY)
10897 ckWARNreg(p, "Invalid escape in the specified encoding");
10903 FAIL("Trailing \\");
10906 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10907 /* Include any { following the alpha to emphasize
10908 * that it could be part of an escape at some point
10910 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10911 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10913 goto normal_default;
10914 } /* End of switch on '\' */
10916 default: /* A literal character */
10919 && RExC_flags & RXf_PMf_EXTENDED
10920 && ckWARN(WARN_DEPRECATED)
10921 && is_PATWS_non_low(p, UTF))
10923 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
10924 "Escape literal pattern white space under /x");
10928 if (UTF8_IS_START(*p) && UTF) {
10930 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10931 &numlen, UTF8_ALLOW_DEFAULT);
10937 } /* End of switch on the literal */
10939 /* Here, have looked at the literal character and <ender>
10940 * contains its ordinal, <p> points to the character after it
10943 if ( RExC_flags & RXf_PMf_EXTENDED)
10944 p = regwhite( pRExC_state, p );
10946 /* If the next thing is a quantifier, it applies to this
10947 * character only, which means that this character has to be in
10948 * its own node and can't just be appended to the string in an
10949 * existing node, so if there are already other characters in
10950 * the node, close the node with just them, and set up to do
10951 * this character again next time through, when it will be the
10952 * only thing in its new node */
10953 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10961 /* See comments for join_exact() as to why we fold
10962 * this non-UTF at compile time */
10963 || (node_type == EXACTFU
10964 && ender == LATIN_SMALL_LETTER_SHARP_S))
10968 /* Prime the casefolded buffer. Locale rules, which
10969 * apply only to code points < 256, aren't known until
10970 * execution, so for them, just output the original
10971 * character using utf8. If we start to fold non-UTF
10972 * patterns, be sure to update join_exact() */
10973 if (LOC && ender < 256) {
10974 if (UNI_IS_INVARIANT(ender)) {
10978 *s = UTF8_TWO_BYTE_HI(ender);
10979 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10984 UV folded = _to_uni_fold_flags(
10989 | ((LOC) ? FOLD_FLAGS_LOCALE
10990 : (ASCII_FOLD_RESTRICTED)
10991 ? FOLD_FLAGS_NOMIX_ASCII
10995 /* If this node only contains non-folding code
10996 * points so far, see if this new one is also
10999 if (folded != ender) {
11000 maybe_exact = FALSE;
11003 /* Here the fold is the original; we have
11004 * to check further to see if anything
11006 if (! PL_utf8_foldable) {
11007 SV* swash = swash_init("utf8",
11009 &PL_sv_undef, 1, 0);
11011 _get_swash_invlist(swash);
11012 SvREFCNT_dec_NN(swash);
11014 if (_invlist_contains_cp(PL_utf8_foldable,
11017 maybe_exact = FALSE;
11025 /* The loop increments <len> each time, as all but this
11026 * path (and the one just below for UTF) through it add
11027 * a single byte to the EXACTish node. But this one
11028 * has changed len to be the correct final value, so
11029 * subtract one to cancel out the increment that
11031 len += foldlen - 1;
11034 *(s++) = (char) ender;
11035 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
11039 const STRLEN unilen = reguni(pRExC_state, ender, s);
11045 /* See comment just above for - 1 */
11049 REGC((char)ender, s++);
11052 if (next_is_quantifier) {
11054 /* Here, the next input is a quantifier, and to get here,
11055 * the current character is the only one in the node.
11056 * Also, here <len> doesn't include the final byte for this
11062 } /* End of loop through literal characters */
11064 /* Here we have either exhausted the input or ran out of room in
11065 * the node. (If we encountered a character that can't be in the
11066 * node, transfer is made directly to <loopdone>, and so we
11067 * wouldn't have fallen off the end of the loop.) In the latter
11068 * case, we artificially have to split the node into two, because
11069 * we just don't have enough space to hold everything. This
11070 * creates a problem if the final character participates in a
11071 * multi-character fold in the non-final position, as a match that
11072 * should have occurred won't, due to the way nodes are matched,
11073 * and our artificial boundary. So back off until we find a non-
11074 * problematic character -- one that isn't at the beginning or
11075 * middle of such a fold. (Either it doesn't participate in any
11076 * folds, or appears only in the final position of all the folds it
11077 * does participate in.) A better solution with far fewer false
11078 * positives, and that would fill the nodes more completely, would
11079 * be to actually have available all the multi-character folds to
11080 * test against, and to back-off only far enough to be sure that
11081 * this node isn't ending with a partial one. <upper_parse> is set
11082 * further below (if we need to reparse the node) to include just
11083 * up through that final non-problematic character that this code
11084 * identifies, so when it is set to less than the full node, we can
11085 * skip the rest of this */
11086 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11088 const STRLEN full_len = len;
11090 assert(len >= MAX_NODE_STRING_SIZE);
11092 /* Here, <s> points to the final byte of the final character.
11093 * Look backwards through the string until find a non-
11094 * problematic character */
11098 /* These two have no multi-char folds to non-UTF characters
11100 if (ASCII_FOLD_RESTRICTED || LOC) {
11104 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11108 if (! PL_NonL1NonFinalFold) {
11109 PL_NonL1NonFinalFold = _new_invlist_C_array(
11110 NonL1_Perl_Non_Final_Folds_invlist);
11113 /* Point to the first byte of the final character */
11114 s = (char *) utf8_hop((U8 *) s, -1);
11116 while (s >= s0) { /* Search backwards until find
11117 non-problematic char */
11118 if (UTF8_IS_INVARIANT(*s)) {
11120 /* There are no ascii characters that participate
11121 * in multi-char folds under /aa. In EBCDIC, the
11122 * non-ascii invariants are all control characters,
11123 * so don't ever participate in any folds. */
11124 if (ASCII_FOLD_RESTRICTED
11125 || ! IS_NON_FINAL_FOLD(*s))
11130 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11132 /* No Latin1 characters participate in multi-char
11133 * folds under /l */
11135 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11141 else if (! _invlist_contains_cp(
11142 PL_NonL1NonFinalFold,
11143 valid_utf8_to_uvchr((U8 *) s, NULL)))
11148 /* Here, the current character is problematic in that
11149 * it does occur in the non-final position of some
11150 * fold, so try the character before it, but have to
11151 * special case the very first byte in the string, so
11152 * we don't read outside the string */
11153 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11154 } /* End of loop backwards through the string */
11156 /* If there were only problematic characters in the string,
11157 * <s> will point to before s0, in which case the length
11158 * should be 0, otherwise include the length of the
11159 * non-problematic character just found */
11160 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11163 /* Here, have found the final character, if any, that is
11164 * non-problematic as far as ending the node without splitting
11165 * it across a potential multi-char fold. <len> contains the
11166 * number of bytes in the node up-to and including that
11167 * character, or is 0 if there is no such character, meaning
11168 * the whole node contains only problematic characters. In
11169 * this case, give up and just take the node as-is. We can't
11175 /* Here, the node does contain some characters that aren't
11176 * problematic. If one such is the final character in the
11177 * node, we are done */
11178 if (len == full_len) {
11181 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11183 /* If the final character is problematic, but the
11184 * penultimate is not, back-off that last character to
11185 * later start a new node with it */
11190 /* Here, the final non-problematic character is earlier
11191 * in the input than the penultimate character. What we do
11192 * is reparse from the beginning, going up only as far as
11193 * this final ok one, thus guaranteeing that the node ends
11194 * in an acceptable character. The reason we reparse is
11195 * that we know how far in the character is, but we don't
11196 * know how to correlate its position with the input parse.
11197 * An alternate implementation would be to build that
11198 * correlation as we go along during the original parse,
11199 * but that would entail extra work for every node, whereas
11200 * this code gets executed only when the string is too
11201 * large for the node, and the final two characters are
11202 * problematic, an infrequent occurrence. Yet another
11203 * possible strategy would be to save the tail of the
11204 * string, and the next time regatom is called, initialize
11205 * with that. The problem with this is that unless you
11206 * back off one more character, you won't be guaranteed
11207 * regatom will get called again, unless regbranch,
11208 * regpiece ... are also changed. If you do back off that
11209 * extra character, so that there is input guaranteed to
11210 * force calling regatom, you can't handle the case where
11211 * just the first character in the node is acceptable. I
11212 * (khw) decided to try this method which doesn't have that
11213 * pitfall; if performance issues are found, we can do a
11214 * combination of the current approach plus that one */
11220 } /* End of verifying node ends with an appropriate char */
11222 loopdone: /* Jumped to when encounters something that shouldn't be in
11225 /* If 'maybe_exact' is still set here, means there are no
11226 * code points in the node that participate in folds */
11227 if (FOLD && maybe_exact) {
11231 /* I (khw) don't know if you can get here with zero length, but the
11232 * old code handled this situation by creating a zero-length EXACT
11233 * node. Might as well be NOTHING instead */
11238 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11241 RExC_parse = p - 1;
11242 Set_Node_Cur_Length(ret); /* MJD */
11243 nextchar(pRExC_state);
11245 /* len is STRLEN which is unsigned, need to copy to signed */
11248 vFAIL("Internal disaster");
11251 } /* End of label 'defchar:' */
11253 } /* End of giant switch on input character */
11259 S_regwhite( RExC_state_t *pRExC_state, char *p )
11261 const char *e = RExC_end;
11263 PERL_ARGS_ASSERT_REGWHITE;
11268 else if (*p == '#') {
11271 if (*p++ == '\n') {
11277 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11286 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11288 /* Returns the next non-pattern-white space, non-comment character (the
11289 * latter only if 'recognize_comment is true) in the string p, which is
11290 * ended by RExC_end. If there is no line break ending a comment,
11291 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11292 const char *e = RExC_end;
11294 PERL_ARGS_ASSERT_REGPATWS;
11298 if ((len = is_PATWS_safe(p, e, UTF))) {
11301 else if (recognize_comment && *p == '#') {
11305 if (is_LNBREAK_safe(p, e, UTF)) {
11311 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11319 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11320 Character classes ([:foo:]) can also be negated ([:^foo:]).
11321 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11322 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11323 but trigger failures because they are currently unimplemented. */
11325 #define POSIXCC_DONE(c) ((c) == ':')
11326 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11327 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11329 PERL_STATIC_INLINE I32
11330 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11333 I32 namedclass = OOB_NAMEDCLASS;
11335 PERL_ARGS_ASSERT_REGPPOSIXCC;
11337 if (value == '[' && RExC_parse + 1 < RExC_end &&
11338 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11339 POSIXCC(UCHARAT(RExC_parse)))
11341 const char c = UCHARAT(RExC_parse);
11342 char* const s = RExC_parse++;
11344 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11346 if (RExC_parse == RExC_end) {
11349 /* Try to give a better location for the error (than the end of
11350 * the string) by looking for the matching ']' */
11352 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11355 vFAIL2("Unmatched '%c' in POSIX class", c);
11357 /* Grandfather lone [:, [=, [. */
11361 const char* const t = RExC_parse++; /* skip over the c */
11364 if (UCHARAT(RExC_parse) == ']') {
11365 const char *posixcc = s + 1;
11366 RExC_parse++; /* skip over the ending ] */
11369 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11370 const I32 skip = t - posixcc;
11372 /* Initially switch on the length of the name. */
11375 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11376 this is the Perl \w
11378 namedclass = ANYOF_WORDCHAR;
11381 /* Names all of length 5. */
11382 /* alnum alpha ascii blank cntrl digit graph lower
11383 print punct space upper */
11384 /* Offset 4 gives the best switch position. */
11385 switch (posixcc[4]) {
11387 if (memEQ(posixcc, "alph", 4)) /* alpha */
11388 namedclass = ANYOF_ALPHA;
11391 if (memEQ(posixcc, "spac", 4)) /* space */
11392 namedclass = ANYOF_PSXSPC;
11395 if (memEQ(posixcc, "grap", 4)) /* graph */
11396 namedclass = ANYOF_GRAPH;
11399 if (memEQ(posixcc, "asci", 4)) /* ascii */
11400 namedclass = ANYOF_ASCII;
11403 if (memEQ(posixcc, "blan", 4)) /* blank */
11404 namedclass = ANYOF_BLANK;
11407 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11408 namedclass = ANYOF_CNTRL;
11411 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11412 namedclass = ANYOF_ALPHANUMERIC;
11415 if (memEQ(posixcc, "lowe", 4)) /* lower */
11416 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11417 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11418 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11421 if (memEQ(posixcc, "digi", 4)) /* digit */
11422 namedclass = ANYOF_DIGIT;
11423 else if (memEQ(posixcc, "prin", 4)) /* print */
11424 namedclass = ANYOF_PRINT;
11425 else if (memEQ(posixcc, "punc", 4)) /* punct */
11426 namedclass = ANYOF_PUNCT;
11431 if (memEQ(posixcc, "xdigit", 6))
11432 namedclass = ANYOF_XDIGIT;
11436 if (namedclass == OOB_NAMEDCLASS)
11437 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11440 /* The #defines are structured so each complement is +1 to
11441 * the normal one */
11445 assert (posixcc[skip] == ':');
11446 assert (posixcc[skip+1] == ']');
11447 } else if (!SIZE_ONLY) {
11448 /* [[=foo=]] and [[.foo.]] are still future. */
11450 /* adjust RExC_parse so the warning shows after
11451 the class closes */
11452 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11454 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11457 /* Maternal grandfather:
11458 * "[:" ending in ":" but not in ":]" */
11460 vFAIL("Unmatched '[' in POSIX class");
11463 /* Grandfather lone [:, [=, [. */
11473 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11475 /* This applies some heuristics at the current parse position (which should
11476 * be at a '[') to see if what follows might be intended to be a [:posix:]
11477 * class. It returns true if it really is a posix class, of course, but it
11478 * also can return true if it thinks that what was intended was a posix
11479 * class that didn't quite make it.
11481 * It will return true for
11483 * [:alphanumerics] (as long as the ] isn't followed immediately by a
11484 * ')' indicating the end of the (?[
11485 * [:any garbage including %^&$ punctuation:]
11487 * This is designed to be called only from S_handle_regex_sets; it could be
11488 * easily adapted to be called from the spot at the beginning of regclass()
11489 * that checks to see in a normal bracketed class if the surrounding []
11490 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
11491 * change long-standing behavior, so I (khw) didn't do that */
11492 char* p = RExC_parse + 1;
11493 char first_char = *p;
11495 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11497 assert(*(p - 1) == '[');
11499 if (! POSIXCC(first_char)) {
11504 while (p < RExC_end && isWORDCHAR(*p)) p++;
11506 if (p >= RExC_end) {
11510 if (p - RExC_parse > 2 /* Got at least 1 word character */
11511 && (*p == first_char
11512 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11517 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11520 && p - RExC_parse > 2 /* [:] evaluates to colon;
11521 [::] is a bad posix class. */
11522 && first_char == *(p - 1));
11526 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11527 char * const oregcomp_parse)
11529 /* Handle the (?[...]) construct to do set operations */
11532 UV start, end; /* End points of code point ranges */
11534 char *save_end, *save_parse;
11539 const bool save_fold = FOLD;
11541 GET_RE_DEBUG_FLAGS_DECL;
11543 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11546 vFAIL("(?[...]) not valid in locale");
11548 RExC_uni_semantics = 1;
11550 /* This will return only an ANYOF regnode, or (unlikely) something smaller
11551 * (such as EXACT). Thus we can skip most everything if just sizing. We
11552 * call regclass to handle '[]' so as to not have to reinvent its parsing
11553 * rules here (throwing away the size it computes each time). And, we exit
11554 * upon an unescaped ']' that isn't one ending a regclass. To do both
11555 * these things, we need to realize that something preceded by a backslash
11556 * is escaped, so we have to keep track of backslashes */
11559 Perl_ck_warner_d(aTHX_
11560 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11561 "The regex_sets feature is experimental" REPORT_LOCATION,
11562 (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11564 while (RExC_parse < RExC_end) {
11565 SV* current = NULL;
11566 RExC_parse = regpatws(pRExC_state, RExC_parse,
11567 TRUE); /* means recognize comments */
11568 switch (*RExC_parse) {
11572 /* Skip the next byte (which could cause us to end up in
11573 * the middle of a UTF-8 character, but since none of those
11574 * are confusable with anything we currently handle in this
11575 * switch (invariants all), it's safe. We'll just hit the
11576 * default: case next time and keep on incrementing until
11577 * we find one of the invariants we do handle. */
11582 /* If this looks like it is a [:posix:] class, leave the
11583 * parse pointer at the '[' to fool regclass() into
11584 * thinking it is part of a '[[:posix:]]'. That function
11585 * will use strict checking to force a syntax error if it
11586 * doesn't work out to a legitimate class */
11587 bool is_posix_class
11588 = could_it_be_a_POSIX_class(pRExC_state);
11589 if (! is_posix_class) {
11593 /* regclass() can only return RESTART_UTF8 if multi-char
11594 folds are allowed. */
11595 if (!regclass(pRExC_state, flagp,depth+1,
11596 is_posix_class, /* parse the whole char
11597 class only if not a
11599 FALSE, /* don't allow multi-char folds */
11600 TRUE, /* silence non-portable warnings. */
11602 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11605 /* function call leaves parse pointing to the ']', except
11606 * if we faked it */
11607 if (is_posix_class) {
11611 SvREFCNT_dec(current); /* In case it returned something */
11617 if (RExC_parse < RExC_end
11618 && *RExC_parse == ')')
11620 node = reganode(pRExC_state, ANYOF, 0);
11621 RExC_size += ANYOF_SKIP;
11622 nextchar(pRExC_state);
11623 Set_Node_Length(node,
11624 RExC_parse - oregcomp_parse + 1); /* MJD */
11633 FAIL("Syntax error in (?[...])");
11636 /* Pass 2 only after this. Everything in this construct is a
11637 * metacharacter. Operands begin with either a '\' (for an escape
11638 * sequence), or a '[' for a bracketed character class. Any other
11639 * character should be an operator, or parenthesis for grouping. Both
11640 * types of operands are handled by calling regclass() to parse them. It
11641 * is called with a parameter to indicate to return the computed inversion
11642 * list. The parsing here is implemented via a stack. Each entry on the
11643 * stack is a single character representing one of the operators, or the
11644 * '('; or else a pointer to an operand inversion list. */
11646 #define IS_OPERAND(a) (! SvIOK(a))
11648 /* The stack starts empty. It is a syntax error if the first thing parsed
11649 * is a binary operator; everything else is pushed on the stack. When an
11650 * operand is parsed, the top of the stack is examined. If it is a binary
11651 * operator, the item before it should be an operand, and both are replaced
11652 * by the result of doing that operation on the new operand and the one on
11653 * the stack. Thus a sequence of binary operands is reduced to a single
11654 * one before the next one is parsed.
11656 * A unary operator may immediately follow a binary in the input, for
11659 * When an operand is parsed and the top of the stack is a unary operator,
11660 * the operation is performed, and then the stack is rechecked to see if
11661 * this new operand is part of a binary operation; if so, it is handled as
11664 * A '(' is simply pushed on the stack; it is valid only if the stack is
11665 * empty, or the top element of the stack is an operator or another '('
11666 * (for which the parenthesized expression will become an operand). By the
11667 * time the corresponding ')' is parsed everything in between should have
11668 * been parsed and evaluated to a single operand (or else is a syntax
11669 * error), and is handled as a regular operand */
11673 while (RExC_parse < RExC_end) {
11674 I32 top_index = av_tindex(stack);
11676 SV* current = NULL;
11678 /* Skip white space */
11679 RExC_parse = regpatws(pRExC_state, RExC_parse,
11680 TRUE); /* means recognize comments */
11681 if (RExC_parse >= RExC_end) {
11682 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11684 if ((curchar = UCHARAT(RExC_parse)) == ']') {
11691 if (av_tindex(stack) >= 0 /* This makes sure that we can
11692 safely subtract 1 from
11693 RExC_parse in the next clause.
11694 If we have something on the
11695 stack, we have parsed something
11697 && UCHARAT(RExC_parse - 1) == '('
11698 && RExC_parse < RExC_end)
11700 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11701 * This happens when we have some thing like
11703 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11705 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
11707 * Here we would be handling the interpolated
11708 * '$thai_or_lao'. We handle this by a recursive call to
11709 * ourselves which returns the inversion list the
11710 * interpolated expression evaluates to. We use the flags
11711 * from the interpolated pattern. */
11712 U32 save_flags = RExC_flags;
11713 const char * const save_parse = ++RExC_parse;
11715 parse_lparen_question_flags(pRExC_state);
11717 if (RExC_parse == save_parse /* Makes sure there was at
11718 least one flag (or this
11719 embedding wasn't compiled)
11721 || RExC_parse >= RExC_end - 4
11722 || UCHARAT(RExC_parse) != ':'
11723 || UCHARAT(++RExC_parse) != '('
11724 || UCHARAT(++RExC_parse) != '?'
11725 || UCHARAT(++RExC_parse) != '[')
11728 /* In combination with the above, this moves the
11729 * pointer to the point just after the first erroneous
11730 * character (or if there are no flags, to where they
11731 * should have been) */
11732 if (RExC_parse >= RExC_end - 4) {
11733 RExC_parse = RExC_end;
11735 else if (RExC_parse != save_parse) {
11736 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11738 vFAIL("Expecting '(?flags:(?[...'");
11741 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
11742 depth+1, oregcomp_parse);
11744 /* Here, 'current' contains the embedded expression's
11745 * inversion list, and RExC_parse points to the trailing
11746 * ']'; the next character should be the ')' which will be
11747 * paired with the '(' that has been put on the stack, so
11748 * the whole embedded expression reduces to '(operand)' */
11751 RExC_flags = save_flags;
11752 goto handle_operand;
11757 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11758 vFAIL("Unexpected character");
11761 /* regclass() can only return RESTART_UTF8 if multi-char
11762 folds are allowed. */
11763 if (!regclass(pRExC_state, flagp,depth+1,
11764 TRUE, /* means parse just the next thing */
11765 FALSE, /* don't allow multi-char folds */
11766 FALSE, /* don't silence non-portable warnings. */
11768 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11770 /* regclass() will return with parsing just the \ sequence,
11771 * leaving the parse pointer at the next thing to parse */
11773 goto handle_operand;
11775 case '[': /* Is a bracketed character class */
11777 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11779 if (! is_posix_class) {
11783 /* regclass() can only return RESTART_UTF8 if multi-char
11784 folds are allowed. */
11785 if(!regclass(pRExC_state, flagp,depth+1,
11786 is_posix_class, /* parse the whole char class
11787 only if not a posix class */
11788 FALSE, /* don't allow multi-char folds */
11789 FALSE, /* don't silence non-portable warnings. */
11791 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11793 /* function call leaves parse pointing to the ']', except if we
11795 if (is_posix_class) {
11799 goto handle_operand;
11808 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11809 || ! IS_OPERAND(*top_ptr))
11812 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11814 av_push(stack, newSVuv(curchar));
11818 av_push(stack, newSVuv(curchar));
11822 if (top_index >= 0) {
11823 top_ptr = av_fetch(stack, top_index, FALSE);
11825 if (IS_OPERAND(*top_ptr)) {
11827 vFAIL("Unexpected '(' with no preceding operator");
11830 av_push(stack, newSVuv(curchar));
11837 || ! (current = av_pop(stack))
11838 || ! IS_OPERAND(current)
11839 || ! (lparen = av_pop(stack))
11840 || IS_OPERAND(lparen)
11841 || SvUV(lparen) != '(')
11844 vFAIL("Unexpected ')'");
11847 SvREFCNT_dec_NN(lparen);
11854 /* Here, we have an operand to process, in 'current' */
11856 if (top_index < 0) { /* Just push if stack is empty */
11857 av_push(stack, current);
11860 SV* top = av_pop(stack);
11861 char current_operator;
11863 if (IS_OPERAND(top)) {
11864 vFAIL("Operand with no preceding operator");
11866 current_operator = (char) SvUV(top);
11867 switch (current_operator) {
11868 case '(': /* Push the '(' back on followed by the new
11870 av_push(stack, top);
11871 av_push(stack, current);
11872 SvREFCNT_inc(top); /* Counters the '_dec' done
11873 just after the 'break', so
11874 it doesn't get wrongly freed
11879 _invlist_invert(current);
11881 /* Unlike binary operators, the top of the stack,
11882 * now that this unary one has been popped off, may
11883 * legally be an operator, and we now have operand
11886 SvREFCNT_dec_NN(top);
11887 goto handle_operand;
11890 _invlist_intersection(av_pop(stack),
11893 av_push(stack, current);
11898 _invlist_union(av_pop(stack), current, ¤t);
11899 av_push(stack, current);
11903 _invlist_subtract(av_pop(stack), current, ¤t);
11904 av_push(stack, current);
11907 case '^': /* The union minus the intersection */
11913 element = av_pop(stack);
11914 _invlist_union(element, current, &u);
11915 _invlist_intersection(element, current, &i);
11916 _invlist_subtract(u, i, ¤t);
11917 av_push(stack, current);
11918 SvREFCNT_dec_NN(i);
11919 SvREFCNT_dec_NN(u);
11920 SvREFCNT_dec_NN(element);
11925 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11927 SvREFCNT_dec_NN(top);
11931 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11934 if (av_tindex(stack) < 0 /* Was empty */
11935 || ((final = av_pop(stack)) == NULL)
11936 || ! IS_OPERAND(final)
11937 || av_tindex(stack) >= 0) /* More left on stack */
11939 vFAIL("Incomplete expression within '(?[ ])'");
11942 /* Here, 'final' is the resultant inversion list from evaluating the
11943 * expression. Return it if so requested */
11944 if (return_invlist) {
11945 *return_invlist = final;
11949 /* Otherwise generate a resultant node, based on 'final'. regclass() is
11950 * expecting a string of ranges and individual code points */
11951 invlist_iterinit(final);
11952 result_string = newSVpvs("");
11953 while (invlist_iternext(final, &start, &end)) {
11954 if (start == end) {
11955 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
11958 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
11963 save_parse = RExC_parse;
11964 RExC_parse = SvPV(result_string, len);
11965 save_end = RExC_end;
11966 RExC_end = RExC_parse + len;
11968 /* We turn off folding around the call, as the class we have constructed
11969 * already has all folding taken into consideration, and we don't want
11970 * regclass() to add to that */
11971 RExC_flags &= ~RXf_PMf_FOLD;
11972 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
11974 node = regclass(pRExC_state, flagp,depth+1,
11975 FALSE, /* means parse the whole char class */
11976 FALSE, /* don't allow multi-char folds */
11977 TRUE, /* silence non-portable warnings. The above may very
11978 well have generated non-portable code points, but
11979 they're valid on this machine */
11982 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X", flagp);
11984 RExC_flags |= RXf_PMf_FOLD;
11986 RExC_parse = save_parse + 1;
11987 RExC_end = save_end;
11988 SvREFCNT_dec_NN(final);
11989 SvREFCNT_dec_NN(result_string);
11990 SvREFCNT_dec_NN(stack);
11992 nextchar(pRExC_state);
11993 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
11998 /* The names of properties whose definitions are not known at compile time are
11999 * stored in this SV, after a constant heading. So if the length has been
12000 * changed since initialization, then there is a run-time definition. */
12001 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12004 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12005 const bool stop_at_1, /* Just parse the next thing, don't
12006 look for a full character class */
12007 bool allow_multi_folds,
12008 const bool silence_non_portable, /* Don't output warnings
12011 SV** ret_invlist) /* Return an inversion list, not a node */
12013 /* parse a bracketed class specification. Most of these will produce an
12014 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12015 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
12016 * under /i with multi-character folds: it will be rewritten following the
12017 * paradigm of this example, where the <multi-fold>s are characters which
12018 * fold to multiple character sequences:
12019 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12020 * gets effectively rewritten as:
12021 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12022 * reg() gets called (recursively) on the rewritten version, and this
12023 * function will return what it constructs. (Actually the <multi-fold>s
12024 * aren't physically removed from the [abcdefghi], it's just that they are
12025 * ignored in the recursion by means of a flag:
12026 * <RExC_in_multi_char_class>.)
12028 * ANYOF nodes contain a bit map for the first 256 characters, with the
12029 * corresponding bit set if that character is in the list. For characters
12030 * above 255, a range list or swash is used. There are extra bits for \w,
12031 * etc. in locale ANYOFs, as what these match is not determinable at
12034 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12035 * to be restarted. This can only happen if ret_invlist is non-NULL.
12039 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12041 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12044 IV namedclass = OOB_NAMEDCLASS;
12045 char *rangebegin = NULL;
12046 bool need_class = 0;
12048 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12049 than just initialized. */
12050 SV* properties = NULL; /* Code points that match \p{} \P{} */
12051 SV* posixes = NULL; /* Code points that match classes like, [:word:],
12052 extended beyond the Latin1 range */
12053 UV element_count = 0; /* Number of distinct elements in the class.
12054 Optimizations may be possible if this is tiny */
12055 AV * multi_char_matches = NULL; /* Code points that fold to more than one
12056 character; used under /i */
12058 char * stop_ptr = RExC_end; /* where to stop parsing */
12059 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12061 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12063 /* Unicode properties are stored in a swash; this holds the current one
12064 * being parsed. If this swash is the only above-latin1 component of the
12065 * character class, an optimization is to pass it directly on to the
12066 * execution engine. Otherwise, it is set to NULL to indicate that there
12067 * are other things in the class that have to be dealt with at execution
12069 SV* swash = NULL; /* Code points that match \p{} \P{} */
12071 /* Set if a component of this character class is user-defined; just passed
12072 * on to the engine */
12073 bool has_user_defined_property = FALSE;
12075 /* inversion list of code points this node matches only when the target
12076 * string is in UTF-8. (Because is under /d) */
12077 SV* depends_list = NULL;
12079 /* inversion list of code points this node matches. For much of the
12080 * function, it includes only those that match regardless of the utf8ness
12081 * of the target string */
12082 SV* cp_list = NULL;
12085 /* In a range, counts how many 0-2 of the ends of it came from literals,
12086 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
12087 UV literal_endpoint = 0;
12089 bool invert = FALSE; /* Is this class to be complemented */
12091 /* Is there any thing like \W or [:^digit:] that matches above the legal
12092 * Unicode range? */
12093 bool runtime_posix_matches_above_Unicode = FALSE;
12095 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12096 case we need to change the emitted regop to an EXACT. */
12097 const char * orig_parse = RExC_parse;
12098 const I32 orig_size = RExC_size;
12099 GET_RE_DEBUG_FLAGS_DECL;
12101 PERL_ARGS_ASSERT_REGCLASS;
12103 PERL_UNUSED_ARG(depth);
12106 DEBUG_PARSE("clas");
12108 /* Assume we are going to generate an ANYOF node. */
12109 ret = reganode(pRExC_state, ANYOF, 0);
12112 RExC_size += ANYOF_SKIP;
12113 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12116 ANYOF_FLAGS(ret) = 0;
12118 RExC_emit += ANYOF_SKIP;
12120 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12122 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12123 initial_listsv_len = SvCUR(listsv);
12124 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
12128 RExC_parse = regpatws(pRExC_state, RExC_parse,
12129 FALSE /* means don't recognize comments */);
12132 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12135 allow_multi_folds = FALSE;
12138 RExC_parse = regpatws(pRExC_state, RExC_parse,
12139 FALSE /* means don't recognize comments */);
12143 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12144 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12145 const char *s = RExC_parse;
12146 const char c = *s++;
12148 while (isWORDCHAR(*s))
12150 if (*s && c == *s && s[1] == ']') {
12151 SAVEFREESV(RExC_rx_sv);
12153 "POSIX syntax [%c %c] belongs inside character classes",
12155 (void)ReREFCNT_inc(RExC_rx_sv);
12159 /* If the caller wants us to just parse a single element, accomplish this
12160 * by faking the loop ending condition */
12161 if (stop_at_1 && RExC_end > RExC_parse) {
12162 stop_ptr = RExC_parse + 1;
12165 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12166 if (UCHARAT(RExC_parse) == ']')
12167 goto charclassloop;
12171 if (RExC_parse >= stop_ptr) {
12176 RExC_parse = regpatws(pRExC_state, RExC_parse,
12177 FALSE /* means don't recognize comments */);
12180 if (UCHARAT(RExC_parse) == ']') {
12186 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12187 save_value = value;
12188 save_prevvalue = prevvalue;
12191 rangebegin = RExC_parse;
12195 value = utf8n_to_uvchr((U8*)RExC_parse,
12196 RExC_end - RExC_parse,
12197 &numlen, UTF8_ALLOW_DEFAULT);
12198 RExC_parse += numlen;
12201 value = UCHARAT(RExC_parse++);
12204 && RExC_parse < RExC_end
12205 && POSIXCC(UCHARAT(RExC_parse)))
12207 namedclass = regpposixcc(pRExC_state, value, strict);
12209 else if (value == '\\') {
12211 value = utf8n_to_uvchr((U8*)RExC_parse,
12212 RExC_end - RExC_parse,
12213 &numlen, UTF8_ALLOW_DEFAULT);
12214 RExC_parse += numlen;
12217 value = UCHARAT(RExC_parse++);
12219 /* Some compilers cannot handle switching on 64-bit integer
12220 * values, therefore value cannot be an UV. Yes, this will
12221 * be a problem later if we want switch on Unicode.
12222 * A similar issue a little bit later when switching on
12223 * namedclass. --jhi */
12225 /* If the \ is escaping white space when white space is being
12226 * skipped, it means that that white space is wanted literally, and
12227 * is already in 'value'. Otherwise, need to translate the escape
12228 * into what it signifies. */
12229 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12231 case 'w': namedclass = ANYOF_WORDCHAR; break;
12232 case 'W': namedclass = ANYOF_NWORDCHAR; break;
12233 case 's': namedclass = ANYOF_SPACE; break;
12234 case 'S': namedclass = ANYOF_NSPACE; break;
12235 case 'd': namedclass = ANYOF_DIGIT; break;
12236 case 'D': namedclass = ANYOF_NDIGIT; break;
12237 case 'v': namedclass = ANYOF_VERTWS; break;
12238 case 'V': namedclass = ANYOF_NVERTWS; break;
12239 case 'h': namedclass = ANYOF_HORIZWS; break;
12240 case 'H': namedclass = ANYOF_NHORIZWS; break;
12241 case 'N': /* Handle \N{NAME} in class */
12243 /* We only pay attention to the first char of
12244 multichar strings being returned. I kinda wonder
12245 if this makes sense as it does change the behaviour
12246 from earlier versions, OTOH that behaviour was broken
12248 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12249 TRUE, /* => charclass */
12252 if (*flagp & RESTART_UTF8)
12253 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12263 /* We will handle any undefined properties ourselves */
12264 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12266 if (RExC_parse >= RExC_end)
12267 vFAIL2("Empty \\%c{}", (U8)value);
12268 if (*RExC_parse == '{') {
12269 const U8 c = (U8)value;
12270 e = strchr(RExC_parse++, '}');
12272 vFAIL2("Missing right brace on \\%c{}", c);
12273 while (isSPACE(UCHARAT(RExC_parse)))
12275 if (e == RExC_parse)
12276 vFAIL2("Empty \\%c{}", c);
12277 n = e - RExC_parse;
12278 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12289 if (UCHARAT(RExC_parse) == '^') {
12292 /* toggle. (The rhs xor gets the single bit that
12293 * differs between P and p; the other xor inverts just
12295 value ^= 'P' ^ 'p';
12297 while (isSPACE(UCHARAT(RExC_parse))) {
12302 /* Try to get the definition of the property into
12303 * <invlist>. If /i is in effect, the effective property
12304 * will have its name be <__NAME_i>. The design is
12305 * discussed in commit
12306 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12307 Newx(name, n + sizeof("_i__\n"), char);
12309 sprintf(name, "%s%.*s%s\n",
12310 (FOLD) ? "__" : "",
12316 /* Look up the property name, and get its swash and
12317 * inversion list, if the property is found */
12319 SvREFCNT_dec_NN(swash);
12321 swash = _core_swash_init("utf8", name, &PL_sv_undef,
12324 NULL, /* No inversion list */
12327 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12329 SvREFCNT_dec_NN(swash);
12333 /* Here didn't find it. It could be a user-defined
12334 * property that will be available at run-time. If we
12335 * accept only compile-time properties, is an error;
12336 * otherwise add it to the list for run-time look up */
12338 RExC_parse = e + 1;
12339 vFAIL3("Property '%.*s' is unknown", (int) n, name);
12341 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12342 (value == 'p' ? '+' : '!'),
12344 has_user_defined_property = TRUE;
12346 /* We don't know yet, so have to assume that the
12347 * property could match something in the Latin1 range,
12348 * hence something that isn't utf8. Note that this
12349 * would cause things in <depends_list> to match
12350 * inappropriately, except that any \p{}, including
12351 * this one forces Unicode semantics, which means there
12352 * is <no depends_list> */
12353 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12357 /* Here, did get the swash and its inversion list. If
12358 * the swash is from a user-defined property, then this
12359 * whole character class should be regarded as such */
12360 has_user_defined_property =
12362 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12364 /* Invert if asking for the complement */
12365 if (value == 'P') {
12366 _invlist_union_complement_2nd(properties,
12370 /* The swash can't be used as-is, because we've
12371 * inverted things; delay removing it to here after
12372 * have copied its invlist above */
12373 SvREFCNT_dec_NN(swash);
12377 _invlist_union(properties, invlist, &properties);
12382 RExC_parse = e + 1;
12383 namedclass = ANYOF_UNIPROP; /* no official name, but it's
12386 /* \p means they want Unicode semantics */
12387 RExC_uni_semantics = 1;
12390 case 'n': value = '\n'; break;
12391 case 'r': value = '\r'; break;
12392 case 't': value = '\t'; break;
12393 case 'f': value = '\f'; break;
12394 case 'b': value = '\b'; break;
12395 case 'e': value = ASCII_TO_NATIVE('\033');break;
12396 case 'a': value = ASCII_TO_NATIVE('\007');break;
12398 RExC_parse--; /* function expects to be pointed at the 'o' */
12400 const char* error_msg;
12401 bool valid = grok_bslash_o(&RExC_parse,
12404 SIZE_ONLY, /* warnings in pass
12407 silence_non_portable,
12413 if (PL_encoding && value < 0x100) {
12414 goto recode_encoding;
12418 RExC_parse--; /* function expects to be pointed at the 'x' */
12420 const char* error_msg;
12421 bool valid = grok_bslash_x(&RExC_parse,
12424 TRUE, /* Output warnings */
12426 silence_non_portable,
12432 if (PL_encoding && value < 0x100)
12433 goto recode_encoding;
12436 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12438 case '0': case '1': case '2': case '3': case '4':
12439 case '5': case '6': case '7':
12441 /* Take 1-3 octal digits */
12442 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12443 numlen = (strict) ? 4 : 3;
12444 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12445 RExC_parse += numlen;
12448 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12449 vFAIL("Need exactly 3 octal digits");
12451 else if (! SIZE_ONLY /* like \08, \178 */
12453 && RExC_parse < RExC_end
12454 && isDIGIT(*RExC_parse)
12455 && ckWARN(WARN_REGEXP))
12457 SAVEFREESV(RExC_rx_sv);
12458 reg_warn_non_literal_string(
12460 form_short_octal_warning(RExC_parse, numlen));
12461 (void)ReREFCNT_inc(RExC_rx_sv);
12464 if (PL_encoding && value < 0x100)
12465 goto recode_encoding;
12469 if (! RExC_override_recoding) {
12470 SV* enc = PL_encoding;
12471 value = reg_recode((const char)(U8)value, &enc);
12474 vFAIL("Invalid escape in the specified encoding");
12476 else if (SIZE_ONLY) {
12477 ckWARNreg(RExC_parse,
12478 "Invalid escape in the specified encoding");
12484 /* Allow \_ to not give an error */
12485 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12487 vFAIL2("Unrecognized escape \\%c in character class",
12491 SAVEFREESV(RExC_rx_sv);
12492 ckWARN2reg(RExC_parse,
12493 "Unrecognized escape \\%c in character class passed through",
12495 (void)ReREFCNT_inc(RExC_rx_sv);
12499 } /* End of switch on char following backslash */
12500 } /* end of handling backslash escape sequences */
12503 literal_endpoint++;
12506 /* Here, we have the current token in 'value' */
12508 /* What matches in a locale is not known until runtime. This includes
12509 * what the Posix classes (like \w, [:space:]) match. Room must be
12510 * reserved (one time per class) to store such classes, either if Perl
12511 * is compiled so that locale nodes always should have this space, or
12512 * if there is such class info to be stored. The space will contain a
12513 * bit for each named class that is to be matched against. This isn't
12514 * needed for \p{} and pseudo-classes, as they are not affected by
12515 * locale, and hence are dealt with separately */
12518 && (ANYOF_LOCALE == ANYOF_CLASS
12519 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12523 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12526 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12527 ANYOF_CLASS_ZERO(ret);
12529 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12532 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12534 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
12535 * literal, as is the character that began the false range, i.e.
12536 * the 'a' in the examples */
12539 const int w = (RExC_parse >= rangebegin)
12540 ? RExC_parse - rangebegin
12543 vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12546 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12547 ckWARN4reg(RExC_parse,
12548 "False [] range \"%*.*s\"",
12550 (void)ReREFCNT_inc(RExC_rx_sv);
12551 cp_list = add_cp_to_invlist(cp_list, '-');
12552 cp_list = add_cp_to_invlist(cp_list, prevvalue);
12556 range = 0; /* this was not a true range */
12557 element_count += 2; /* So counts for three values */
12561 U8 classnum = namedclass_to_classnum(namedclass);
12562 if (namedclass >= ANYOF_MAX) { /* If a special class */
12563 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12565 /* Here, should be \h, \H, \v, or \V. Neither /d nor
12566 * /l make a difference in what these match. There
12567 * would be problems if these characters had folds
12568 * other than themselves, as cp_list is subject to
12570 if (classnum != _CC_VERTSPACE) {
12571 assert( namedclass == ANYOF_HORIZWS
12572 || namedclass == ANYOF_NHORIZWS);
12574 /* It turns out that \h is just a synonym for
12576 classnum = _CC_BLANK;
12579 _invlist_union_maybe_complement_2nd(
12581 PL_XPosix_ptrs[classnum],
12582 cBOOL(namedclass % 2), /* Complement if odd
12583 (NHORIZWS, NVERTWS)
12588 else if (classnum == _CC_ASCII) {
12591 ANYOF_CLASS_SET(ret, namedclass);
12594 #endif /* Not isascii(); just use the hard-coded definition for it */
12595 _invlist_union_maybe_complement_2nd(
12598 cBOOL(namedclass % 2), /* Complement if odd
12602 else { /* Garden variety class */
12604 /* The ascii range inversion list */
12605 SV* ascii_source = PL_Posix_ptrs[classnum];
12607 /* The full Latin1 range inversion list */
12608 SV* l1_source = PL_L1Posix_ptrs[classnum];
12610 /* This code is structured into two major clauses. The
12611 * first is for classes whose complete definitions may not
12612 * already be known. It not, the Latin1 definition
12613 * (guaranteed to already known) is used plus code is
12614 * generated to load the rest at run-time (only if needed).
12615 * If the complete definition is known, it drops down to
12616 * the second clause, where the complete definition is
12619 if (classnum < _FIRST_NON_SWASH_CC) {
12621 /* Here, the class has a swash, which may or not
12622 * already be loaded */
12624 /* The name of the property to use to match the full
12625 * eXtended Unicode range swash for this character
12627 const char *Xname = swash_property_names[classnum];
12629 /* If returning the inversion list, we can't defer
12630 * getting this until runtime */
12631 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
12632 PL_utf8_swash_ptrs[classnum] =
12633 _core_swash_init("utf8", Xname, &PL_sv_undef,
12636 NULL, /* No inversion list */
12637 NULL /* No flags */
12639 assert(PL_utf8_swash_ptrs[classnum]);
12641 if ( ! PL_utf8_swash_ptrs[classnum]) {
12642 if (namedclass % 2 == 0) { /* A non-complemented
12644 /* If not /a matching, there are code points we
12645 * don't know at compile time. Arrange for the
12646 * unknown matches to be loaded at run-time, if
12648 if (! AT_LEAST_ASCII_RESTRICTED) {
12649 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12652 if (LOC) { /* Under locale, set run-time
12654 ANYOF_CLASS_SET(ret, namedclass);
12657 /* Add the current class's code points to
12658 * the running total */
12659 _invlist_union(posixes,
12660 (AT_LEAST_ASCII_RESTRICTED)
12666 else { /* A complemented class */
12667 if (AT_LEAST_ASCII_RESTRICTED) {
12668 /* Under /a should match everything above
12669 * ASCII, plus the complement of the set's
12671 _invlist_union_complement_2nd(posixes,
12676 /* Arrange for the unknown matches to be
12677 * loaded at run-time, if needed */
12678 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12680 runtime_posix_matches_above_Unicode = TRUE;
12682 ANYOF_CLASS_SET(ret, namedclass);
12686 /* We want to match everything in
12687 * Latin1, except those things that
12688 * l1_source matches */
12689 SV* scratch_list = NULL;
12690 _invlist_subtract(PL_Latin1, l1_source,
12693 /* Add the list from this class to the
12696 posixes = scratch_list;
12699 _invlist_union(posixes,
12702 SvREFCNT_dec_NN(scratch_list);
12704 if (DEPENDS_SEMANTICS) {
12706 |= ANYOF_NON_UTF8_LATIN1_ALL;
12711 goto namedclass_done;
12714 /* Here, there is a swash loaded for the class. If no
12715 * inversion list for it yet, get it */
12716 if (! PL_XPosix_ptrs[classnum]) {
12717 PL_XPosix_ptrs[classnum]
12718 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12722 /* Here there is an inversion list already loaded for the
12725 if (namedclass % 2 == 0) { /* A non-complemented class,
12726 like ANYOF_PUNCT */
12728 /* For non-locale, just add it to any existing list
12730 _invlist_union(posixes,
12731 (AT_LEAST_ASCII_RESTRICTED)
12733 : PL_XPosix_ptrs[classnum],
12736 else { /* Locale */
12737 SV* scratch_list = NULL;
12739 /* For above Latin1 code points, we use the full
12741 _invlist_intersection(PL_AboveLatin1,
12742 PL_XPosix_ptrs[classnum],
12744 /* And set the output to it, adding instead if
12745 * there already is an output. Checking if
12746 * 'posixes' is NULL first saves an extra clone.
12747 * Its reference count will be decremented at the
12748 * next union, etc, or if this is the only
12749 * instance, at the end of the routine */
12751 posixes = scratch_list;
12754 _invlist_union(posixes, scratch_list, &posixes);
12755 SvREFCNT_dec_NN(scratch_list);
12758 #ifndef HAS_ISBLANK
12759 if (namedclass != ANYOF_BLANK) {
12761 /* Set this class in the node for runtime
12763 ANYOF_CLASS_SET(ret, namedclass);
12764 #ifndef HAS_ISBLANK
12767 /* No isblank(), use the hard-coded ASCII-range
12768 * blanks, adding them to the running total. */
12770 _invlist_union(posixes, ascii_source, &posixes);
12775 else { /* A complemented class, like ANYOF_NPUNCT */
12777 _invlist_union_complement_2nd(
12779 (AT_LEAST_ASCII_RESTRICTED)
12781 : PL_XPosix_ptrs[classnum],
12783 /* Under /d, everything in the upper half of the
12784 * Latin1 range matches this complement */
12785 if (DEPENDS_SEMANTICS) {
12786 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12789 else { /* Locale */
12790 SV* scratch_list = NULL;
12791 _invlist_subtract(PL_AboveLatin1,
12792 PL_XPosix_ptrs[classnum],
12795 posixes = scratch_list;
12798 _invlist_union(posixes, scratch_list, &posixes);
12799 SvREFCNT_dec_NN(scratch_list);
12801 #ifndef HAS_ISBLANK
12802 if (namedclass != ANYOF_NBLANK) {
12804 ANYOF_CLASS_SET(ret, namedclass);
12805 #ifndef HAS_ISBLANK
12808 /* Get the list of all code points in Latin1
12809 * that are not ASCII blanks, and add them to
12810 * the running total */
12811 _invlist_subtract(PL_Latin1, ascii_source,
12813 _invlist_union(posixes, scratch_list, &posixes);
12814 SvREFCNT_dec_NN(scratch_list);
12821 continue; /* Go get next character */
12823 } /* end of namedclass \blah */
12825 /* Here, we have a single value. If 'range' is set, it is the ending
12826 * of a range--check its validity. Later, we will handle each
12827 * individual code point in the range. If 'range' isn't set, this
12828 * could be the beginning of a range, so check for that by looking
12829 * ahead to see if the next real character to be processed is the range
12830 * indicator--the minus sign */
12833 RExC_parse = regpatws(pRExC_state, RExC_parse,
12834 FALSE /* means don't recognize comments */);
12838 if (prevvalue > value) /* b-a */ {
12839 const int w = RExC_parse - rangebegin;
12840 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12841 range = 0; /* not a valid range */
12845 prevvalue = value; /* save the beginning of the potential range */
12846 if (! stop_at_1 /* Can't be a range if parsing just one thing */
12847 && *RExC_parse == '-')
12849 char* next_char_ptr = RExC_parse + 1;
12850 if (skip_white) { /* Get the next real char after the '-' */
12851 next_char_ptr = regpatws(pRExC_state,
12853 FALSE); /* means don't recognize
12857 /* If the '-' is at the end of the class (just before the ']',
12858 * it is a literal minus; otherwise it is a range */
12859 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12860 RExC_parse = next_char_ptr;
12862 /* a bad range like \w-, [:word:]- ? */
12863 if (namedclass > OOB_NAMEDCLASS) {
12864 if (strict || ckWARN(WARN_REGEXP)) {
12866 RExC_parse >= rangebegin ?
12867 RExC_parse - rangebegin : 0;
12869 vFAIL4("False [] range \"%*.*s\"",
12874 "False [] range \"%*.*s\"",
12879 cp_list = add_cp_to_invlist(cp_list, '-');
12883 range = 1; /* yeah, it's a range! */
12884 continue; /* but do it the next time */
12889 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12892 /* non-Latin1 code point implies unicode semantics. Must be set in
12893 * pass1 so is there for the whole of pass 2 */
12895 RExC_uni_semantics = 1;
12898 /* Ready to process either the single value, or the completed range.
12899 * For single-valued non-inverted ranges, we consider the possibility
12900 * of multi-char folds. (We made a conscious decision to not do this
12901 * for the other cases because it can often lead to non-intuitive
12902 * results. For example, you have the peculiar case that:
12903 * "s s" =~ /^[^\xDF]+$/i => Y
12904 * "ss" =~ /^[^\xDF]+$/i => N
12906 * See [perl #89750] */
12907 if (FOLD && allow_multi_folds && value == prevvalue) {
12908 if (value == LATIN_SMALL_LETTER_SHARP_S
12909 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12912 /* Here <value> is indeed a multi-char fold. Get what it is */
12914 U8 foldbuf[UTF8_MAXBYTES_CASE];
12917 UV folded = _to_uni_fold_flags(
12922 | ((LOC) ? FOLD_FLAGS_LOCALE
12923 : (ASCII_FOLD_RESTRICTED)
12924 ? FOLD_FLAGS_NOMIX_ASCII
12928 /* Here, <folded> should be the first character of the
12929 * multi-char fold of <value>, with <foldbuf> containing the
12930 * whole thing. But, if this fold is not allowed (because of
12931 * the flags), <fold> will be the same as <value>, and should
12932 * be processed like any other character, so skip the special
12934 if (folded != value) {
12936 /* Skip if we are recursed, currently parsing the class
12937 * again. Otherwise add this character to the list of
12938 * multi-char folds. */
12939 if (! RExC_in_multi_char_class) {
12940 AV** this_array_ptr;
12942 STRLEN cp_count = utf8_length(foldbuf,
12943 foldbuf + foldlen);
12944 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12946 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12949 if (! multi_char_matches) {
12950 multi_char_matches = newAV();
12953 /* <multi_char_matches> is actually an array of arrays.
12954 * There will be one or two top-level elements: [2],
12955 * and/or [3]. The [2] element is an array, each
12956 * element thereof is a character which folds to two
12957 * characters; likewise for [3]. (Unicode guarantees a
12958 * maximum of 3 characters in any fold.) When we
12959 * rewrite the character class below, we will do so
12960 * such that the longest folds are written first, so
12961 * that it prefers the longest matching strings first.
12962 * This is done even if it turns out that any
12963 * quantifier is non-greedy, out of programmer
12964 * laziness. Tom Christiansen has agreed that this is
12965 * ok. This makes the test for the ligature 'ffi' come
12966 * before the test for 'ff' */
12967 if (av_exists(multi_char_matches, cp_count)) {
12968 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12970 this_array = *this_array_ptr;
12973 this_array = newAV();
12974 av_store(multi_char_matches, cp_count,
12977 av_push(this_array, multi_fold);
12980 /* This element should not be processed further in this
12983 value = save_value;
12984 prevvalue = save_prevvalue;
12990 /* Deal with this element of the class */
12993 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12995 SV* this_range = _new_invlist(1);
12996 _append_range_to_invlist(this_range, prevvalue, value);
12998 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12999 * If this range was specified using something like 'i-j', we want
13000 * to include only the 'i' and the 'j', and not anything in
13001 * between, so exclude non-ASCII, non-alphabetics from it.
13002 * However, if the range was specified with something like
13003 * [\x89-\x91] or [\x89-j], all code points within it should be
13004 * included. literal_endpoint==2 means both ends of the range used
13005 * a literal character, not \x{foo} */
13006 if (literal_endpoint == 2
13007 && (prevvalue >= 'a' && value <= 'z')
13008 || (prevvalue >= 'A' && value <= 'Z'))
13010 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13013 _invlist_union(cp_list, this_range, &cp_list);
13014 literal_endpoint = 0;
13018 range = 0; /* this range (if it was one) is done now */
13019 } /* End of loop through all the text within the brackets */
13021 /* If anything in the class expands to more than one character, we have to
13022 * deal with them by building up a substitute parse string, and recursively
13023 * calling reg() on it, instead of proceeding */
13024 if (multi_char_matches) {
13025 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13028 char *save_end = RExC_end;
13029 char *save_parse = RExC_parse;
13030 bool first_time = TRUE; /* First multi-char occurrence doesn't get
13035 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
13036 because too confusing */
13038 sv_catpv(substitute_parse, "(?:");
13042 /* Look at the longest folds first */
13043 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13045 if (av_exists(multi_char_matches, cp_count)) {
13046 AV** this_array_ptr;
13049 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13051 while ((this_sequence = av_pop(*this_array_ptr)) !=
13054 if (! first_time) {
13055 sv_catpv(substitute_parse, "|");
13057 first_time = FALSE;
13059 sv_catpv(substitute_parse, SvPVX(this_sequence));
13064 /* If the character class contains anything else besides these
13065 * multi-character folds, have to include it in recursive parsing */
13066 if (element_count) {
13067 sv_catpv(substitute_parse, "|[");
13068 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13069 sv_catpv(substitute_parse, "]");
13072 sv_catpv(substitute_parse, ")");
13075 /* This is a way to get the parse to skip forward a whole named
13076 * sequence instead of matching the 2nd character when it fails the
13078 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13082 RExC_parse = SvPV(substitute_parse, len);
13083 RExC_end = RExC_parse + len;
13084 RExC_in_multi_char_class = 1;
13085 RExC_emit = (regnode *)orig_emit;
13087 ret = reg(pRExC_state, 1, ®_flags, depth+1);
13089 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13091 RExC_parse = save_parse;
13092 RExC_end = save_end;
13093 RExC_in_multi_char_class = 0;
13094 SvREFCNT_dec_NN(multi_char_matches);
13098 /* If the character class contains only a single element, it may be
13099 * optimizable into another node type which is smaller and runs faster.
13100 * Check if this is the case for this class */
13101 if (element_count == 1 && ! ret_invlist) {
13105 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13106 [:digit:] or \p{foo} */
13108 /* All named classes are mapped into POSIXish nodes, with its FLAG
13109 * argument giving which class it is */
13110 switch ((I32)namedclass) {
13111 case ANYOF_UNIPROP:
13114 /* These don't depend on the charset modifiers. They always
13115 * match under /u rules */
13116 case ANYOF_NHORIZWS:
13117 case ANYOF_HORIZWS:
13118 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13121 case ANYOF_NVERTWS:
13126 /* The actual POSIXish node for all the rest depends on the
13127 * charset modifier. The ones in the first set depend only on
13128 * ASCII or, if available on this platform, locale */
13132 op = (LOC) ? POSIXL : POSIXA;
13143 /* under /a could be alpha */
13145 if (ASCII_RESTRICTED) {
13146 namedclass = ANYOF_ALPHA + (namedclass % 2);
13154 /* The rest have more possibilities depending on the charset.
13155 * We take advantage of the enum ordering of the charset
13156 * modifiers to get the exact node type, */
13158 op = POSIXD + get_regex_charset(RExC_flags);
13159 if (op > POSIXA) { /* /aa is same as /a */
13162 #ifndef HAS_ISBLANK
13164 && (namedclass == ANYOF_BLANK
13165 || namedclass == ANYOF_NBLANK))
13172 /* The odd numbered ones are the complements of the
13173 * next-lower even number one */
13174 if (namedclass % 2 == 1) {
13178 arg = namedclass_to_classnum(namedclass);
13182 else if (value == prevvalue) {
13184 /* Here, the class consists of just a single code point */
13187 if (! LOC && value == '\n') {
13188 op = REG_ANY; /* Optimize [^\n] */
13189 *flagp |= HASWIDTH|SIMPLE;
13193 else if (value < 256 || UTF) {
13195 /* Optimize a single value into an EXACTish node, but not if it
13196 * would require converting the pattern to UTF-8. */
13197 op = compute_EXACTish(pRExC_state);
13199 } /* Otherwise is a range */
13200 else if (! LOC) { /* locale could vary these */
13201 if (prevvalue == '0') {
13202 if (value == '9') {
13209 /* Here, we have changed <op> away from its initial value iff we found
13210 * an optimization */
13213 /* Throw away this ANYOF regnode, and emit the calculated one,
13214 * which should correspond to the beginning, not current, state of
13216 const char * cur_parse = RExC_parse;
13217 RExC_parse = (char *)orig_parse;
13221 /* To get locale nodes to not use the full ANYOF size would
13222 * require moving the code above that writes the portions
13223 * of it that aren't in other nodes to after this point.
13224 * e.g. ANYOF_CLASS_SET */
13225 RExC_size = orig_size;
13229 RExC_emit = (regnode *)orig_emit;
13230 if (PL_regkind[op] == POSIXD) {
13232 op += NPOSIXD - POSIXD;
13237 ret = reg_node(pRExC_state, op);
13239 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13243 *flagp |= HASWIDTH|SIMPLE;
13245 else if (PL_regkind[op] == EXACT) {
13246 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13249 RExC_parse = (char *) cur_parse;
13251 SvREFCNT_dec(posixes);
13252 SvREFCNT_dec(cp_list);
13259 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13261 /* If folding, we calculate all characters that could fold to or from the
13262 * ones already on the list */
13263 if (FOLD && cp_list) {
13264 UV start, end; /* End points of code point ranges */
13266 SV* fold_intersection = NULL;
13268 /* If the highest code point is within Latin1, we can use the
13269 * compiled-in Alphas list, and not have to go out to disk. This
13270 * yields two false positives, the masculine and feminine ordinal
13271 * indicators, which are weeded out below using the
13272 * IS_IN_SOME_FOLD_L1() macro */
13273 if (invlist_highest(cp_list) < 256) {
13274 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13275 &fold_intersection);
13279 /* Here, there are non-Latin1 code points, so we will have to go
13280 * fetch the list of all the characters that participate in folds
13282 if (! PL_utf8_foldable) {
13283 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13284 &PL_sv_undef, 1, 0);
13285 PL_utf8_foldable = _get_swash_invlist(swash);
13286 SvREFCNT_dec_NN(swash);
13289 /* This is a hash that for a particular fold gives all characters
13290 * that are involved in it */
13291 if (! PL_utf8_foldclosures) {
13293 /* If we were unable to find any folds, then we likely won't be
13294 * able to find the closures. So just create an empty list.
13295 * Folding will effectively be restricted to the non-Unicode
13296 * rules hard-coded into Perl. (This case happens legitimately
13297 * during compilation of Perl itself before the Unicode tables
13298 * are generated) */
13299 if (_invlist_len(PL_utf8_foldable) == 0) {
13300 PL_utf8_foldclosures = newHV();
13303 /* If the folds haven't been read in, call a fold function
13305 if (! PL_utf8_tofold) {
13306 U8 dummy[UTF8_MAXBYTES+1];
13308 /* This string is just a short named one above \xff */
13309 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13310 assert(PL_utf8_tofold); /* Verify that worked */
13312 PL_utf8_foldclosures =
13313 _swash_inversion_hash(PL_utf8_tofold);
13317 /* Only the characters in this class that participate in folds need
13318 * be checked. Get the intersection of this class and all the
13319 * possible characters that are foldable. This can quickly narrow
13320 * down a large class */
13321 _invlist_intersection(PL_utf8_foldable, cp_list,
13322 &fold_intersection);
13325 /* Now look at the foldable characters in this class individually */
13326 invlist_iterinit(fold_intersection);
13327 while (invlist_iternext(fold_intersection, &start, &end)) {
13330 /* Locale folding for Latin1 characters is deferred until runtime */
13331 if (LOC && start < 256) {
13335 /* Look at every character in the range */
13336 for (j = start; j <= end; j++) {
13338 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13344 /* We have the latin1 folding rules hard-coded here so that
13345 * an innocent-looking character class, like /[ks]/i won't
13346 * have to go out to disk to find the possible matches.
13347 * XXX It would be better to generate these via regen, in
13348 * case a new version of the Unicode standard adds new
13349 * mappings, though that is not really likely, and may be
13350 * caught by the default: case of the switch below. */
13352 if (IS_IN_SOME_FOLD_L1(j)) {
13354 /* ASCII is always matched; non-ASCII is matched only
13355 * under Unicode rules */
13356 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13358 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13362 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13366 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13367 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13369 /* Certain Latin1 characters have matches outside
13370 * Latin1. To get here, <j> is one of those
13371 * characters. None of these matches is valid for
13372 * ASCII characters under /aa, which is why the 'if'
13373 * just above excludes those. These matches only
13374 * happen when the target string is utf8. The code
13375 * below adds the single fold closures for <j> to the
13376 * inversion list. */
13381 add_cp_to_invlist(cp_list, KELVIN_SIGN);
13385 cp_list = add_cp_to_invlist(cp_list,
13386 LATIN_SMALL_LETTER_LONG_S);
13389 cp_list = add_cp_to_invlist(cp_list,
13390 GREEK_CAPITAL_LETTER_MU);
13391 cp_list = add_cp_to_invlist(cp_list,
13392 GREEK_SMALL_LETTER_MU);
13394 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13395 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13397 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13399 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13400 cp_list = add_cp_to_invlist(cp_list,
13401 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13403 case LATIN_SMALL_LETTER_SHARP_S:
13404 cp_list = add_cp_to_invlist(cp_list,
13405 LATIN_CAPITAL_LETTER_SHARP_S);
13407 case 'F': case 'f':
13408 case 'I': case 'i':
13409 case 'L': case 'l':
13410 case 'T': case 't':
13411 case 'A': case 'a':
13412 case 'H': case 'h':
13413 case 'J': case 'j':
13414 case 'N': case 'n':
13415 case 'W': case 'w':
13416 case 'Y': case 'y':
13417 /* These all are targets of multi-character
13418 * folds from code points that require UTF8 to
13419 * express, so they can't match unless the
13420 * target string is in UTF-8, so no action here
13421 * is necessary, as regexec.c properly handles
13422 * the general case for UTF-8 matching and
13423 * multi-char folds */
13426 /* Use deprecated warning to increase the
13427 * chances of this being output */
13428 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13435 /* Here is an above Latin1 character. We don't have the rules
13436 * hard-coded for it. First, get its fold. This is the simple
13437 * fold, as the multi-character folds have been handled earlier
13438 * and separated out */
13439 _to_uni_fold_flags(j, foldbuf, &foldlen,
13441 ? FOLD_FLAGS_LOCALE
13442 : (ASCII_FOLD_RESTRICTED)
13443 ? FOLD_FLAGS_NOMIX_ASCII
13446 /* Single character fold of above Latin1. Add everything in
13447 * its fold closure to the list that this node should match.
13448 * The fold closures data structure is a hash with the keys
13449 * being the UTF-8 of every character that is folded to, like
13450 * 'k', and the values each an array of all code points that
13451 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
13452 * Multi-character folds are not included */
13453 if ((listp = hv_fetch(PL_utf8_foldclosures,
13454 (char *) foldbuf, foldlen, FALSE)))
13456 AV* list = (AV*) *listp;
13458 for (k = 0; k <= av_len(list); k++) {
13459 SV** c_p = av_fetch(list, k, FALSE);
13462 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13466 /* /aa doesn't allow folds between ASCII and non-; /l
13467 * doesn't allow them between above and below 256 */
13468 if ((ASCII_FOLD_RESTRICTED
13469 && (isASCII(c) != isASCII(j)))
13470 || (LOC && ((c < 256) != (j < 256))))
13475 /* Folds involving non-ascii Latin1 characters
13476 * under /d are added to a separate list */
13477 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13479 cp_list = add_cp_to_invlist(cp_list, c);
13482 depends_list = add_cp_to_invlist(depends_list, c);
13488 SvREFCNT_dec_NN(fold_intersection);
13491 /* And combine the result (if any) with any inversion list from posix
13492 * classes. The lists are kept separate up to now because we don't want to
13493 * fold the classes (folding of those is automatically handled by the swash
13494 * fetching code) */
13496 if (! DEPENDS_SEMANTICS) {
13498 _invlist_union(cp_list, posixes, &cp_list);
13499 SvREFCNT_dec_NN(posixes);
13506 /* Under /d, we put into a separate list the Latin1 things that
13507 * match only when the target string is utf8 */
13508 SV* nonascii_but_latin1_properties = NULL;
13509 _invlist_intersection(posixes, PL_Latin1,
13510 &nonascii_but_latin1_properties);
13511 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13512 &nonascii_but_latin1_properties);
13513 _invlist_subtract(posixes, nonascii_but_latin1_properties,
13516 _invlist_union(cp_list, posixes, &cp_list);
13517 SvREFCNT_dec_NN(posixes);
13523 if (depends_list) {
13524 _invlist_union(depends_list, nonascii_but_latin1_properties,
13526 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13529 depends_list = nonascii_but_latin1_properties;
13534 /* And combine the result (if any) with any inversion list from properties.
13535 * The lists are kept separate up to now so that we can distinguish the two
13536 * in regards to matching above-Unicode. A run-time warning is generated
13537 * if a Unicode property is matched against a non-Unicode code point. But,
13538 * we allow user-defined properties to match anything, without any warning,
13539 * and we also suppress the warning if there is a portion of the character
13540 * class that isn't a Unicode property, and which matches above Unicode, \W
13541 * or [\x{110000}] for example.
13542 * (Note that in this case, unlike the Posix one above, there is no
13543 * <depends_list>, because having a Unicode property forces Unicode
13546 bool warn_super = ! has_user_defined_property;
13549 /* If it matters to the final outcome, see if a non-property
13550 * component of the class matches above Unicode. If so, the
13551 * warning gets suppressed. This is true even if just a single
13552 * such code point is specified, as though not strictly correct if
13553 * another such code point is matched against, the fact that they
13554 * are using above-Unicode code points indicates they should know
13555 * the issues involved */
13557 bool non_prop_matches_above_Unicode =
13558 runtime_posix_matches_above_Unicode
13559 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13561 non_prop_matches_above_Unicode =
13562 ! non_prop_matches_above_Unicode;
13564 warn_super = ! non_prop_matches_above_Unicode;
13567 _invlist_union(properties, cp_list, &cp_list);
13568 SvREFCNT_dec_NN(properties);
13571 cp_list = properties;
13575 OP(ret) = ANYOF_WARN_SUPER;
13579 /* Here, we have calculated what code points should be in the character
13582 * Now we can see about various optimizations. Fold calculation (which we
13583 * did above) needs to take place before inversion. Otherwise /[^k]/i
13584 * would invert to include K, which under /i would match k, which it
13585 * shouldn't. Therefore we can't invert folded locale now, as it won't be
13586 * folded until runtime */
13588 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13589 * at compile time. Besides not inverting folded locale now, we can't
13590 * invert if there are things such as \w, which aren't known until runtime
13593 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13595 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13597 _invlist_invert(cp_list);
13599 /* Any swash can't be used as-is, because we've inverted things */
13601 SvREFCNT_dec_NN(swash);
13605 /* Clear the invert flag since have just done it here */
13610 *ret_invlist = cp_list;
13612 /* Discard the generated node */
13614 RExC_size = orig_size;
13617 RExC_emit = orig_emit;
13622 /* If we didn't do folding, it's because some information isn't available
13623 * until runtime; set the run-time fold flag for these. (We don't have to
13624 * worry about properties folding, as that is taken care of by the swash
13628 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13631 /* Some character classes are equivalent to other nodes. Such nodes take
13632 * up less room and generally fewer operations to execute than ANYOF nodes.
13633 * Above, we checked for and optimized into some such equivalents for
13634 * certain common classes that are easy to test. Getting to this point in
13635 * the code means that the class didn't get optimized there. Since this
13636 * code is only executed in Pass 2, it is too late to save space--it has
13637 * been allocated in Pass 1, and currently isn't given back. But turning
13638 * things into an EXACTish node can allow the optimizer to join it to any
13639 * adjacent such nodes. And if the class is equivalent to things like /./,
13640 * expensive run-time swashes can be avoided. Now that we have more
13641 * complete information, we can find things necessarily missed by the
13642 * earlier code. I (khw) am not sure how much to look for here. It would
13643 * be easy, but perhaps too slow, to check any candidates against all the
13644 * node types they could possibly match using _invlistEQ(). */
13649 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13650 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13653 U8 op = END; /* The optimzation node-type */
13654 const char * cur_parse= RExC_parse;
13656 invlist_iterinit(cp_list);
13657 if (! invlist_iternext(cp_list, &start, &end)) {
13659 /* Here, the list is empty. This happens, for example, when a
13660 * Unicode property is the only thing in the character class, and
13661 * it doesn't match anything. (perluniprops.pod notes such
13664 *flagp |= HASWIDTH|SIMPLE;
13666 else if (start == end) { /* The range is a single code point */
13667 if (! invlist_iternext(cp_list, &start, &end)
13669 /* Don't do this optimization if it would require changing
13670 * the pattern to UTF-8 */
13671 && (start < 256 || UTF))
13673 /* Here, the list contains a single code point. Can optimize
13674 * into an EXACT node */
13683 /* A locale node under folding with one code point can be
13684 * an EXACTFL, as its fold won't be calculated until
13690 /* Here, we are generally folding, but there is only one
13691 * code point to match. If we have to, we use an EXACT
13692 * node, but it would be better for joining with adjacent
13693 * nodes in the optimization pass if we used the same
13694 * EXACTFish node that any such are likely to be. We can
13695 * do this iff the code point doesn't participate in any
13696 * folds. For example, an EXACTF of a colon is the same as
13697 * an EXACT one, since nothing folds to or from a colon. */
13699 if (IS_IN_SOME_FOLD_L1(value)) {
13704 if (! PL_utf8_foldable) {
13705 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13706 &PL_sv_undef, 1, 0);
13707 PL_utf8_foldable = _get_swash_invlist(swash);
13708 SvREFCNT_dec_NN(swash);
13710 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13715 /* If we haven't found the node type, above, it means we
13716 * can use the prevailing one */
13718 op = compute_EXACTish(pRExC_state);
13723 else if (start == 0) {
13724 if (end == UV_MAX) {
13726 *flagp |= HASWIDTH|SIMPLE;
13729 else if (end == '\n' - 1
13730 && invlist_iternext(cp_list, &start, &end)
13731 && start == '\n' + 1 && end == UV_MAX)
13734 *flagp |= HASWIDTH|SIMPLE;
13738 invlist_iterfinish(cp_list);
13741 RExC_parse = (char *)orig_parse;
13742 RExC_emit = (regnode *)orig_emit;
13744 ret = reg_node(pRExC_state, op);
13746 RExC_parse = (char *)cur_parse;
13748 if (PL_regkind[op] == EXACT) {
13749 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13752 SvREFCNT_dec_NN(cp_list);
13757 /* Here, <cp_list> contains all the code points we can determine at
13758 * compile time that match under all conditions. Go through it, and
13759 * for things that belong in the bitmap, put them there, and delete from
13760 * <cp_list>. While we are at it, see if everything above 255 is in the
13761 * list, and if so, set a flag to speed up execution */
13762 ANYOF_BITMAP_ZERO(ret);
13765 /* This gets set if we actually need to modify things */
13766 bool change_invlist = FALSE;
13770 /* Start looking through <cp_list> */
13771 invlist_iterinit(cp_list);
13772 while (invlist_iternext(cp_list, &start, &end)) {
13776 if (end == UV_MAX && start <= 256) {
13777 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13780 /* Quit if are above what we should change */
13785 change_invlist = TRUE;
13787 /* Set all the bits in the range, up to the max that we are doing */
13788 high = (end < 255) ? end : 255;
13789 for (i = start; i <= (int) high; i++) {
13790 if (! ANYOF_BITMAP_TEST(ret, i)) {
13791 ANYOF_BITMAP_SET(ret, i);
13797 invlist_iterfinish(cp_list);
13799 /* Done with loop; remove any code points that are in the bitmap from
13801 if (change_invlist) {
13802 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13805 /* If have completely emptied it, remove it completely */
13806 if (_invlist_len(cp_list) == 0) {
13807 SvREFCNT_dec_NN(cp_list);
13813 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13816 /* Here, the bitmap has been populated with all the Latin1 code points that
13817 * always match. Can now add to the overall list those that match only
13818 * when the target string is UTF-8 (<depends_list>). */
13819 if (depends_list) {
13821 _invlist_union(cp_list, depends_list, &cp_list);
13822 SvREFCNT_dec_NN(depends_list);
13825 cp_list = depends_list;
13829 /* If there is a swash and more than one element, we can't use the swash in
13830 * the optimization below. */
13831 if (swash && element_count > 1) {
13832 SvREFCNT_dec_NN(swash);
13837 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13839 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13842 /* av[0] stores the character class description in its textual form:
13843 * used later (regexec.c:Perl_regclass_swash()) to initialize the
13844 * appropriate swash, and is also useful for dumping the regnode.
13845 * av[1] if NULL, is a placeholder to later contain the swash computed
13846 * from av[0]. But if no further computation need be done, the
13847 * swash is stored there now.
13848 * av[2] stores the cp_list inversion list for use in addition or
13849 * instead of av[0]; used only if av[1] is NULL
13850 * av[3] is set if any component of the class is from a user-defined
13851 * property; used only if av[1] is NULL */
13852 AV * const av = newAV();
13855 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13856 ? SvREFCNT_inc(listsv) : &PL_sv_undef);
13858 av_store(av, 1, swash);
13859 SvREFCNT_dec_NN(cp_list);
13862 av_store(av, 1, NULL);
13864 av_store(av, 2, cp_list);
13865 av_store(av, 3, newSVuv(has_user_defined_property));
13869 rv = newRV_noinc(MUTABLE_SV(av));
13870 n = add_data(pRExC_state, 1, "s");
13871 RExC_rxi->data->data[n] = (void*)rv;
13875 *flagp |= HASWIDTH|SIMPLE;
13878 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13881 /* reg_skipcomment()
13883 Absorbs an /x style # comments from the input stream.
13884 Returns true if there is more text remaining in the stream.
13885 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13886 terminates the pattern without including a newline.
13888 Note its the callers responsibility to ensure that we are
13889 actually in /x mode
13894 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13898 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13900 while (RExC_parse < RExC_end)
13901 if (*RExC_parse++ == '\n') {
13906 /* we ran off the end of the pattern without ending
13907 the comment, so we have to add an \n when wrapping */
13908 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13916 Advances the parse position, and optionally absorbs
13917 "whitespace" from the inputstream.
13919 Without /x "whitespace" means (?#...) style comments only,
13920 with /x this means (?#...) and # comments and whitespace proper.
13922 Returns the RExC_parse point from BEFORE the scan occurs.
13924 This is the /x friendly way of saying RExC_parse++.
13928 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13930 char* const retval = RExC_parse++;
13932 PERL_ARGS_ASSERT_NEXTCHAR;
13935 if (RExC_end - RExC_parse >= 3
13936 && *RExC_parse == '('
13937 && RExC_parse[1] == '?'
13938 && RExC_parse[2] == '#')
13940 while (*RExC_parse != ')') {
13941 if (RExC_parse == RExC_end)
13942 FAIL("Sequence (?#... not terminated");
13948 if (RExC_flags & RXf_PMf_EXTENDED) {
13949 if (isSPACE(*RExC_parse)) {
13953 else if (*RExC_parse == '#') {
13954 if ( reg_skipcomment( pRExC_state ) )
13963 - reg_node - emit a node
13965 STATIC regnode * /* Location. */
13966 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13970 regnode * const ret = RExC_emit;
13971 GET_RE_DEBUG_FLAGS_DECL;
13973 PERL_ARGS_ASSERT_REG_NODE;
13976 SIZE_ALIGN(RExC_size);
13980 if (RExC_emit >= RExC_emit_bound)
13981 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13982 op, RExC_emit, RExC_emit_bound);
13984 NODE_ALIGN_FILL(ret);
13986 FILL_ADVANCE_NODE(ptr, op);
13987 #ifdef RE_TRACK_PATTERN_OFFSETS
13988 if (RExC_offsets) { /* MJD */
13989 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13990 "reg_node", __LINE__,
13992 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13993 ? "Overwriting end of array!\n" : "OK",
13994 (UV)(RExC_emit - RExC_emit_start),
13995 (UV)(RExC_parse - RExC_start),
13996 (UV)RExC_offsets[0]));
13997 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14005 - reganode - emit a node with an argument
14007 STATIC regnode * /* Location. */
14008 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14012 regnode * const ret = RExC_emit;
14013 GET_RE_DEBUG_FLAGS_DECL;
14015 PERL_ARGS_ASSERT_REGANODE;
14018 SIZE_ALIGN(RExC_size);
14023 assert(2==regarglen[op]+1);
14025 Anything larger than this has to allocate the extra amount.
14026 If we changed this to be:
14028 RExC_size += (1 + regarglen[op]);
14030 then it wouldn't matter. Its not clear what side effect
14031 might come from that so its not done so far.
14036 if (RExC_emit >= RExC_emit_bound)
14037 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14038 op, RExC_emit, RExC_emit_bound);
14040 NODE_ALIGN_FILL(ret);
14042 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14043 #ifdef RE_TRACK_PATTERN_OFFSETS
14044 if (RExC_offsets) { /* MJD */
14045 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14049 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14050 "Overwriting end of array!\n" : "OK",
14051 (UV)(RExC_emit - RExC_emit_start),
14052 (UV)(RExC_parse - RExC_start),
14053 (UV)RExC_offsets[0]));
14054 Set_Cur_Node_Offset;
14062 - reguni - emit (if appropriate) a Unicode character
14065 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14069 PERL_ARGS_ASSERT_REGUNI;
14071 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14075 - reginsert - insert an operator in front of already-emitted operand
14077 * Means relocating the operand.
14080 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14086 const int offset = regarglen[(U8)op];
14087 const int size = NODE_STEP_REGNODE + offset;
14088 GET_RE_DEBUG_FLAGS_DECL;
14090 PERL_ARGS_ASSERT_REGINSERT;
14091 PERL_UNUSED_ARG(depth);
14092 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14093 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14102 if (RExC_open_parens) {
14104 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14105 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14106 if ( RExC_open_parens[paren] >= opnd ) {
14107 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14108 RExC_open_parens[paren] += size;
14110 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14112 if ( RExC_close_parens[paren] >= opnd ) {
14113 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14114 RExC_close_parens[paren] += size;
14116 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14121 while (src > opnd) {
14122 StructCopy(--src, --dst, regnode);
14123 #ifdef RE_TRACK_PATTERN_OFFSETS
14124 if (RExC_offsets) { /* MJD 20010112 */
14125 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14129 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14130 ? "Overwriting end of array!\n" : "OK",
14131 (UV)(src - RExC_emit_start),
14132 (UV)(dst - RExC_emit_start),
14133 (UV)RExC_offsets[0]));
14134 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14135 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14141 place = opnd; /* Op node, where operand used to be. */
14142 #ifdef RE_TRACK_PATTERN_OFFSETS
14143 if (RExC_offsets) { /* MJD */
14144 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14148 (UV)(place - RExC_emit_start) > RExC_offsets[0]
14149 ? "Overwriting end of array!\n" : "OK",
14150 (UV)(place - RExC_emit_start),
14151 (UV)(RExC_parse - RExC_start),
14152 (UV)RExC_offsets[0]));
14153 Set_Node_Offset(place, RExC_parse);
14154 Set_Node_Length(place, 1);
14157 src = NEXTOPER(place);
14158 FILL_ADVANCE_NODE(place, op);
14159 Zero(src, offset, regnode);
14163 - regtail - set the next-pointer at the end of a node chain of p to val.
14164 - SEE ALSO: regtail_study
14166 /* TODO: All three parms should be const */
14168 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14172 GET_RE_DEBUG_FLAGS_DECL;
14174 PERL_ARGS_ASSERT_REGTAIL;
14176 PERL_UNUSED_ARG(depth);
14182 /* Find last node. */
14185 regnode * const temp = regnext(scan);
14187 SV * const mysv=sv_newmortal();
14188 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14189 regprop(RExC_rx, mysv, scan);
14190 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14191 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14192 (temp == NULL ? "->" : ""),
14193 (temp == NULL ? PL_reg_name[OP(val)] : "")
14201 if (reg_off_by_arg[OP(scan)]) {
14202 ARG_SET(scan, val - scan);
14205 NEXT_OFF(scan) = val - scan;
14211 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14212 - Look for optimizable sequences at the same time.
14213 - currently only looks for EXACT chains.
14215 This is experimental code. The idea is to use this routine to perform
14216 in place optimizations on branches and groups as they are constructed,
14217 with the long term intention of removing optimization from study_chunk so
14218 that it is purely analytical.
14220 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14221 to control which is which.
14224 /* TODO: All four parms should be const */
14227 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14232 #ifdef EXPERIMENTAL_INPLACESCAN
14235 GET_RE_DEBUG_FLAGS_DECL;
14237 PERL_ARGS_ASSERT_REGTAIL_STUDY;
14243 /* Find last node. */
14247 regnode * const temp = regnext(scan);
14248 #ifdef EXPERIMENTAL_INPLACESCAN
14249 if (PL_regkind[OP(scan)] == EXACT) {
14250 bool has_exactf_sharp_s; /* Unexamined in this routine */
14251 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14256 switch (OP(scan)) {
14262 case EXACTFU_TRICKYFOLD:
14264 if( exact == PSEUDO )
14266 else if ( exact != OP(scan) )
14275 SV * const mysv=sv_newmortal();
14276 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14277 regprop(RExC_rx, mysv, scan);
14278 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14279 SvPV_nolen_const(mysv),
14280 REG_NODE_NUM(scan),
14281 PL_reg_name[exact]);
14288 SV * const mysv_val=sv_newmortal();
14289 DEBUG_PARSE_MSG("");
14290 regprop(RExC_rx, mysv_val, val);
14291 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14292 SvPV_nolen_const(mysv_val),
14293 (IV)REG_NODE_NUM(val),
14297 if (reg_off_by_arg[OP(scan)]) {
14298 ARG_SET(scan, val - scan);
14301 NEXT_OFF(scan) = val - scan;
14309 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14313 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14319 for (bit=0; bit<32; bit++) {
14320 if (flags & (1<<bit)) {
14321 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14324 if (!set++ && lead)
14325 PerlIO_printf(Perl_debug_log, "%s",lead);
14326 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14329 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14330 if (!set++ && lead) {
14331 PerlIO_printf(Perl_debug_log, "%s",lead);
14334 case REGEX_UNICODE_CHARSET:
14335 PerlIO_printf(Perl_debug_log, "UNICODE");
14337 case REGEX_LOCALE_CHARSET:
14338 PerlIO_printf(Perl_debug_log, "LOCALE");
14340 case REGEX_ASCII_RESTRICTED_CHARSET:
14341 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14343 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14344 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14347 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14353 PerlIO_printf(Perl_debug_log, "\n");
14355 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14361 Perl_regdump(pTHX_ const regexp *r)
14365 SV * const sv = sv_newmortal();
14366 SV *dsv= sv_newmortal();
14367 RXi_GET_DECL(r,ri);
14368 GET_RE_DEBUG_FLAGS_DECL;
14370 PERL_ARGS_ASSERT_REGDUMP;
14372 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14374 /* Header fields of interest. */
14375 if (r->anchored_substr) {
14376 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14377 RE_SV_DUMPLEN(r->anchored_substr), 30);
14378 PerlIO_printf(Perl_debug_log,
14379 "anchored %s%s at %"IVdf" ",
14380 s, RE_SV_TAIL(r->anchored_substr),
14381 (IV)r->anchored_offset);
14382 } else if (r->anchored_utf8) {
14383 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14384 RE_SV_DUMPLEN(r->anchored_utf8), 30);
14385 PerlIO_printf(Perl_debug_log,
14386 "anchored utf8 %s%s at %"IVdf" ",
14387 s, RE_SV_TAIL(r->anchored_utf8),
14388 (IV)r->anchored_offset);
14390 if (r->float_substr) {
14391 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14392 RE_SV_DUMPLEN(r->float_substr), 30);
14393 PerlIO_printf(Perl_debug_log,
14394 "floating %s%s at %"IVdf"..%"UVuf" ",
14395 s, RE_SV_TAIL(r->float_substr),
14396 (IV)r->float_min_offset, (UV)r->float_max_offset);
14397 } else if (r->float_utf8) {
14398 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14399 RE_SV_DUMPLEN(r->float_utf8), 30);
14400 PerlIO_printf(Perl_debug_log,
14401 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14402 s, RE_SV_TAIL(r->float_utf8),
14403 (IV)r->float_min_offset, (UV)r->float_max_offset);
14405 if (r->check_substr || r->check_utf8)
14406 PerlIO_printf(Perl_debug_log,
14408 (r->check_substr == r->float_substr
14409 && r->check_utf8 == r->float_utf8
14410 ? "(checking floating" : "(checking anchored"));
14411 if (r->extflags & RXf_NOSCAN)
14412 PerlIO_printf(Perl_debug_log, " noscan");
14413 if (r->extflags & RXf_CHECK_ALL)
14414 PerlIO_printf(Perl_debug_log, " isall");
14415 if (r->check_substr || r->check_utf8)
14416 PerlIO_printf(Perl_debug_log, ") ");
14418 if (ri->regstclass) {
14419 regprop(r, sv, ri->regstclass);
14420 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14422 if (r->extflags & RXf_ANCH) {
14423 PerlIO_printf(Perl_debug_log, "anchored");
14424 if (r->extflags & RXf_ANCH_BOL)
14425 PerlIO_printf(Perl_debug_log, "(BOL)");
14426 if (r->extflags & RXf_ANCH_MBOL)
14427 PerlIO_printf(Perl_debug_log, "(MBOL)");
14428 if (r->extflags & RXf_ANCH_SBOL)
14429 PerlIO_printf(Perl_debug_log, "(SBOL)");
14430 if (r->extflags & RXf_ANCH_GPOS)
14431 PerlIO_printf(Perl_debug_log, "(GPOS)");
14432 PerlIO_putc(Perl_debug_log, ' ');
14434 if (r->extflags & RXf_GPOS_SEEN)
14435 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14436 if (r->intflags & PREGf_SKIP)
14437 PerlIO_printf(Perl_debug_log, "plus ");
14438 if (r->intflags & PREGf_IMPLICIT)
14439 PerlIO_printf(Perl_debug_log, "implicit ");
14440 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14441 if (r->extflags & RXf_EVAL_SEEN)
14442 PerlIO_printf(Perl_debug_log, "with eval ");
14443 PerlIO_printf(Perl_debug_log, "\n");
14444 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14446 PERL_ARGS_ASSERT_REGDUMP;
14447 PERL_UNUSED_CONTEXT;
14448 PERL_UNUSED_ARG(r);
14449 #endif /* DEBUGGING */
14453 - regprop - printable representation of opcode
14455 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14458 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14459 if (flags & ANYOF_INVERT) \
14460 /*make sure the invert info is in each */ \
14461 sv_catpvs(sv, "^"); \
14467 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14473 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14474 static const char * const anyofs[] = {
14475 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14476 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
14477 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
14478 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
14479 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
14480 || _CC_VERTSPACE != 16
14481 #error Need to adjust order of anyofs[]
14518 RXi_GET_DECL(prog,progi);
14519 GET_RE_DEBUG_FLAGS_DECL;
14521 PERL_ARGS_ASSERT_REGPROP;
14525 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
14526 /* It would be nice to FAIL() here, but this may be called from
14527 regexec.c, and it would be hard to supply pRExC_state. */
14528 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14529 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14531 k = PL_regkind[OP(o)];
14534 sv_catpvs(sv, " ");
14535 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14536 * is a crude hack but it may be the best for now since
14537 * we have no flag "this EXACTish node was UTF-8"
14539 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14540 PERL_PV_ESCAPE_UNI_DETECT |
14541 PERL_PV_ESCAPE_NONASCII |
14542 PERL_PV_PRETTY_ELLIPSES |
14543 PERL_PV_PRETTY_LTGT |
14544 PERL_PV_PRETTY_NOCLEAR
14546 } else if (k == TRIE) {
14547 /* print the details of the trie in dumpuntil instead, as
14548 * progi->data isn't available here */
14549 const char op = OP(o);
14550 const U32 n = ARG(o);
14551 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14552 (reg_ac_data *)progi->data->data[n] :
14554 const reg_trie_data * const trie
14555 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14557 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14558 DEBUG_TRIE_COMPILE_r(
14559 Perl_sv_catpvf(aTHX_ sv,
14560 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14561 (UV)trie->startstate,
14562 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14563 (UV)trie->wordcount,
14566 (UV)TRIE_CHARCOUNT(trie),
14567 (UV)trie->uniquecharcount
14570 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14572 int rangestart = -1;
14573 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14574 sv_catpvs(sv, "[");
14575 for (i = 0; i <= 256; i++) {
14576 if (i < 256 && BITMAP_TEST(bitmap,i)) {
14577 if (rangestart == -1)
14579 } else if (rangestart != -1) {
14580 if (i <= rangestart + 3)
14581 for (; rangestart < i; rangestart++)
14582 put_byte(sv, rangestart);
14584 put_byte(sv, rangestart);
14585 sv_catpvs(sv, "-");
14586 put_byte(sv, i - 1);
14591 sv_catpvs(sv, "]");
14594 } else if (k == CURLY) {
14595 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14596 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14597 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14599 else if (k == WHILEM && o->flags) /* Ordinal/of */
14600 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14601 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14602 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14603 if ( RXp_PAREN_NAMES(prog) ) {
14604 if ( k != REF || (OP(o) < NREF)) {
14605 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14606 SV **name= av_fetch(list, ARG(o), 0 );
14608 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14611 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14612 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14613 I32 *nums=(I32*)SvPVX(sv_dat);
14614 SV **name= av_fetch(list, nums[0], 0 );
14617 for ( n=0; n<SvIVX(sv_dat); n++ ) {
14618 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14619 (n ? "," : ""), (IV)nums[n]);
14621 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14625 } else if (k == GOSUB)
14626 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14627 else if (k == VERB) {
14629 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14630 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14631 } else if (k == LOGICAL)
14632 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14633 else if (k == ANYOF) {
14634 int i, rangestart = -1;
14635 const U8 flags = ANYOF_FLAGS(o);
14639 if (flags & ANYOF_LOCALE)
14640 sv_catpvs(sv, "{loc}");
14641 if (flags & ANYOF_LOC_FOLD)
14642 sv_catpvs(sv, "{i}");
14643 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14644 if (flags & ANYOF_INVERT)
14645 sv_catpvs(sv, "^");
14647 /* output what the standard cp 0-255 bitmap matches */
14648 for (i = 0; i <= 256; i++) {
14649 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14650 if (rangestart == -1)
14652 } else if (rangestart != -1) {
14653 if (i <= rangestart + 3)
14654 for (; rangestart < i; rangestart++)
14655 put_byte(sv, rangestart);
14657 put_byte(sv, rangestart);
14658 sv_catpvs(sv, "-");
14659 put_byte(sv, i - 1);
14666 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14667 /* output any special charclass tests (used entirely under use locale) */
14668 if (ANYOF_CLASS_TEST_ANY_SET(o))
14669 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14670 if (ANYOF_CLASS_TEST(o,i)) {
14671 sv_catpv(sv, anyofs[i]);
14675 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14677 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14678 sv_catpvs(sv, "{non-utf8-latin1-all}");
14681 /* output information about the unicode matching */
14682 if (flags & ANYOF_UNICODE_ALL)
14683 sv_catpvs(sv, "{unicode_all}");
14684 else if (ANYOF_NONBITMAP(o))
14685 sv_catpvs(sv, "{unicode}");
14686 if (flags & ANYOF_NONBITMAP_NON_UTF8)
14687 sv_catpvs(sv, "{outside bitmap}");
14689 if (ANYOF_NONBITMAP(o)) {
14690 SV *lv; /* Set if there is something outside the bit map */
14691 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14692 bool byte_output = FALSE; /* If something in the bitmap has been
14695 if (lv && lv != &PL_sv_undef) {
14697 U8 s[UTF8_MAXBYTES_CASE+1];
14699 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14700 uvchr_to_utf8(s, i);
14703 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
14707 && swash_fetch(sw, s, TRUE))
14709 if (rangestart == -1)
14711 } else if (rangestart != -1) {
14712 byte_output = TRUE;
14713 if (i <= rangestart + 3)
14714 for (; rangestart < i; rangestart++) {
14715 put_byte(sv, rangestart);
14718 put_byte(sv, rangestart);
14719 sv_catpvs(sv, "-");
14728 char *s = savesvpv(lv);
14729 char * const origs = s;
14731 while (*s && *s != '\n')
14735 const char * const t = ++s;
14738 sv_catpvs(sv, " ");
14744 /* Truncate very long output */
14745 if (s - origs > 256) {
14746 Perl_sv_catpvf(aTHX_ sv,
14748 (int) (s - origs - 1),
14754 else if (*s == '\t') {
14769 SvREFCNT_dec_NN(lv);
14773 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14775 else if (k == POSIXD || k == NPOSIXD) {
14776 U8 index = FLAGS(o) * 2;
14777 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14778 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14781 sv_catpv(sv, anyofs[index]);
14784 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14785 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14787 PERL_UNUSED_CONTEXT;
14788 PERL_UNUSED_ARG(sv);
14789 PERL_UNUSED_ARG(o);
14790 PERL_UNUSED_ARG(prog);
14791 #endif /* DEBUGGING */
14795 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14796 { /* Assume that RE_INTUIT is set */
14798 struct regexp *const prog = ReANY(r);
14799 GET_RE_DEBUG_FLAGS_DECL;
14801 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14802 PERL_UNUSED_CONTEXT;
14806 const char * const s = SvPV_nolen_const(prog->check_substr
14807 ? prog->check_substr : prog->check_utf8);
14809 if (!PL_colorset) reginitcolors();
14810 PerlIO_printf(Perl_debug_log,
14811 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14813 prog->check_substr ? "" : "utf8 ",
14814 PL_colors[5],PL_colors[0],
14817 (strlen(s) > 60 ? "..." : ""));
14820 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14826 handles refcounting and freeing the perl core regexp structure. When
14827 it is necessary to actually free the structure the first thing it
14828 does is call the 'free' method of the regexp_engine associated to
14829 the regexp, allowing the handling of the void *pprivate; member
14830 first. (This routine is not overridable by extensions, which is why
14831 the extensions free is called first.)
14833 See regdupe and regdupe_internal if you change anything here.
14835 #ifndef PERL_IN_XSUB_RE
14837 Perl_pregfree(pTHX_ REGEXP *r)
14843 Perl_pregfree2(pTHX_ REGEXP *rx)
14846 struct regexp *const r = ReANY(rx);
14847 GET_RE_DEBUG_FLAGS_DECL;
14849 PERL_ARGS_ASSERT_PREGFREE2;
14851 if (r->mother_re) {
14852 ReREFCNT_dec(r->mother_re);
14854 CALLREGFREE_PVT(rx); /* free the private data */
14855 SvREFCNT_dec(RXp_PAREN_NAMES(r));
14856 Safefree(r->xpv_len_u.xpvlenu_pv);
14859 SvREFCNT_dec(r->anchored_substr);
14860 SvREFCNT_dec(r->anchored_utf8);
14861 SvREFCNT_dec(r->float_substr);
14862 SvREFCNT_dec(r->float_utf8);
14863 Safefree(r->substrs);
14865 RX_MATCH_COPY_FREE(rx);
14866 #ifdef PERL_ANY_COW
14867 SvREFCNT_dec(r->saved_copy);
14870 SvREFCNT_dec(r->qr_anoncv);
14871 rx->sv_u.svu_rx = 0;
14876 This is a hacky workaround to the structural issue of match results
14877 being stored in the regexp structure which is in turn stored in
14878 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14879 could be PL_curpm in multiple contexts, and could require multiple
14880 result sets being associated with the pattern simultaneously, such
14881 as when doing a recursive match with (??{$qr})
14883 The solution is to make a lightweight copy of the regexp structure
14884 when a qr// is returned from the code executed by (??{$qr}) this
14885 lightweight copy doesn't actually own any of its data except for
14886 the starp/end and the actual regexp structure itself.
14892 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14894 struct regexp *ret;
14895 struct regexp *const r = ReANY(rx);
14896 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14898 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14901 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14903 SvOK_off((SV *)ret_x);
14905 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14906 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
14907 made both spots point to the same regexp body.) */
14908 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14909 assert(!SvPVX(ret_x));
14910 ret_x->sv_u.svu_rx = temp->sv_any;
14911 temp->sv_any = NULL;
14912 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14913 SvREFCNT_dec_NN(temp);
14914 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14915 ing below will not set it. */
14916 SvCUR_set(ret_x, SvCUR(rx));
14919 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14920 sv_force_normal(sv) is called. */
14922 ret = ReANY(ret_x);
14924 SvFLAGS(ret_x) |= SvUTF8(rx);
14925 /* We share the same string buffer as the original regexp, on which we
14926 hold a reference count, incremented when mother_re is set below.
14927 The string pointer is copied here, being part of the regexp struct.
14929 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14930 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14932 const I32 npar = r->nparens+1;
14933 Newx(ret->offs, npar, regexp_paren_pair);
14934 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14937 Newx(ret->substrs, 1, struct reg_substr_data);
14938 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14940 SvREFCNT_inc_void(ret->anchored_substr);
14941 SvREFCNT_inc_void(ret->anchored_utf8);
14942 SvREFCNT_inc_void(ret->float_substr);
14943 SvREFCNT_inc_void(ret->float_utf8);
14945 /* check_substr and check_utf8, if non-NULL, point to either their
14946 anchored or float namesakes, and don't hold a second reference. */
14948 RX_MATCH_COPIED_off(ret_x);
14949 #ifdef PERL_ANY_COW
14950 ret->saved_copy = NULL;
14952 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14953 SvREFCNT_inc_void(ret->qr_anoncv);
14959 /* regfree_internal()
14961 Free the private data in a regexp. This is overloadable by
14962 extensions. Perl takes care of the regexp structure in pregfree(),
14963 this covers the *pprivate pointer which technically perl doesn't
14964 know about, however of course we have to handle the
14965 regexp_internal structure when no extension is in use.
14967 Note this is called before freeing anything in the regexp
14972 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14975 struct regexp *const r = ReANY(rx);
14976 RXi_GET_DECL(r,ri);
14977 GET_RE_DEBUG_FLAGS_DECL;
14979 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14985 SV *dsv= sv_newmortal();
14986 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14987 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14988 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14989 PL_colors[4],PL_colors[5],s);
14992 #ifdef RE_TRACK_PATTERN_OFFSETS
14994 Safefree(ri->u.offsets); /* 20010421 MJD */
14996 if (ri->code_blocks) {
14998 for (n = 0; n < ri->num_code_blocks; n++)
14999 SvREFCNT_dec(ri->code_blocks[n].src_regex);
15000 Safefree(ri->code_blocks);
15004 int n = ri->data->count;
15007 /* If you add a ->what type here, update the comment in regcomp.h */
15008 switch (ri->data->what[n]) {
15014 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15017 Safefree(ri->data->data[n]);
15023 { /* Aho Corasick add-on structure for a trie node.
15024 Used in stclass optimization only */
15026 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15028 refcount = --aho->refcount;
15031 PerlMemShared_free(aho->states);
15032 PerlMemShared_free(aho->fail);
15033 /* do this last!!!! */
15034 PerlMemShared_free(ri->data->data[n]);
15035 PerlMemShared_free(ri->regstclass);
15041 /* trie structure. */
15043 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15045 refcount = --trie->refcount;
15048 PerlMemShared_free(trie->charmap);
15049 PerlMemShared_free(trie->states);
15050 PerlMemShared_free(trie->trans);
15052 PerlMemShared_free(trie->bitmap);
15054 PerlMemShared_free(trie->jump);
15055 PerlMemShared_free(trie->wordinfo);
15056 /* do this last!!!! */
15057 PerlMemShared_free(ri->data->data[n]);
15062 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15065 Safefree(ri->data->what);
15066 Safefree(ri->data);
15072 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15073 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15074 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15077 re_dup - duplicate a regexp.
15079 This routine is expected to clone a given regexp structure. It is only
15080 compiled under USE_ITHREADS.
15082 After all of the core data stored in struct regexp is duplicated
15083 the regexp_engine.dupe method is used to copy any private data
15084 stored in the *pprivate pointer. This allows extensions to handle
15085 any duplication it needs to do.
15087 See pregfree() and regfree_internal() if you change anything here.
15089 #if defined(USE_ITHREADS)
15090 #ifndef PERL_IN_XSUB_RE
15092 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15096 const struct regexp *r = ReANY(sstr);
15097 struct regexp *ret = ReANY(dstr);
15099 PERL_ARGS_ASSERT_RE_DUP_GUTS;
15101 npar = r->nparens+1;
15102 Newx(ret->offs, npar, regexp_paren_pair);
15103 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15105 if (ret->substrs) {
15106 /* Do it this way to avoid reading from *r after the StructCopy().
15107 That way, if any of the sv_dup_inc()s dislodge *r from the L1
15108 cache, it doesn't matter. */
15109 const bool anchored = r->check_substr
15110 ? r->check_substr == r->anchored_substr
15111 : r->check_utf8 == r->anchored_utf8;
15112 Newx(ret->substrs, 1, struct reg_substr_data);
15113 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15115 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15116 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15117 ret->float_substr = sv_dup_inc(ret->float_substr, param);
15118 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15120 /* check_substr and check_utf8, if non-NULL, point to either their
15121 anchored or float namesakes, and don't hold a second reference. */
15123 if (ret->check_substr) {
15125 assert(r->check_utf8 == r->anchored_utf8);
15126 ret->check_substr = ret->anchored_substr;
15127 ret->check_utf8 = ret->anchored_utf8;
15129 assert(r->check_substr == r->float_substr);
15130 assert(r->check_utf8 == r->float_utf8);
15131 ret->check_substr = ret->float_substr;
15132 ret->check_utf8 = ret->float_utf8;
15134 } else if (ret->check_utf8) {
15136 ret->check_utf8 = ret->anchored_utf8;
15138 ret->check_utf8 = ret->float_utf8;
15143 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15144 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15147 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15149 if (RX_MATCH_COPIED(dstr))
15150 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15152 ret->subbeg = NULL;
15153 #ifdef PERL_ANY_COW
15154 ret->saved_copy = NULL;
15157 /* Whether mother_re be set or no, we need to copy the string. We
15158 cannot refrain from copying it when the storage points directly to
15159 our mother regexp, because that's
15160 1: a buffer in a different thread
15161 2: something we no longer hold a reference on
15162 so we need to copy it locally. */
15163 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15164 ret->mother_re = NULL;
15167 #endif /* PERL_IN_XSUB_RE */
15172 This is the internal complement to regdupe() which is used to copy
15173 the structure pointed to by the *pprivate pointer in the regexp.
15174 This is the core version of the extension overridable cloning hook.
15175 The regexp structure being duplicated will be copied by perl prior
15176 to this and will be provided as the regexp *r argument, however
15177 with the /old/ structures pprivate pointer value. Thus this routine
15178 may override any copying normally done by perl.
15180 It returns a pointer to the new regexp_internal structure.
15184 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15187 struct regexp *const r = ReANY(rx);
15188 regexp_internal *reti;
15190 RXi_GET_DECL(r,ri);
15192 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15196 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15197 Copy(ri->program, reti->program, len+1, regnode);
15199 reti->num_code_blocks = ri->num_code_blocks;
15200 if (ri->code_blocks) {
15202 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15203 struct reg_code_block);
15204 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15205 struct reg_code_block);
15206 for (n = 0; n < ri->num_code_blocks; n++)
15207 reti->code_blocks[n].src_regex = (REGEXP*)
15208 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15211 reti->code_blocks = NULL;
15213 reti->regstclass = NULL;
15216 struct reg_data *d;
15217 const int count = ri->data->count;
15220 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15221 char, struct reg_data);
15222 Newx(d->what, count, U8);
15225 for (i = 0; i < count; i++) {
15226 d->what[i] = ri->data->what[i];
15227 switch (d->what[i]) {
15228 /* see also regcomp.h and regfree_internal() */
15229 case 'a': /* actually an AV, but the dup function is identical. */
15233 case 'u': /* actually an HV, but the dup function is identical. */
15234 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15237 /* This is cheating. */
15238 Newx(d->data[i], 1, struct regnode_charclass_class);
15239 StructCopy(ri->data->data[i], d->data[i],
15240 struct regnode_charclass_class);
15241 reti->regstclass = (regnode*)d->data[i];
15244 /* Trie stclasses are readonly and can thus be shared
15245 * without duplication. We free the stclass in pregfree
15246 * when the corresponding reg_ac_data struct is freed.
15248 reti->regstclass= ri->regstclass;
15252 ((reg_trie_data*)ri->data->data[i])->refcount++;
15257 d->data[i] = ri->data->data[i];
15260 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15269 reti->name_list_idx = ri->name_list_idx;
15271 #ifdef RE_TRACK_PATTERN_OFFSETS
15272 if (ri->u.offsets) {
15273 Newx(reti->u.offsets, 2*len+1, U32);
15274 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15277 SetProgLen(reti,len);
15280 return (void*)reti;
15283 #endif /* USE_ITHREADS */
15285 #ifndef PERL_IN_XSUB_RE
15288 - regnext - dig the "next" pointer out of a node
15291 Perl_regnext(pTHX_ regnode *p)
15299 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
15300 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15303 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15312 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15315 STRLEN l1 = strlen(pat1);
15316 STRLEN l2 = strlen(pat2);
15319 const char *message;
15321 PERL_ARGS_ASSERT_RE_CROAK2;
15327 Copy(pat1, buf, l1 , char);
15328 Copy(pat2, buf + l1, l2 , char);
15329 buf[l1 + l2] = '\n';
15330 buf[l1 + l2 + 1] = '\0';
15332 /* ANSI variant takes additional second argument */
15333 va_start(args, pat2);
15337 msv = vmess(buf, &args);
15339 message = SvPV_const(msv,l1);
15342 Copy(message, buf, l1 , char);
15343 buf[l1-1] = '\0'; /* Overwrite \n */
15344 Perl_croak(aTHX_ "%s", buf);
15347 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
15349 #ifndef PERL_IN_XSUB_RE
15351 Perl_save_re_context(pTHX)
15355 struct re_save_state *state;
15357 SAVEVPTR(PL_curcop);
15358 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15360 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15361 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15362 SSPUSHUV(SAVEt_RE_STATE);
15364 Copy(&PL_reg_state, state, 1, struct re_save_state);
15366 PL_reg_oldsaved = NULL;
15367 PL_reg_oldsavedlen = 0;
15368 PL_reg_oldsavedoffset = 0;
15369 PL_reg_oldsavedcoffset = 0;
15370 PL_reg_maxiter = 0;
15371 PL_reg_leftiter = 0;
15372 PL_reg_poscache = NULL;
15373 PL_reg_poscache_size = 0;
15374 #ifdef PERL_ANY_COW
15378 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15380 const REGEXP * const rx = PM_GETRE(PL_curpm);
15383 for (i = 1; i <= RX_NPARENS(rx); i++) {
15384 char digits[TYPE_CHARS(long)];
15385 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15386 GV *const *const gvp
15387 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15390 GV * const gv = *gvp;
15391 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15403 S_put_byte(pTHX_ SV *sv, int c)
15405 PERL_ARGS_ASSERT_PUT_BYTE;
15407 /* Our definition of isPRINT() ignores locales, so only bytes that are
15408 not part of UTF-8 are considered printable. I assume that the same
15409 holds for UTF-EBCDIC.
15410 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15411 which Wikipedia says:
15413 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15414 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15415 identical, to the ASCII delete (DEL) or rubout control character. ...
15416 it is typically mapped to hexadecimal code 9F, in order to provide a
15417 unique character mapping in both directions)
15419 So the old condition can be simplified to !isPRINT(c) */
15422 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15425 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15429 const char string = c;
15430 if (c == '-' || c == ']' || c == '\\' || c == '^')
15431 sv_catpvs(sv, "\\");
15432 sv_catpvn(sv, &string, 1);
15437 #define CLEAR_OPTSTART \
15438 if (optstart) STMT_START { \
15439 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15443 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15445 STATIC const regnode *
15446 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15447 const regnode *last, const regnode *plast,
15448 SV* sv, I32 indent, U32 depth)
15451 U8 op = PSEUDO; /* Arbitrary non-END op. */
15452 const regnode *next;
15453 const regnode *optstart= NULL;
15455 RXi_GET_DECL(r,ri);
15456 GET_RE_DEBUG_FLAGS_DECL;
15458 PERL_ARGS_ASSERT_DUMPUNTIL;
15460 #ifdef DEBUG_DUMPUNTIL
15461 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15462 last ? last-start : 0,plast ? plast-start : 0);
15465 if (plast && plast < last)
15468 while (PL_regkind[op] != END && (!last || node < last)) {
15469 /* While that wasn't END last time... */
15472 if (op == CLOSE || op == WHILEM)
15474 next = regnext((regnode *)node);
15477 if (OP(node) == OPTIMIZED) {
15478 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15485 regprop(r, sv, node);
15486 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15487 (int)(2*indent + 1), "", SvPVX_const(sv));
15489 if (OP(node) != OPTIMIZED) {
15490 if (next == NULL) /* Next ptr. */
15491 PerlIO_printf(Perl_debug_log, " (0)");
15492 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15493 PerlIO_printf(Perl_debug_log, " (FAIL)");
15495 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15496 (void)PerlIO_putc(Perl_debug_log, '\n');
15500 if (PL_regkind[(U8)op] == BRANCHJ) {
15503 const regnode *nnode = (OP(next) == LONGJMP
15504 ? regnext((regnode *)next)
15506 if (last && nnode > last)
15508 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15511 else if (PL_regkind[(U8)op] == BRANCH) {
15513 DUMPUNTIL(NEXTOPER(node), next);
15515 else if ( PL_regkind[(U8)op] == TRIE ) {
15516 const regnode *this_trie = node;
15517 const char op = OP(node);
15518 const U32 n = ARG(node);
15519 const reg_ac_data * const ac = op>=AHOCORASICK ?
15520 (reg_ac_data *)ri->data->data[n] :
15522 const reg_trie_data * const trie =
15523 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15525 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15527 const regnode *nextbranch= NULL;
15530 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15531 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15533 PerlIO_printf(Perl_debug_log, "%*s%s ",
15534 (int)(2*(indent+3)), "",
15535 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15536 PL_colors[0], PL_colors[1],
15537 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15538 PERL_PV_PRETTY_ELLIPSES |
15539 PERL_PV_PRETTY_LTGT
15544 U16 dist= trie->jump[word_idx+1];
15545 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15546 (UV)((dist ? this_trie + dist : next) - start));
15549 nextbranch= this_trie + trie->jump[0];
15550 DUMPUNTIL(this_trie + dist, nextbranch);
15552 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15553 nextbranch= regnext((regnode *)nextbranch);
15555 PerlIO_printf(Perl_debug_log, "\n");
15558 if (last && next > last)
15563 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
15564 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15565 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15567 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15569 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15571 else if ( op == PLUS || op == STAR) {
15572 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15574 else if (PL_regkind[(U8)op] == ANYOF) {
15575 /* arglen 1 + class block */
15576 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15577 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15578 node = NEXTOPER(node);
15580 else if (PL_regkind[(U8)op] == EXACT) {
15581 /* Literal string, where present. */
15582 node += NODE_SZ_STR(node) - 1;
15583 node = NEXTOPER(node);
15586 node = NEXTOPER(node);
15587 node += regarglen[(U8)op];
15589 if (op == CURLYX || op == OPEN)
15593 #ifdef DEBUG_DUMPUNTIL
15594 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15599 #endif /* DEBUGGING */
15603 * c-indentation-style: bsd
15604 * c-basic-offset: 4
15605 * indent-tabs-mode: nil
15608 * ex: set ts=8 sts=4 sw=4 et: