5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 # if defined(BUGGY_MSC6)
104 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
105 # pragma optimize("a",off)
106 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
107 # pragma optimize("w",on )
108 # endif /* BUGGY_MSC6 */
112 #define STATIC static
116 typedef struct RExC_state_t {
117 U32 flags; /* RXf_* are we folding, multilining? */
118 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
119 char *precomp; /* uncompiled string. */
120 REGEXP *rx_sv; /* The SV that is the regexp. */
121 regexp *rx; /* perl core regexp structure */
122 regexp_internal *rxi; /* internal data for regexp object pprivate field */
123 char *start; /* Start of input for compile */
124 char *end; /* End of input for compile */
125 char *parse; /* Input-scan pointer. */
126 I32 whilem_seen; /* number of WHILEM in this expr */
127 regnode *emit_start; /* Start of emitted-code area */
128 regnode *emit_bound; /* First regnode outside of the allocated space */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* Capture buffer count, (OPEN). */
135 I32 cpar; /* Capture buffer count, (CLOSE). */
136 I32 nestroot; /* root parens we are in - used by accept */
139 regnode **open_parens; /* pointers to open parens */
140 regnode **close_parens; /* pointers to close parens */
141 regnode *opend; /* END node in program */
142 I32 utf8; /* whether the pattern is utf8 or not */
143 I32 orig_utf8; /* whether the pattern was originally in utf8 */
144 /* XXX use this for future optimisation of case
145 * where pattern must be upgraded to utf8. */
146 I32 uni_semantics; /* If a d charset modifier should use unicode
147 rules, even if the pattern is not in
149 HV *paren_names; /* Paren names */
151 regnode **recurse; /* Recurse regops */
152 I32 recurse_count; /* Number of recurse regops */
155 I32 override_recoding;
156 I32 in_multi_char_class;
157 struct reg_code_block *code_blocks; /* positions of literal (?{})
159 int num_code_blocks; /* size of code_blocks[] */
160 int code_index; /* next code_blocks[] slot */
162 char *starttry; /* -Dr: where regtry was called. */
163 #define RExC_starttry (pRExC_state->starttry)
165 SV *runtime_code_qr; /* qr with the runtime code blocks */
167 const char *lastparse;
169 AV *paren_name_list; /* idx -> name */
170 #define RExC_lastparse (pRExC_state->lastparse)
171 #define RExC_lastnum (pRExC_state->lastnum)
172 #define RExC_paren_name_list (pRExC_state->paren_name_list)
176 #define RExC_flags (pRExC_state->flags)
177 #define RExC_pm_flags (pRExC_state->pm_flags)
178 #define RExC_precomp (pRExC_state->precomp)
179 #define RExC_rx_sv (pRExC_state->rx_sv)
180 #define RExC_rx (pRExC_state->rx)
181 #define RExC_rxi (pRExC_state->rxi)
182 #define RExC_start (pRExC_state->start)
183 #define RExC_end (pRExC_state->end)
184 #define RExC_parse (pRExC_state->parse)
185 #define RExC_whilem_seen (pRExC_state->whilem_seen)
186 #ifdef RE_TRACK_PATTERN_OFFSETS
187 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
189 #define RExC_emit (pRExC_state->emit)
190 #define RExC_emit_start (pRExC_state->emit_start)
191 #define RExC_emit_bound (pRExC_state->emit_bound)
192 #define RExC_naughty (pRExC_state->naughty)
193 #define RExC_sawback (pRExC_state->sawback)
194 #define RExC_seen (pRExC_state->seen)
195 #define RExC_size (pRExC_state->size)
196 #define RExC_npar (pRExC_state->npar)
197 #define RExC_nestroot (pRExC_state->nestroot)
198 #define RExC_extralen (pRExC_state->extralen)
199 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
200 #define RExC_utf8 (pRExC_state->utf8)
201 #define RExC_uni_semantics (pRExC_state->uni_semantics)
202 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
203 #define RExC_open_parens (pRExC_state->open_parens)
204 #define RExC_close_parens (pRExC_state->close_parens)
205 #define RExC_opend (pRExC_state->opend)
206 #define RExC_paren_names (pRExC_state->paren_names)
207 #define RExC_recurse (pRExC_state->recurse)
208 #define RExC_recurse_count (pRExC_state->recurse_count)
209 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
210 #define RExC_contains_locale (pRExC_state->contains_locale)
211 #define RExC_override_recoding (pRExC_state->override_recoding)
212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
215 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
216 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
217 ((*s) == '{' && regcurly(s, FALSE)))
220 #undef SPSTART /* dratted cpp namespace... */
223 * Flags to be passed up and down.
225 #define WORST 0 /* Worst case. */
226 #define HASWIDTH 0x01 /* Known to match non-null strings. */
228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
229 * character. (There needs to be a case: in the switch statement in regexec.c
230 * for any node marked SIMPLE.) Note that this is not the same thing as
233 #define SPSTART 0x04 /* Starts with * or + */
234 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
235 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
237 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
239 /* whether trie related optimizations are enabled */
240 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
241 #define TRIE_STUDY_OPT
242 #define FULL_TRIE_STUDY
248 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
249 #define PBITVAL(paren) (1 << ((paren) & 7))
250 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
251 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
252 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
254 /* If not already in utf8, do a longjmp back to the beginning */
255 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
256 #define REQUIRE_UTF8 STMT_START { \
257 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
260 /* This converts the named class defined in regcomp.h to its equivalent class
261 * number defined in handy.h. */
262 #define namedclass_to_classnum(class) ((int) ((class) / 2))
263 #define classnum_to_namedclass(classnum) ((classnum) * 2)
265 /* About scan_data_t.
267 During optimisation we recurse through the regexp program performing
268 various inplace (keyhole style) optimisations. In addition study_chunk
269 and scan_commit populate this data structure with information about
270 what strings MUST appear in the pattern. We look for the longest
271 string that must appear at a fixed location, and we look for the
272 longest string that may appear at a floating location. So for instance
277 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
278 strings (because they follow a .* construct). study_chunk will identify
279 both FOO and BAR as being the longest fixed and floating strings respectively.
281 The strings can be composites, for instance
285 will result in a composite fixed substring 'foo'.
287 For each string some basic information is maintained:
289 - offset or min_offset
290 This is the position the string must appear at, or not before.
291 It also implicitly (when combined with minlenp) tells us how many
292 characters must match before the string we are searching for.
293 Likewise when combined with minlenp and the length of the string it
294 tells us how many characters must appear after the string we have
298 Only used for floating strings. This is the rightmost point that
299 the string can appear at. If set to I32 max it indicates that the
300 string can occur infinitely far to the right.
303 A pointer to the minimum number of characters of the pattern that the
304 string was found inside. This is important as in the case of positive
305 lookahead or positive lookbehind we can have multiple patterns
310 The minimum length of the pattern overall is 3, the minimum length
311 of the lookahead part is 3, but the minimum length of the part that
312 will actually match is 1. So 'FOO's minimum length is 3, but the
313 minimum length for the F is 1. This is important as the minimum length
314 is used to determine offsets in front of and behind the string being
315 looked for. Since strings can be composites this is the length of the
316 pattern at the time it was committed with a scan_commit. Note that
317 the length is calculated by study_chunk, so that the minimum lengths
318 are not known until the full pattern has been compiled, thus the
319 pointer to the value.
323 In the case of lookbehind the string being searched for can be
324 offset past the start point of the final matching string.
325 If this value was just blithely removed from the min_offset it would
326 invalidate some of the calculations for how many chars must match
327 before or after (as they are derived from min_offset and minlen and
328 the length of the string being searched for).
329 When the final pattern is compiled and the data is moved from the
330 scan_data_t structure into the regexp structure the information
331 about lookbehind is factored in, with the information that would
332 have been lost precalculated in the end_shift field for the
335 The fields pos_min and pos_delta are used to store the minimum offset
336 and the delta to the maximum offset at the current point in the pattern.
340 typedef struct scan_data_t {
341 /*I32 len_min; unused */
342 /*I32 len_delta; unused */
346 I32 last_end; /* min value, <0 unless valid. */
349 SV **longest; /* Either &l_fixed, or &l_float. */
350 SV *longest_fixed; /* longest fixed string found in pattern */
351 I32 offset_fixed; /* offset where it starts */
352 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
353 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
354 SV *longest_float; /* longest floating string found in pattern */
355 I32 offset_float_min; /* earliest point in string it can appear */
356 I32 offset_float_max; /* latest point in string it can appear */
357 I32 *minlen_float; /* pointer to the minlen relevant to the string */
358 I32 lookbehind_float; /* is the position of the string modified by LB */
362 struct regnode_charclass_class *start_class;
366 * Forward declarations for pregcomp()'s friends.
369 static const scan_data_t zero_scan_data =
370 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
372 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
373 #define SF_BEFORE_SEOL 0x0001
374 #define SF_BEFORE_MEOL 0x0002
375 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
376 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
379 # define SF_FIX_SHIFT_EOL (0+2)
380 # define SF_FL_SHIFT_EOL (0+4)
382 # define SF_FIX_SHIFT_EOL (+2)
383 # define SF_FL_SHIFT_EOL (+4)
386 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
387 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
389 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
390 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
391 #define SF_IS_INF 0x0040
392 #define SF_HAS_PAR 0x0080
393 #define SF_IN_PAR 0x0100
394 #define SF_HAS_EVAL 0x0200
395 #define SCF_DO_SUBSTR 0x0400
396 #define SCF_DO_STCLASS_AND 0x0800
397 #define SCF_DO_STCLASS_OR 0x1000
398 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
399 #define SCF_WHILEM_VISITED_POS 0x2000
401 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
402 #define SCF_SEEN_ACCEPT 0x8000
404 #define UTF cBOOL(RExC_utf8)
406 /* The enums for all these are ordered so things work out correctly */
407 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
408 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
409 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
410 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
411 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
412 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
413 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
415 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
417 #define OOB_NAMEDCLASS -1
419 /* There is no code point that is out-of-bounds, so this is problematic. But
420 * its only current use is to initialize a variable that is always set before
422 #define OOB_UNICODE 0xDEADBEEF
424 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
425 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
428 /* length of regex to show in messages that don't mark a position within */
429 #define RegexLengthToShowInErrorMessages 127
432 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
433 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
434 * op/pragma/warn/regcomp.
436 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
437 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
439 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
442 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
443 * arg. Show regex, up to a maximum length. If it's too long, chop and add
446 #define _FAIL(code) STMT_START { \
447 const char *ellipses = ""; \
448 IV len = RExC_end - RExC_precomp; \
451 SAVEFREESV(RExC_rx_sv); \
452 if (len > RegexLengthToShowInErrorMessages) { \
453 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
454 len = RegexLengthToShowInErrorMessages - 10; \
460 #define FAIL(msg) _FAIL( \
461 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
462 msg, (int)len, RExC_precomp, ellipses))
464 #define FAIL2(msg,arg) _FAIL( \
465 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
466 arg, (int)len, RExC_precomp, ellipses))
469 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
471 #define Simple_vFAIL(m) STMT_START { \
472 const IV offset = RExC_parse - RExC_precomp; \
473 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
474 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
478 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
480 #define vFAIL(m) STMT_START { \
482 SAVEFREESV(RExC_rx_sv); \
487 * Like Simple_vFAIL(), but accepts two arguments.
489 #define Simple_vFAIL2(m,a1) STMT_START { \
490 const IV offset = RExC_parse - RExC_precomp; \
491 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
492 (int)offset, RExC_precomp, RExC_precomp + offset); \
496 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
498 #define vFAIL2(m,a1) STMT_START { \
500 SAVEFREESV(RExC_rx_sv); \
501 Simple_vFAIL2(m, a1); \
506 * Like Simple_vFAIL(), but accepts three arguments.
508 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
509 const IV offset = RExC_parse - RExC_precomp; \
510 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
511 (int)offset, RExC_precomp, RExC_precomp + offset); \
515 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
517 #define vFAIL3(m,a1,a2) STMT_START { \
519 SAVEFREESV(RExC_rx_sv); \
520 Simple_vFAIL3(m, a1, a2); \
524 * Like Simple_vFAIL(), but accepts four arguments.
526 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
527 const IV offset = RExC_parse - RExC_precomp; \
528 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
529 (int)offset, RExC_precomp, RExC_precomp + offset); \
532 #define vFAIL4(m,a1,a2,a3) STMT_START { \
534 SAVEFREESV(RExC_rx_sv); \
535 Simple_vFAIL4(m, a1, a2, a3); \
538 /* m is not necessarily a "literal string", in this macro */
539 #define reg_warn_non_literal_string(loc, m) STMT_START { \
540 const IV offset = loc - RExC_precomp; \
541 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
542 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
545 #define ckWARNreg(loc,m) STMT_START { \
546 const IV offset = loc - RExC_precomp; \
547 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
548 (int)offset, RExC_precomp, RExC_precomp + offset); \
551 #define vWARN_dep(loc, m) STMT_START { \
552 const IV offset = loc - RExC_precomp; \
553 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
554 (int)offset, RExC_precomp, RExC_precomp + offset); \
557 #define ckWARNdep(loc,m) STMT_START { \
558 const IV offset = loc - RExC_precomp; \
559 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
561 (int)offset, RExC_precomp, RExC_precomp + offset); \
564 #define ckWARNregdep(loc,m) STMT_START { \
565 const IV offset = loc - RExC_precomp; \
566 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
568 (int)offset, RExC_precomp, RExC_precomp + offset); \
571 #define ckWARN2regdep(loc,m, a1) STMT_START { \
572 const IV offset = loc - RExC_precomp; \
573 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
575 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
578 #define ckWARN2reg(loc, m, a1) STMT_START { \
579 const IV offset = loc - RExC_precomp; \
580 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
581 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
584 #define vWARN3(loc, m, a1, a2) STMT_START { \
585 const IV offset = loc - RExC_precomp; \
586 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
587 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
590 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
591 const IV offset = loc - RExC_precomp; \
592 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
593 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
596 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
597 const IV offset = loc - RExC_precomp; \
598 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
599 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
602 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
603 const IV offset = loc - RExC_precomp; \
604 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
605 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
608 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
609 const IV offset = loc - RExC_precomp; \
610 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
611 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
615 /* Allow for side effects in s */
616 #define REGC(c,s) STMT_START { \
617 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
620 /* Macros for recording node offsets. 20001227 mjd@plover.com
621 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
622 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
623 * Element 0 holds the number n.
624 * Position is 1 indexed.
626 #ifndef RE_TRACK_PATTERN_OFFSETS
627 #define Set_Node_Offset_To_R(node,byte)
628 #define Set_Node_Offset(node,byte)
629 #define Set_Cur_Node_Offset
630 #define Set_Node_Length_To_R(node,len)
631 #define Set_Node_Length(node,len)
632 #define Set_Node_Cur_Length(node)
633 #define Node_Offset(n)
634 #define Node_Length(n)
635 #define Set_Node_Offset_Length(node,offset,len)
636 #define ProgLen(ri) ri->u.proglen
637 #define SetProgLen(ri,x) ri->u.proglen = x
639 #define ProgLen(ri) ri->u.offsets[0]
640 #define SetProgLen(ri,x) ri->u.offsets[0] = x
641 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
643 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
644 __LINE__, (int)(node), (int)(byte))); \
646 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
648 RExC_offsets[2*(node)-1] = (byte); \
653 #define Set_Node_Offset(node,byte) \
654 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
655 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
657 #define Set_Node_Length_To_R(node,len) STMT_START { \
659 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
660 __LINE__, (int)(node), (int)(len))); \
662 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
664 RExC_offsets[2*(node)] = (len); \
669 #define Set_Node_Length(node,len) \
670 Set_Node_Length_To_R((node)-RExC_emit_start, len)
671 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
672 #define Set_Node_Cur_Length(node) \
673 Set_Node_Length(node, RExC_parse - parse_start)
675 /* Get offsets and lengths */
676 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
677 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
679 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
680 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
681 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
685 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
686 #define EXPERIMENTAL_INPLACESCAN
687 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
689 #define DEBUG_STUDYDATA(str,data,depth) \
690 DEBUG_OPTIMISE_MORE_r(if(data){ \
691 PerlIO_printf(Perl_debug_log, \
692 "%*s" str "Pos:%"IVdf"/%"IVdf \
693 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
694 (int)(depth)*2, "", \
695 (IV)((data)->pos_min), \
696 (IV)((data)->pos_delta), \
697 (UV)((data)->flags), \
698 (IV)((data)->whilem_c), \
699 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
700 is_inf ? "INF " : "" \
702 if ((data)->last_found) \
703 PerlIO_printf(Perl_debug_log, \
704 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
705 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
706 SvPVX_const((data)->last_found), \
707 (IV)((data)->last_end), \
708 (IV)((data)->last_start_min), \
709 (IV)((data)->last_start_max), \
710 ((data)->longest && \
711 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
712 SvPVX_const((data)->longest_fixed), \
713 (IV)((data)->offset_fixed), \
714 ((data)->longest && \
715 (data)->longest==&((data)->longest_float)) ? "*" : "", \
716 SvPVX_const((data)->longest_float), \
717 (IV)((data)->offset_float_min), \
718 (IV)((data)->offset_float_max) \
720 PerlIO_printf(Perl_debug_log,"\n"); \
723 /* Mark that we cannot extend a found fixed substring at this point.
724 Update the longest found anchored substring and the longest found
725 floating substrings if needed. */
728 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
730 const STRLEN l = CHR_SVLEN(data->last_found);
731 const STRLEN old_l = CHR_SVLEN(*data->longest);
732 GET_RE_DEBUG_FLAGS_DECL;
734 PERL_ARGS_ASSERT_SCAN_COMMIT;
736 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
737 SvSetMagicSV(*data->longest, data->last_found);
738 if (*data->longest == data->longest_fixed) {
739 data->offset_fixed = l ? data->last_start_min : data->pos_min;
740 if (data->flags & SF_BEFORE_EOL)
742 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
744 data->flags &= ~SF_FIX_BEFORE_EOL;
745 data->minlen_fixed=minlenp;
746 data->lookbehind_fixed=0;
748 else { /* *data->longest == data->longest_float */
749 data->offset_float_min = l ? data->last_start_min : data->pos_min;
750 data->offset_float_max = (l
751 ? data->last_start_max
752 : data->pos_min + data->pos_delta);
753 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
754 data->offset_float_max = I32_MAX;
755 if (data->flags & SF_BEFORE_EOL)
757 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
759 data->flags &= ~SF_FL_BEFORE_EOL;
760 data->minlen_float=minlenp;
761 data->lookbehind_float=0;
764 SvCUR_set(data->last_found, 0);
766 SV * const sv = data->last_found;
767 if (SvUTF8(sv) && SvMAGICAL(sv)) {
768 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
774 data->flags &= ~SF_BEFORE_EOL;
775 DEBUG_STUDYDATA("commit: ",data,0);
778 /* These macros set, clear and test whether the synthetic start class ('ssc',
779 * given by the parameter) matches an empty string (EOS). This uses the
780 * 'next_off' field in the node, to save a bit in the flags field. The ssc
781 * stands alone, so there is never a next_off, so this field is otherwise
782 * unused. The EOS information is used only for compilation, but theoretically
783 * it could be passed on to the execution code. This could be used to store
784 * more than one bit of information, but only this one is currently used. */
785 #define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
786 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
787 #define TEST_SSC_EOS(node) cBOOL((node)->next_off)
789 /* Can match anything (initialization) */
791 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
793 PERL_ARGS_ASSERT_CL_ANYTHING;
795 ANYOF_BITMAP_SETALL(cl);
796 cl->flags = ANYOF_UNICODE_ALL;
799 /* If any portion of the regex is to operate under locale rules,
800 * initialization includes it. The reason this isn't done for all regexes
801 * is that the optimizer was written under the assumption that locale was
802 * all-or-nothing. Given the complexity and lack of documentation in the
803 * optimizer, and that there are inadequate test cases for locale, so many
804 * parts of it may not work properly, it is safest to avoid locale unless
806 if (RExC_contains_locale) {
807 ANYOF_CLASS_SETALL(cl); /* /l uses class */
808 cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
811 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
815 /* Can match anything (initialization) */
817 S_cl_is_anything(const struct regnode_charclass_class *cl)
821 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
823 for (value = 0; value < ANYOF_MAX; value += 2)
824 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
826 if (!(cl->flags & ANYOF_UNICODE_ALL))
828 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
833 /* Can match anything (initialization) */
835 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
837 PERL_ARGS_ASSERT_CL_INIT;
839 Zero(cl, 1, struct regnode_charclass_class);
841 cl_anything(pRExC_state, cl);
842 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
845 /* These two functions currently do the exact same thing */
846 #define cl_init_zero S_cl_init
848 /* 'AND' a given class with another one. Can create false positives. 'cl'
849 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
850 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
852 S_cl_and(struct regnode_charclass_class *cl,
853 const struct regnode_charclass_class *and_with)
855 PERL_ARGS_ASSERT_CL_AND;
857 assert(PL_regkind[and_with->type] == ANYOF);
859 /* I (khw) am not sure all these restrictions are necessary XXX */
860 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
861 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
862 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
863 && !(and_with->flags & ANYOF_LOC_FOLD)
864 && !(cl->flags & ANYOF_LOC_FOLD)) {
867 if (and_with->flags & ANYOF_INVERT)
868 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
869 cl->bitmap[i] &= ~and_with->bitmap[i];
871 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
872 cl->bitmap[i] &= and_with->bitmap[i];
873 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
875 if (and_with->flags & ANYOF_INVERT) {
877 /* Here, the and'ed node is inverted. Get the AND of the flags that
878 * aren't affected by the inversion. Those that are affected are
879 * handled individually below */
880 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
881 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
882 cl->flags |= affected_flags;
884 /* We currently don't know how to deal with things that aren't in the
885 * bitmap, but we know that the intersection is no greater than what
886 * is already in cl, so let there be false positives that get sorted
887 * out after the synthetic start class succeeds, and the node is
888 * matched for real. */
890 /* The inversion of these two flags indicate that the resulting
891 * intersection doesn't have them */
892 if (and_with->flags & ANYOF_UNICODE_ALL) {
893 cl->flags &= ~ANYOF_UNICODE_ALL;
895 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
896 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
899 else { /* and'd node is not inverted */
900 U8 outside_bitmap_but_not_utf8; /* Temp variable */
902 if (! ANYOF_NONBITMAP(and_with)) {
904 /* Here 'and_with' doesn't match anything outside the bitmap
905 * (except possibly ANYOF_UNICODE_ALL), which means the
906 * intersection can't either, except for ANYOF_UNICODE_ALL, in
907 * which case we don't know what the intersection is, but it's no
908 * greater than what cl already has, so can just leave it alone,
909 * with possible false positives */
910 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
911 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
912 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
915 else if (! ANYOF_NONBITMAP(cl)) {
917 /* Here, 'and_with' does match something outside the bitmap, and cl
918 * doesn't have a list of things to match outside the bitmap. If
919 * cl can match all code points above 255, the intersection will
920 * be those above-255 code points that 'and_with' matches. If cl
921 * can't match all Unicode code points, it means that it can't
922 * match anything outside the bitmap (since the 'if' that got us
923 * into this block tested for that), so we leave the bitmap empty.
925 if (cl->flags & ANYOF_UNICODE_ALL) {
926 ARG_SET(cl, ARG(and_with));
928 /* and_with's ARG may match things that don't require UTF8.
929 * And now cl's will too, in spite of this being an 'and'. See
930 * the comments below about the kludge */
931 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
935 /* Here, both 'and_with' and cl match something outside the
936 * bitmap. Currently we do not do the intersection, so just match
937 * whatever cl had at the beginning. */
941 /* Take the intersection of the two sets of flags. However, the
942 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
943 * kludge around the fact that this flag is not treated like the others
944 * which are initialized in cl_anything(). The way the optimizer works
945 * is that the synthetic start class (SSC) is initialized to match
946 * anything, and then the first time a real node is encountered, its
947 * values are AND'd with the SSC's with the result being the values of
948 * the real node. However, there are paths through the optimizer where
949 * the AND never gets called, so those initialized bits are set
950 * inappropriately, which is not usually a big deal, as they just cause
951 * false positives in the SSC, which will just mean a probably
952 * imperceptible slow down in execution. However this bit has a
953 * higher false positive consequence in that it can cause utf8.pm,
954 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
955 * bigger slowdown and also causes significant extra memory to be used.
956 * In order to prevent this, the code now takes a different tack. The
957 * bit isn't set unless some part of the regular expression needs it,
958 * but once set it won't get cleared. This means that these extra
959 * modules won't get loaded unless there was some path through the
960 * pattern that would have required them anyway, and so any false
961 * positives that occur by not ANDing them out when they could be
962 * aren't as severe as they would be if we treated this bit like all
964 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
965 & ANYOF_NONBITMAP_NON_UTF8;
966 cl->flags &= and_with->flags;
967 cl->flags |= outside_bitmap_but_not_utf8;
971 /* 'OR' a given class with another one. Can create false positives. 'cl'
972 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
973 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
975 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
977 PERL_ARGS_ASSERT_CL_OR;
979 if (or_with->flags & ANYOF_INVERT) {
981 /* Here, the or'd node is to be inverted. This means we take the
982 * complement of everything not in the bitmap, but currently we don't
983 * know what that is, so give up and match anything */
984 if (ANYOF_NONBITMAP(or_with)) {
985 cl_anything(pRExC_state, cl);
988 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
989 * <= (B1 | !B2) | (CL1 | !CL2)
990 * which is wasteful if CL2 is small, but we ignore CL2:
991 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
992 * XXXX Can we handle case-fold? Unclear:
993 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
994 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
996 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
997 && !(or_with->flags & ANYOF_LOC_FOLD)
998 && !(cl->flags & ANYOF_LOC_FOLD) ) {
1001 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1002 cl->bitmap[i] |= ~or_with->bitmap[i];
1003 } /* XXXX: logic is complicated otherwise */
1005 cl_anything(pRExC_state, cl);
1008 /* And, we can just take the union of the flags that aren't affected
1009 * by the inversion */
1010 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1012 /* For the remaining flags:
1013 ANYOF_UNICODE_ALL and inverted means to not match anything above
1014 255, which means that the union with cl should just be
1015 what cl has in it, so can ignore this flag
1016 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1017 is 127-255 to match them, but then invert that, so the
1018 union with cl should just be what cl has in it, so can
1021 } else { /* 'or_with' is not inverted */
1022 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1023 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1024 && (!(or_with->flags & ANYOF_LOC_FOLD)
1025 || (cl->flags & ANYOF_LOC_FOLD)) ) {
1028 /* OR char bitmap and class bitmap separately */
1029 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1030 cl->bitmap[i] |= or_with->bitmap[i];
1031 ANYOF_CLASS_OR(or_with, cl);
1033 else { /* XXXX: logic is complicated, leave it along for a moment. */
1034 cl_anything(pRExC_state, cl);
1037 if (ANYOF_NONBITMAP(or_with)) {
1039 /* Use the added node's outside-the-bit-map match if there isn't a
1040 * conflict. If there is a conflict (both nodes match something
1041 * outside the bitmap, but what they match outside is not the same
1042 * pointer, and hence not easily compared until XXX we extend
1043 * inversion lists this far), give up and allow the start class to
1044 * match everything outside the bitmap. If that stuff is all above
1045 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1046 if (! ANYOF_NONBITMAP(cl)) {
1047 ARG_SET(cl, ARG(or_with));
1049 else if (ARG(cl) != ARG(or_with)) {
1051 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1052 cl_anything(pRExC_state, cl);
1055 cl->flags |= ANYOF_UNICODE_ALL;
1060 /* Take the union */
1061 cl->flags |= or_with->flags;
1065 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1066 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1067 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1068 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1073 dump_trie(trie,widecharmap,revcharmap)
1074 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1075 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1077 These routines dump out a trie in a somewhat readable format.
1078 The _interim_ variants are used for debugging the interim
1079 tables that are used to generate the final compressed
1080 representation which is what dump_trie expects.
1082 Part of the reason for their existence is to provide a form
1083 of documentation as to how the different representations function.
1088 Dumps the final compressed table form of the trie to Perl_debug_log.
1089 Used for debugging make_trie().
1093 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1094 AV *revcharmap, U32 depth)
1097 SV *sv=sv_newmortal();
1098 int colwidth= widecharmap ? 6 : 4;
1100 GET_RE_DEBUG_FLAGS_DECL;
1102 PERL_ARGS_ASSERT_DUMP_TRIE;
1104 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1105 (int)depth * 2 + 2,"",
1106 "Match","Base","Ofs" );
1108 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1109 SV ** const tmp = av_fetch( revcharmap, state, 0);
1111 PerlIO_printf( Perl_debug_log, "%*s",
1113 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1114 PL_colors[0], PL_colors[1],
1115 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1116 PERL_PV_ESCAPE_FIRSTCHAR
1121 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1122 (int)depth * 2 + 2,"");
1124 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1125 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1126 PerlIO_printf( Perl_debug_log, "\n");
1128 for( state = 1 ; state < trie->statecount ; state++ ) {
1129 const U32 base = trie->states[ state ].trans.base;
1131 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1133 if ( trie->states[ state ].wordnum ) {
1134 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1136 PerlIO_printf( Perl_debug_log, "%6s", "" );
1139 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1144 while( ( base + ofs < trie->uniquecharcount ) ||
1145 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1146 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1149 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1151 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1152 if ( ( base + ofs >= trie->uniquecharcount ) &&
1153 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1154 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1156 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1158 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1160 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1164 PerlIO_printf( Perl_debug_log, "]");
1167 PerlIO_printf( Perl_debug_log, "\n" );
1169 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1170 for (word=1; word <= trie->wordcount; word++) {
1171 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1172 (int)word, (int)(trie->wordinfo[word].prev),
1173 (int)(trie->wordinfo[word].len));
1175 PerlIO_printf(Perl_debug_log, "\n" );
1178 Dumps a fully constructed but uncompressed trie in list form.
1179 List tries normally only are used for construction when the number of
1180 possible chars (trie->uniquecharcount) is very high.
1181 Used for debugging make_trie().
1184 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1185 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1189 SV *sv=sv_newmortal();
1190 int colwidth= widecharmap ? 6 : 4;
1191 GET_RE_DEBUG_FLAGS_DECL;
1193 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1195 /* print out the table precompression. */
1196 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1197 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1198 "------:-----+-----------------\n" );
1200 for( state=1 ; state < next_alloc ; state ++ ) {
1203 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1204 (int)depth * 2 + 2,"", (UV)state );
1205 if ( ! trie->states[ state ].wordnum ) {
1206 PerlIO_printf( Perl_debug_log, "%5s| ","");
1208 PerlIO_printf( Perl_debug_log, "W%4x| ",
1209 trie->states[ state ].wordnum
1212 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1213 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1215 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1217 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1218 PL_colors[0], PL_colors[1],
1219 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1220 PERL_PV_ESCAPE_FIRSTCHAR
1222 TRIE_LIST_ITEM(state,charid).forid,
1223 (UV)TRIE_LIST_ITEM(state,charid).newstate
1226 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1227 (int)((depth * 2) + 14), "");
1230 PerlIO_printf( Perl_debug_log, "\n");
1235 Dumps a fully constructed but uncompressed trie in table form.
1236 This is the normal DFA style state transition table, with a few
1237 twists to facilitate compression later.
1238 Used for debugging make_trie().
1241 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1242 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1247 SV *sv=sv_newmortal();
1248 int colwidth= widecharmap ? 6 : 4;
1249 GET_RE_DEBUG_FLAGS_DECL;
1251 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1254 print out the table precompression so that we can do a visual check
1255 that they are identical.
1258 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1260 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1261 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1263 PerlIO_printf( Perl_debug_log, "%*s",
1265 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1266 PL_colors[0], PL_colors[1],
1267 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1268 PERL_PV_ESCAPE_FIRSTCHAR
1274 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1276 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1277 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1280 PerlIO_printf( Perl_debug_log, "\n" );
1282 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1284 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1285 (int)depth * 2 + 2,"",
1286 (UV)TRIE_NODENUM( state ) );
1288 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1289 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1291 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1293 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1295 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1296 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1298 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1299 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1307 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1308 startbranch: the first branch in the whole branch sequence
1309 first : start branch of sequence of branch-exact nodes.
1310 May be the same as startbranch
1311 last : Thing following the last branch.
1312 May be the same as tail.
1313 tail : item following the branch sequence
1314 count : words in the sequence
1315 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1316 depth : indent depth
1318 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1320 A trie is an N'ary tree where the branches are determined by digital
1321 decomposition of the key. IE, at the root node you look up the 1st character and
1322 follow that branch repeat until you find the end of the branches. Nodes can be
1323 marked as "accepting" meaning they represent a complete word. Eg:
1327 would convert into the following structure. Numbers represent states, letters
1328 following numbers represent valid transitions on the letter from that state, if
1329 the number is in square brackets it represents an accepting state, otherwise it
1330 will be in parenthesis.
1332 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1336 (1) +-i->(6)-+-s->[7]
1338 +-s->(3)-+-h->(4)-+-e->[5]
1340 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1342 This shows that when matching against the string 'hers' we will begin at state 1
1343 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1344 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1345 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1346 single traverse. We store a mapping from accepting to state to which word was
1347 matched, and then when we have multiple possibilities we try to complete the
1348 rest of the regex in the order in which they occured in the alternation.
1350 The only prior NFA like behaviour that would be changed by the TRIE support is
1351 the silent ignoring of duplicate alternations which are of the form:
1353 / (DUPE|DUPE) X? (?{ ... }) Y /x
1355 Thus EVAL blocks following a trie may be called a different number of times with
1356 and without the optimisation. With the optimisations dupes will be silently
1357 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1358 the following demonstrates:
1360 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1362 which prints out 'word' three times, but
1364 'words'=~/(word|word|word)(?{ print $1 })S/
1366 which doesnt print it out at all. This is due to other optimisations kicking in.
1368 Example of what happens on a structural level:
1370 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1372 1: CURLYM[1] {1,32767}(18)
1383 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1384 and should turn into:
1386 1: CURLYM[1] {1,32767}(18)
1388 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1396 Cases where tail != last would be like /(?foo|bar)baz/:
1406 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1407 and would end up looking like:
1410 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1417 d = uvuni_to_utf8_flags(d, uv, 0);
1419 is the recommended Unicode-aware way of saying
1424 #define TRIE_STORE_REVCHAR(val) \
1427 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1428 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1429 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1430 SvCUR_set(zlopp, kapow - flrbbbbb); \
1433 av_push(revcharmap, zlopp); \
1435 char ooooff = (char)val; \
1436 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1440 #define TRIE_READ_CHAR STMT_START { \
1443 /* if it is UTF then it is either already folded, or does not need folding */ \
1444 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1446 else if (folder == PL_fold_latin1) { \
1447 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1448 if ( foldlen > 0 ) { \
1449 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1455 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1456 skiplen = UNISKIP(uvc); \
1457 foldlen -= skiplen; \
1458 scan = foldbuf + skiplen; \
1461 /* raw data, will be folded later if needed */ \
1469 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1470 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1471 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1472 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1474 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1475 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1476 TRIE_LIST_CUR( state )++; \
1479 #define TRIE_LIST_NEW(state) STMT_START { \
1480 Newxz( trie->states[ state ].trans.list, \
1481 4, reg_trie_trans_le ); \
1482 TRIE_LIST_CUR( state ) = 1; \
1483 TRIE_LIST_LEN( state ) = 4; \
1486 #define TRIE_HANDLE_WORD(state) STMT_START { \
1487 U16 dupe= trie->states[ state ].wordnum; \
1488 regnode * const noper_next = regnext( noper ); \
1491 /* store the word for dumping */ \
1493 if (OP(noper) != NOTHING) \
1494 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1496 tmp = newSVpvn_utf8( "", 0, UTF ); \
1497 av_push( trie_words, tmp ); \
1501 trie->wordinfo[curword].prev = 0; \
1502 trie->wordinfo[curword].len = wordlen; \
1503 trie->wordinfo[curword].accept = state; \
1505 if ( noper_next < tail ) { \
1507 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1508 trie->jump[curword] = (U16)(noper_next - convert); \
1510 jumper = noper_next; \
1512 nextbranch= regnext(cur); \
1516 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1517 /* chain, so that when the bits of chain are later */\
1518 /* linked together, the dups appear in the chain */\
1519 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1520 trie->wordinfo[dupe].prev = curword; \
1522 /* we haven't inserted this word yet. */ \
1523 trie->states[ state ].wordnum = curword; \
1528 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1529 ( ( base + charid >= ucharcount \
1530 && base + charid < ubound \
1531 && state == trie->trans[ base - ucharcount + charid ].check \
1532 && trie->trans[ base - ucharcount + charid ].next ) \
1533 ? trie->trans[ base - ucharcount + charid ].next \
1534 : ( state==1 ? special : 0 ) \
1538 #define MADE_JUMP_TRIE 2
1539 #define MADE_EXACT_TRIE 4
1542 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1545 /* first pass, loop through and scan words */
1546 reg_trie_data *trie;
1547 HV *widecharmap = NULL;
1548 AV *revcharmap = newAV();
1550 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1555 regnode *jumper = NULL;
1556 regnode *nextbranch = NULL;
1557 regnode *convert = NULL;
1558 U32 *prev_states; /* temp array mapping each state to previous one */
1559 /* we just use folder as a flag in utf8 */
1560 const U8 * folder = NULL;
1563 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1564 AV *trie_words = NULL;
1565 /* along with revcharmap, this only used during construction but both are
1566 * useful during debugging so we store them in the struct when debugging.
1569 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1570 STRLEN trie_charcount=0;
1572 SV *re_trie_maxbuff;
1573 GET_RE_DEBUG_FLAGS_DECL;
1575 PERL_ARGS_ASSERT_MAKE_TRIE;
1577 PERL_UNUSED_ARG(depth);
1584 case EXACTFU_TRICKYFOLD:
1585 case EXACTFU: folder = PL_fold_latin1; break;
1586 case EXACTF: folder = PL_fold; break;
1587 case EXACTFL: folder = PL_fold_locale; break;
1588 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1591 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1593 trie->startstate = 1;
1594 trie->wordcount = word_count;
1595 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1596 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1598 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1599 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1600 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1603 trie_words = newAV();
1606 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1607 if (!SvIOK(re_trie_maxbuff)) {
1608 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1610 DEBUG_TRIE_COMPILE_r({
1611 PerlIO_printf( Perl_debug_log,
1612 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1613 (int)depth * 2 + 2, "",
1614 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1615 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1619 /* Find the node we are going to overwrite */
1620 if ( first == startbranch && OP( last ) != BRANCH ) {
1621 /* whole branch chain */
1624 /* branch sub-chain */
1625 convert = NEXTOPER( first );
1628 /* -- First loop and Setup --
1630 We first traverse the branches and scan each word to determine if it
1631 contains widechars, and how many unique chars there are, this is
1632 important as we have to build a table with at least as many columns as we
1635 We use an array of integers to represent the character codes 0..255
1636 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1637 native representation of the character value as the key and IV's for the
1640 *TODO* If we keep track of how many times each character is used we can
1641 remap the columns so that the table compression later on is more
1642 efficient in terms of memory by ensuring the most common value is in the
1643 middle and the least common are on the outside. IMO this would be better
1644 than a most to least common mapping as theres a decent chance the most
1645 common letter will share a node with the least common, meaning the node
1646 will not be compressible. With a middle is most common approach the worst
1647 case is when we have the least common nodes twice.
1651 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1652 regnode *noper = NEXTOPER( cur );
1653 const U8 *uc = (U8*)STRING( noper );
1654 const U8 *e = uc + STR_LEN( noper );
1656 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1658 const U8 *scan = (U8*)NULL;
1659 U32 wordlen = 0; /* required init */
1661 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1663 if (OP(noper) == NOTHING) {
1664 regnode *noper_next= regnext(noper);
1665 if (noper_next != tail && OP(noper_next) == flags) {
1667 uc= (U8*)STRING(noper);
1668 e= uc + STR_LEN(noper);
1669 trie->minlen= STR_LEN(noper);
1676 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1677 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1678 regardless of encoding */
1679 if (OP( noper ) == EXACTFU_SS) {
1680 /* false positives are ok, so just set this */
1681 TRIE_BITMAP_SET(trie,0xDF);
1684 for ( ; uc < e ; uc += len ) {
1685 TRIE_CHARCOUNT(trie)++;
1690 U8 folded= folder[ (U8) uvc ];
1691 if ( !trie->charmap[ folded ] ) {
1692 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1693 TRIE_STORE_REVCHAR( folded );
1696 if ( !trie->charmap[ uvc ] ) {
1697 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1698 TRIE_STORE_REVCHAR( uvc );
1701 /* store the codepoint in the bitmap, and its folded
1703 TRIE_BITMAP_SET(trie, uvc);
1705 /* store the folded codepoint */
1706 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1709 /* store first byte of utf8 representation of
1710 variant codepoints */
1711 if (! UNI_IS_INVARIANT(uvc)) {
1712 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1715 set_bit = 0; /* We've done our bit :-) */
1720 widecharmap = newHV();
1722 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1725 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1727 if ( !SvTRUE( *svpp ) ) {
1728 sv_setiv( *svpp, ++trie->uniquecharcount );
1729 TRIE_STORE_REVCHAR(uvc);
1733 if( cur == first ) {
1734 trie->minlen = chars;
1735 trie->maxlen = chars;
1736 } else if (chars < trie->minlen) {
1737 trie->minlen = chars;
1738 } else if (chars > trie->maxlen) {
1739 trie->maxlen = chars;
1741 if (OP( noper ) == EXACTFU_SS) {
1742 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1743 if (trie->minlen > 1)
1746 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1747 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1748 * - We assume that any such sequence might match a 2 byte string */
1749 if (trie->minlen > 2 )
1753 } /* end first pass */
1754 DEBUG_TRIE_COMPILE_r(
1755 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1756 (int)depth * 2 + 2,"",
1757 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1758 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1759 (int)trie->minlen, (int)trie->maxlen )
1763 We now know what we are dealing with in terms of unique chars and
1764 string sizes so we can calculate how much memory a naive
1765 representation using a flat table will take. If it's over a reasonable
1766 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1767 conservative but potentially much slower representation using an array
1770 At the end we convert both representations into the same compressed
1771 form that will be used in regexec.c for matching with. The latter
1772 is a form that cannot be used to construct with but has memory
1773 properties similar to the list form and access properties similar
1774 to the table form making it both suitable for fast searches and
1775 small enough that its feasable to store for the duration of a program.
1777 See the comment in the code where the compressed table is produced
1778 inplace from the flat tabe representation for an explanation of how
1779 the compression works.
1784 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1787 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1789 Second Pass -- Array Of Lists Representation
1791 Each state will be represented by a list of charid:state records
1792 (reg_trie_trans_le) the first such element holds the CUR and LEN
1793 points of the allocated array. (See defines above).
1795 We build the initial structure using the lists, and then convert
1796 it into the compressed table form which allows faster lookups
1797 (but cant be modified once converted).
1800 STRLEN transcount = 1;
1802 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1803 "%*sCompiling trie using list compiler\n",
1804 (int)depth * 2 + 2, ""));
1806 trie->states = (reg_trie_state *)
1807 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1808 sizeof(reg_trie_state) );
1812 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1814 regnode *noper = NEXTOPER( cur );
1815 U8 *uc = (U8*)STRING( noper );
1816 const U8 *e = uc + STR_LEN( noper );
1817 U32 state = 1; /* required init */
1818 U16 charid = 0; /* sanity init */
1819 U8 *scan = (U8*)NULL; /* sanity init */
1820 STRLEN foldlen = 0; /* required init */
1821 U32 wordlen = 0; /* required init */
1822 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1825 if (OP(noper) == NOTHING) {
1826 regnode *noper_next= regnext(noper);
1827 if (noper_next != tail && OP(noper_next) == flags) {
1829 uc= (U8*)STRING(noper);
1830 e= uc + STR_LEN(noper);
1834 if (OP(noper) != NOTHING) {
1835 for ( ; uc < e ; uc += len ) {
1840 charid = trie->charmap[ uvc ];
1842 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1846 charid=(U16)SvIV( *svpp );
1849 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1856 if ( !trie->states[ state ].trans.list ) {
1857 TRIE_LIST_NEW( state );
1859 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1860 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1861 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1866 newstate = next_alloc++;
1867 prev_states[newstate] = state;
1868 TRIE_LIST_PUSH( state, charid, newstate );
1873 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1877 TRIE_HANDLE_WORD(state);
1879 } /* end second pass */
1881 /* next alloc is the NEXT state to be allocated */
1882 trie->statecount = next_alloc;
1883 trie->states = (reg_trie_state *)
1884 PerlMemShared_realloc( trie->states,
1886 * sizeof(reg_trie_state) );
1888 /* and now dump it out before we compress it */
1889 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1890 revcharmap, next_alloc,
1894 trie->trans = (reg_trie_trans *)
1895 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1902 for( state=1 ; state < next_alloc ; state ++ ) {
1906 DEBUG_TRIE_COMPILE_MORE_r(
1907 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1911 if (trie->states[state].trans.list) {
1912 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1916 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1917 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1918 if ( forid < minid ) {
1920 } else if ( forid > maxid ) {
1924 if ( transcount < tp + maxid - minid + 1) {
1926 trie->trans = (reg_trie_trans *)
1927 PerlMemShared_realloc( trie->trans,
1929 * sizeof(reg_trie_trans) );
1930 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1932 base = trie->uniquecharcount + tp - minid;
1933 if ( maxid == minid ) {
1935 for ( ; zp < tp ; zp++ ) {
1936 if ( ! trie->trans[ zp ].next ) {
1937 base = trie->uniquecharcount + zp - minid;
1938 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1939 trie->trans[ zp ].check = state;
1945 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1946 trie->trans[ tp ].check = state;
1951 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1952 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1953 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1954 trie->trans[ tid ].check = state;
1956 tp += ( maxid - minid + 1 );
1958 Safefree(trie->states[ state ].trans.list);
1961 DEBUG_TRIE_COMPILE_MORE_r(
1962 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1965 trie->states[ state ].trans.base=base;
1967 trie->lasttrans = tp + 1;
1971 Second Pass -- Flat Table Representation.
1973 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1974 We know that we will need Charcount+1 trans at most to store the data
1975 (one row per char at worst case) So we preallocate both structures
1976 assuming worst case.
1978 We then construct the trie using only the .next slots of the entry
1981 We use the .check field of the first entry of the node temporarily to
1982 make compression both faster and easier by keeping track of how many non
1983 zero fields are in the node.
1985 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1988 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1989 number representing the first entry of the node, and state as a
1990 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1991 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1992 are 2 entrys per node. eg:
2000 The table is internally in the right hand, idx form. However as we also
2001 have to deal with the states array which is indexed by nodenum we have to
2002 use TRIE_NODENUM() to convert.
2005 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2006 "%*sCompiling trie using table compiler\n",
2007 (int)depth * 2 + 2, ""));
2009 trie->trans = (reg_trie_trans *)
2010 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2011 * trie->uniquecharcount + 1,
2012 sizeof(reg_trie_trans) );
2013 trie->states = (reg_trie_state *)
2014 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2015 sizeof(reg_trie_state) );
2016 next_alloc = trie->uniquecharcount + 1;
2019 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2021 regnode *noper = NEXTOPER( cur );
2022 const U8 *uc = (U8*)STRING( noper );
2023 const U8 *e = uc + STR_LEN( noper );
2025 U32 state = 1; /* required init */
2027 U16 charid = 0; /* sanity init */
2028 U32 accept_state = 0; /* sanity init */
2029 U8 *scan = (U8*)NULL; /* sanity init */
2031 STRLEN foldlen = 0; /* required init */
2032 U32 wordlen = 0; /* required init */
2034 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2036 if (OP(noper) == NOTHING) {
2037 regnode *noper_next= regnext(noper);
2038 if (noper_next != tail && OP(noper_next) == flags) {
2040 uc= (U8*)STRING(noper);
2041 e= uc + STR_LEN(noper);
2045 if ( OP(noper) != NOTHING ) {
2046 for ( ; uc < e ; uc += len ) {
2051 charid = trie->charmap[ uvc ];
2053 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2054 charid = svpp ? (U16)SvIV(*svpp) : 0;
2058 if ( !trie->trans[ state + charid ].next ) {
2059 trie->trans[ state + charid ].next = next_alloc;
2060 trie->trans[ state ].check++;
2061 prev_states[TRIE_NODENUM(next_alloc)]
2062 = TRIE_NODENUM(state);
2063 next_alloc += trie->uniquecharcount;
2065 state = trie->trans[ state + charid ].next;
2067 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2069 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2072 accept_state = TRIE_NODENUM( state );
2073 TRIE_HANDLE_WORD(accept_state);
2075 } /* end second pass */
2077 /* and now dump it out before we compress it */
2078 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2080 next_alloc, depth+1));
2084 * Inplace compress the table.*
2086 For sparse data sets the table constructed by the trie algorithm will
2087 be mostly 0/FAIL transitions or to put it another way mostly empty.
2088 (Note that leaf nodes will not contain any transitions.)
2090 This algorithm compresses the tables by eliminating most such
2091 transitions, at the cost of a modest bit of extra work during lookup:
2093 - Each states[] entry contains a .base field which indicates the
2094 index in the state[] array wheres its transition data is stored.
2096 - If .base is 0 there are no valid transitions from that node.
2098 - If .base is nonzero then charid is added to it to find an entry in
2101 -If trans[states[state].base+charid].check!=state then the
2102 transition is taken to be a 0/Fail transition. Thus if there are fail
2103 transitions at the front of the node then the .base offset will point
2104 somewhere inside the previous nodes data (or maybe even into a node
2105 even earlier), but the .check field determines if the transition is
2109 The following process inplace converts the table to the compressed
2110 table: We first do not compress the root node 1,and mark all its
2111 .check pointers as 1 and set its .base pointer as 1 as well. This
2112 allows us to do a DFA construction from the compressed table later,
2113 and ensures that any .base pointers we calculate later are greater
2116 - We set 'pos' to indicate the first entry of the second node.
2118 - We then iterate over the columns of the node, finding the first and
2119 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2120 and set the .check pointers accordingly, and advance pos
2121 appropriately and repreat for the next node. Note that when we copy
2122 the next pointers we have to convert them from the original
2123 NODEIDX form to NODENUM form as the former is not valid post
2126 - If a node has no transitions used we mark its base as 0 and do not
2127 advance the pos pointer.
2129 - If a node only has one transition we use a second pointer into the
2130 structure to fill in allocated fail transitions from other states.
2131 This pointer is independent of the main pointer and scans forward
2132 looking for null transitions that are allocated to a state. When it
2133 finds one it writes the single transition into the "hole". If the
2134 pointer doesnt find one the single transition is appended as normal.
2136 - Once compressed we can Renew/realloc the structures to release the
2139 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2140 specifically Fig 3.47 and the associated pseudocode.
2144 const U32 laststate = TRIE_NODENUM( next_alloc );
2147 trie->statecount = laststate;
2149 for ( state = 1 ; state < laststate ; state++ ) {
2151 const U32 stateidx = TRIE_NODEIDX( state );
2152 const U32 o_used = trie->trans[ stateidx ].check;
2153 U32 used = trie->trans[ stateidx ].check;
2154 trie->trans[ stateidx ].check = 0;
2156 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2157 if ( flag || trie->trans[ stateidx + charid ].next ) {
2158 if ( trie->trans[ stateidx + charid ].next ) {
2160 for ( ; zp < pos ; zp++ ) {
2161 if ( ! trie->trans[ zp ].next ) {
2165 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2166 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2167 trie->trans[ zp ].check = state;
2168 if ( ++zp > pos ) pos = zp;
2175 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2177 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2178 trie->trans[ pos ].check = state;
2183 trie->lasttrans = pos + 1;
2184 trie->states = (reg_trie_state *)
2185 PerlMemShared_realloc( trie->states, laststate
2186 * sizeof(reg_trie_state) );
2187 DEBUG_TRIE_COMPILE_MORE_r(
2188 PerlIO_printf( Perl_debug_log,
2189 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2190 (int)depth * 2 + 2,"",
2191 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2194 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2197 } /* end table compress */
2199 DEBUG_TRIE_COMPILE_MORE_r(
2200 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2201 (int)depth * 2 + 2, "",
2202 (UV)trie->statecount,
2203 (UV)trie->lasttrans)
2205 /* resize the trans array to remove unused space */
2206 trie->trans = (reg_trie_trans *)
2207 PerlMemShared_realloc( trie->trans, trie->lasttrans
2208 * sizeof(reg_trie_trans) );
2210 { /* Modify the program and insert the new TRIE node */
2211 U8 nodetype =(U8)(flags & 0xFF);
2215 regnode *optimize = NULL;
2216 #ifdef RE_TRACK_PATTERN_OFFSETS
2219 U32 mjd_nodelen = 0;
2220 #endif /* RE_TRACK_PATTERN_OFFSETS */
2221 #endif /* DEBUGGING */
2223 This means we convert either the first branch or the first Exact,
2224 depending on whether the thing following (in 'last') is a branch
2225 or not and whther first is the startbranch (ie is it a sub part of
2226 the alternation or is it the whole thing.)
2227 Assuming its a sub part we convert the EXACT otherwise we convert
2228 the whole branch sequence, including the first.
2230 /* Find the node we are going to overwrite */
2231 if ( first != startbranch || OP( last ) == BRANCH ) {
2232 /* branch sub-chain */
2233 NEXT_OFF( first ) = (U16)(last - first);
2234 #ifdef RE_TRACK_PATTERN_OFFSETS
2236 mjd_offset= Node_Offset((convert));
2237 mjd_nodelen= Node_Length((convert));
2240 /* whole branch chain */
2242 #ifdef RE_TRACK_PATTERN_OFFSETS
2245 const regnode *nop = NEXTOPER( convert );
2246 mjd_offset= Node_Offset((nop));
2247 mjd_nodelen= Node_Length((nop));
2251 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2252 (int)depth * 2 + 2, "",
2253 (UV)mjd_offset, (UV)mjd_nodelen)
2256 /* But first we check to see if there is a common prefix we can
2257 split out as an EXACT and put in front of the TRIE node. */
2258 trie->startstate= 1;
2259 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2261 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2265 const U32 base = trie->states[ state ].trans.base;
2267 if ( trie->states[state].wordnum )
2270 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2271 if ( ( base + ofs >= trie->uniquecharcount ) &&
2272 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2273 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2275 if ( ++count > 1 ) {
2276 SV **tmp = av_fetch( revcharmap, ofs, 0);
2277 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2278 if ( state == 1 ) break;
2280 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2282 PerlIO_printf(Perl_debug_log,
2283 "%*sNew Start State=%"UVuf" Class: [",
2284 (int)depth * 2 + 2, "",
2287 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2288 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2290 TRIE_BITMAP_SET(trie,*ch);
2292 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2294 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2298 TRIE_BITMAP_SET(trie,*ch);
2300 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2301 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2307 SV **tmp = av_fetch( revcharmap, idx, 0);
2309 char *ch = SvPV( *tmp, len );
2311 SV *sv=sv_newmortal();
2312 PerlIO_printf( Perl_debug_log,
2313 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2314 (int)depth * 2 + 2, "",
2316 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2317 PL_colors[0], PL_colors[1],
2318 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2319 PERL_PV_ESCAPE_FIRSTCHAR
2324 OP( convert ) = nodetype;
2325 str=STRING(convert);
2328 STR_LEN(convert) += len;
2334 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2339 trie->prefixlen = (state-1);
2341 regnode *n = convert+NODE_SZ_STR(convert);
2342 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2343 trie->startstate = state;
2344 trie->minlen -= (state - 1);
2345 trie->maxlen -= (state - 1);
2347 /* At least the UNICOS C compiler choked on this
2348 * being argument to DEBUG_r(), so let's just have
2351 #ifdef PERL_EXT_RE_BUILD
2357 regnode *fix = convert;
2358 U32 word = trie->wordcount;
2360 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2361 while( ++fix < n ) {
2362 Set_Node_Offset_Length(fix, 0, 0);
2365 SV ** const tmp = av_fetch( trie_words, word, 0 );
2367 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2368 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2370 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2378 NEXT_OFF(convert) = (U16)(tail - convert);
2379 DEBUG_r(optimize= n);
2385 if ( trie->maxlen ) {
2386 NEXT_OFF( convert ) = (U16)(tail - convert);
2387 ARG_SET( convert, data_slot );
2388 /* Store the offset to the first unabsorbed branch in
2389 jump[0], which is otherwise unused by the jump logic.
2390 We use this when dumping a trie and during optimisation. */
2392 trie->jump[0] = (U16)(nextbranch - convert);
2394 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2395 * and there is a bitmap
2396 * and the first "jump target" node we found leaves enough room
2397 * then convert the TRIE node into a TRIEC node, with the bitmap
2398 * embedded inline in the opcode - this is hypothetically faster.
2400 if ( !trie->states[trie->startstate].wordnum
2402 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2404 OP( convert ) = TRIEC;
2405 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2406 PerlMemShared_free(trie->bitmap);
2409 OP( convert ) = TRIE;
2411 /* store the type in the flags */
2412 convert->flags = nodetype;
2416 + regarglen[ OP( convert ) ];
2418 /* XXX We really should free up the resource in trie now,
2419 as we won't use them - (which resources?) dmq */
2421 /* needed for dumping*/
2422 DEBUG_r(if (optimize) {
2423 regnode *opt = convert;
2425 while ( ++opt < optimize) {
2426 Set_Node_Offset_Length(opt,0,0);
2429 Try to clean up some of the debris left after the
2432 while( optimize < jumper ) {
2433 mjd_nodelen += Node_Length((optimize));
2434 OP( optimize ) = OPTIMIZED;
2435 Set_Node_Offset_Length(optimize,0,0);
2438 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2440 } /* end node insert */
2442 /* Finish populating the prev field of the wordinfo array. Walk back
2443 * from each accept state until we find another accept state, and if
2444 * so, point the first word's .prev field at the second word. If the
2445 * second already has a .prev field set, stop now. This will be the
2446 * case either if we've already processed that word's accept state,
2447 * or that state had multiple words, and the overspill words were
2448 * already linked up earlier.
2455 for (word=1; word <= trie->wordcount; word++) {
2457 if (trie->wordinfo[word].prev)
2459 state = trie->wordinfo[word].accept;
2461 state = prev_states[state];
2464 prev = trie->states[state].wordnum;
2468 trie->wordinfo[word].prev = prev;
2470 Safefree(prev_states);
2474 /* and now dump out the compressed format */
2475 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2477 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2479 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2480 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2482 SvREFCNT_dec_NN(revcharmap);
2486 : trie->startstate>1
2492 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2494 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2496 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2497 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2500 We find the fail state for each state in the trie, this state is the longest proper
2501 suffix of the current state's 'word' that is also a proper prefix of another word in our
2502 trie. State 1 represents the word '' and is thus the default fail state. This allows
2503 the DFA not to have to restart after its tried and failed a word at a given point, it
2504 simply continues as though it had been matching the other word in the first place.
2506 'abcdgu'=~/abcdefg|cdgu/
2507 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2508 fail, which would bring us to the state representing 'd' in the second word where we would
2509 try 'g' and succeed, proceeding to match 'cdgu'.
2511 /* add a fail transition */
2512 const U32 trie_offset = ARG(source);
2513 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2515 const U32 ucharcount = trie->uniquecharcount;
2516 const U32 numstates = trie->statecount;
2517 const U32 ubound = trie->lasttrans + ucharcount;
2521 U32 base = trie->states[ 1 ].trans.base;
2524 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2525 GET_RE_DEBUG_FLAGS_DECL;
2527 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2529 PERL_UNUSED_ARG(depth);
2533 ARG_SET( stclass, data_slot );
2534 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2535 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2536 aho->trie=trie_offset;
2537 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2538 Copy( trie->states, aho->states, numstates, reg_trie_state );
2539 Newxz( q, numstates, U32);
2540 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2543 /* initialize fail[0..1] to be 1 so that we always have
2544 a valid final fail state */
2545 fail[ 0 ] = fail[ 1 ] = 1;
2547 for ( charid = 0; charid < ucharcount ; charid++ ) {
2548 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2550 q[ q_write ] = newstate;
2551 /* set to point at the root */
2552 fail[ q[ q_write++ ] ]=1;
2555 while ( q_read < q_write) {
2556 const U32 cur = q[ q_read++ % numstates ];
2557 base = trie->states[ cur ].trans.base;
2559 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2560 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2562 U32 fail_state = cur;
2565 fail_state = fail[ fail_state ];
2566 fail_base = aho->states[ fail_state ].trans.base;
2567 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2569 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2570 fail[ ch_state ] = fail_state;
2571 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2573 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2575 q[ q_write++ % numstates] = ch_state;
2579 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2580 when we fail in state 1, this allows us to use the
2581 charclass scan to find a valid start char. This is based on the principle
2582 that theres a good chance the string being searched contains lots of stuff
2583 that cant be a start char.
2585 fail[ 0 ] = fail[ 1 ] = 0;
2586 DEBUG_TRIE_COMPILE_r({
2587 PerlIO_printf(Perl_debug_log,
2588 "%*sStclass Failtable (%"UVuf" states): 0",
2589 (int)(depth * 2), "", (UV)numstates
2591 for( q_read=1; q_read<numstates; q_read++ ) {
2592 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2594 PerlIO_printf(Perl_debug_log, "\n");
2597 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2602 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2603 * These need to be revisited when a newer toolchain becomes available.
2605 #if defined(__sparc64__) && defined(__GNUC__)
2606 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2607 # undef SPARC64_GCC_WORKAROUND
2608 # define SPARC64_GCC_WORKAROUND 1
2612 #define DEBUG_PEEP(str,scan,depth) \
2613 DEBUG_OPTIMISE_r({if (scan){ \
2614 SV * const mysv=sv_newmortal(); \
2615 regnode *Next = regnext(scan); \
2616 regprop(RExC_rx, mysv, scan); \
2617 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2618 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2619 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2623 /* The below joins as many adjacent EXACTish nodes as possible into a single
2624 * one. The regop may be changed if the node(s) contain certain sequences that
2625 * require special handling. The joining is only done if:
2626 * 1) there is room in the current conglomerated node to entirely contain the
2628 * 2) they are the exact same node type
2630 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2631 * these get optimized out
2633 * If a node is to match under /i (folded), the number of characters it matches
2634 * can be different than its character length if it contains a multi-character
2635 * fold. *min_subtract is set to the total delta of the input nodes.
2637 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2638 * and contains LATIN SMALL LETTER SHARP S
2640 * This is as good a place as any to discuss the design of handling these
2641 * multi-character fold sequences. It's been wrong in Perl for a very long
2642 * time. There are three code points in Unicode whose multi-character folds
2643 * were long ago discovered to mess things up. The previous designs for
2644 * dealing with these involved assigning a special node for them. This
2645 * approach doesn't work, as evidenced by this example:
2646 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2647 * Both these fold to "sss", but if the pattern is parsed to create a node that
2648 * would match just the \xDF, it won't be able to handle the case where a
2649 * successful match would have to cross the node's boundary. The new approach
2650 * that hopefully generally solves the problem generates an EXACTFU_SS node
2653 * It turns out that there are problems with all multi-character folds, and not
2654 * just these three. Now the code is general, for all such cases, but the
2655 * three still have some special handling. The approach taken is:
2656 * 1) This routine examines each EXACTFish node that could contain multi-
2657 * character fold sequences. It returns in *min_subtract how much to
2658 * subtract from the the actual length of the string to get a real minimum
2659 * match length; it is 0 if there are no multi-char folds. This delta is
2660 * used by the caller to adjust the min length of the match, and the delta
2661 * between min and max, so that the optimizer doesn't reject these
2662 * possibilities based on size constraints.
2663 * 2) Certain of these sequences require special handling by the trie code,
2664 * so, if found, this code changes the joined node type to special ops:
2665 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2666 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2667 * is used for an EXACTFU node that contains at least one "ss" sequence in
2668 * it. For non-UTF-8 patterns and strings, this is the only case where
2669 * there is a possible fold length change. That means that a regular
2670 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2671 * with length changes, and so can be processed faster. regexec.c takes
2672 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2673 * pre-folded by regcomp.c. This saves effort in regex matching.
2674 * However, the pre-folding isn't done for non-UTF8 patterns because the
2675 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2676 * down by forcing the pattern into UTF8 unless necessary. Also what
2677 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2678 * possibilities for the non-UTF8 patterns are quite simple, except for
2679 * the sharp s. All the ones that don't involve a UTF-8 target string are
2680 * members of a fold-pair, and arrays are set up for all of them so that
2681 * the other member of the pair can be found quickly. Code elsewhere in
2682 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2683 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2684 * described in the next item.
2685 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2686 * 'ss' or not is not knowable at compile time. It will match iff the
2687 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2688 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2689 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2690 * described in item 3). An assumption that the optimizer part of
2691 * regexec.c (probably unwittingly) makes is that a character in the
2692 * pattern corresponds to at most a single character in the target string.
2693 * (And I do mean character, and not byte here, unlike other parts of the
2694 * documentation that have never been updated to account for multibyte
2695 * Unicode.) This assumption is wrong only in this case, as all other
2696 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2697 * virtue of having this file pre-fold UTF-8 patterns. I'm
2698 * reluctant to try to change this assumption, so instead the code punts.
2699 * This routine examines EXACTF nodes for the sharp s, and returns a
2700 * boolean indicating whether or not the node is an EXACTF node that
2701 * contains a sharp s. When it is true, the caller sets a flag that later
2702 * causes the optimizer in this file to not set values for the floating
2703 * and fixed string lengths, and thus avoids the optimizer code in
2704 * regexec.c that makes the invalid assumption. Thus, there is no
2705 * optimization based on string lengths for EXACTF nodes that contain the
2706 * sharp s. This only happens for /id rules (which means the pattern
2710 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2711 if (PL_regkind[OP(scan)] == EXACT) \
2712 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2715 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2716 /* Merge several consecutive EXACTish nodes into one. */
2717 regnode *n = regnext(scan);
2719 regnode *next = scan + NODE_SZ_STR(scan);
2723 regnode *stop = scan;
2724 GET_RE_DEBUG_FLAGS_DECL;
2726 PERL_UNUSED_ARG(depth);
2729 PERL_ARGS_ASSERT_JOIN_EXACT;
2730 #ifndef EXPERIMENTAL_INPLACESCAN
2731 PERL_UNUSED_ARG(flags);
2732 PERL_UNUSED_ARG(val);
2734 DEBUG_PEEP("join",scan,depth);
2736 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2737 * EXACT ones that are mergeable to the current one. */
2739 && (PL_regkind[OP(n)] == NOTHING
2740 || (stringok && OP(n) == OP(scan)))
2742 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2745 if (OP(n) == TAIL || n > next)
2747 if (PL_regkind[OP(n)] == NOTHING) {
2748 DEBUG_PEEP("skip:",n,depth);
2749 NEXT_OFF(scan) += NEXT_OFF(n);
2750 next = n + NODE_STEP_REGNODE;
2757 else if (stringok) {
2758 const unsigned int oldl = STR_LEN(scan);
2759 regnode * const nnext = regnext(n);
2761 /* XXX I (khw) kind of doubt that this works on platforms where
2762 * U8_MAX is above 255 because of lots of other assumptions */
2763 /* Don't join if the sum can't fit into a single node */
2764 if (oldl + STR_LEN(n) > U8_MAX)
2767 DEBUG_PEEP("merg",n,depth);
2770 NEXT_OFF(scan) += NEXT_OFF(n);
2771 STR_LEN(scan) += STR_LEN(n);
2772 next = n + NODE_SZ_STR(n);
2773 /* Now we can overwrite *n : */
2774 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2782 #ifdef EXPERIMENTAL_INPLACESCAN
2783 if (flags && !NEXT_OFF(n)) {
2784 DEBUG_PEEP("atch", val, depth);
2785 if (reg_off_by_arg[OP(n)]) {
2786 ARG_SET(n, val - n);
2789 NEXT_OFF(n) = val - n;
2797 *has_exactf_sharp_s = FALSE;
2799 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2800 * can now analyze for sequences of problematic code points. (Prior to
2801 * this final joining, sequences could have been split over boundaries, and
2802 * hence missed). The sequences only happen in folding, hence for any
2803 * non-EXACT EXACTish node */
2804 if (OP(scan) != EXACT) {
2805 const U8 * const s0 = (U8*) STRING(scan);
2807 const U8 * const s_end = s0 + STR_LEN(scan);
2809 /* One pass is made over the node's string looking for all the
2810 * possibilities. to avoid some tests in the loop, there are two main
2811 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2815 /* Examine the string for a multi-character fold sequence. UTF-8
2816 * patterns have all characters pre-folded by the time this code is
2818 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2819 length sequence we are looking for is 2 */
2822 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2823 if (! len) { /* Not a multi-char fold: get next char */
2828 /* Nodes with 'ss' require special handling, except for EXACTFL
2829 * and EXACTFA for which there is no multi-char fold to this */
2830 if (len == 2 && *s == 's' && *(s+1) == 's'
2831 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2834 OP(scan) = EXACTFU_SS;
2837 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2838 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2839 COMBINING_DIAERESIS_UTF8
2840 COMBINING_ACUTE_ACCENT_UTF8,
2842 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2843 COMBINING_DIAERESIS_UTF8
2844 COMBINING_ACUTE_ACCENT_UTF8,
2849 /* These two folds require special handling by trie's, so
2850 * change the node type to indicate this. If EXACTFA and
2851 * EXACTFL were ever to be handled by trie's, this would
2852 * have to be changed. If this node has already been
2853 * changed to EXACTFU_SS in this loop, leave it as is. (I
2854 * (khw) think it doesn't matter in regexec.c for UTF
2855 * patterns, but no need to change it */
2856 if (OP(scan) == EXACTFU) {
2857 OP(scan) = EXACTFU_TRICKYFOLD;
2861 else { /* Here is a generic multi-char fold. */
2862 const U8* multi_end = s + len;
2864 /* Count how many characters in it. In the case of /l and
2865 * /aa, no folds which contain ASCII code points are
2866 * allowed, so check for those, and skip if found. (In
2867 * EXACTFL, no folds are allowed to any Latin1 code point,
2868 * not just ASCII. But there aren't any of these
2869 * currently, nor ever likely, so don't take the time to
2870 * test for them. The code that generates the
2871 * is_MULTI_foo() macros croaks should one actually get put
2872 * into Unicode .) */
2873 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2874 count = utf8_length(s, multi_end);
2878 while (s < multi_end) {
2881 goto next_iteration;
2891 /* The delta is how long the sequence is minus 1 (1 is how long
2892 * the character that folds to the sequence is) */
2893 *min_subtract += count - 1;
2897 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2899 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2900 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2901 * nodes can't have multi-char folds to this range (and there are
2902 * no existing ones in the upper latin1 range). In the EXACTF
2903 * case we look also for the sharp s, which can be in the final
2904 * position. Otherwise we can stop looking 1 byte earlier because
2905 * have to find at least two characters for a multi-fold */
2906 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2908 /* The below is perhaps overboard, but this allows us to save a
2909 * test each time through the loop at the expense of a mask. This
2910 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2911 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2912 * are 64. This uses an exclusive 'or' to find that bit and then
2913 * inverts it to form a mask, with just a single 0, in the bit
2914 * position where 'S' and 's' differ. */
2915 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2916 const U8 s_masked = 's' & S_or_s_mask;
2919 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2920 if (! len) { /* Not a multi-char fold. */
2921 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2923 *has_exactf_sharp_s = TRUE;
2930 && ((*s & S_or_s_mask) == s_masked)
2931 && ((*(s+1) & S_or_s_mask) == s_masked))
2934 /* EXACTF nodes need to know that the minimum length
2935 * changed so that a sharp s in the string can match this
2936 * ss in the pattern, but they remain EXACTF nodes, as they
2937 * won't match this unless the target string is is UTF-8,
2938 * which we don't know until runtime */
2939 if (OP(scan) != EXACTF) {
2940 OP(scan) = EXACTFU_SS;
2944 *min_subtract += len - 1;
2951 /* Allow dumping but overwriting the collection of skipped
2952 * ops and/or strings with fake optimized ops */
2953 n = scan + NODE_SZ_STR(scan);
2961 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2965 /* REx optimizer. Converts nodes into quicker variants "in place".
2966 Finds fixed substrings. */
2968 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2969 to the position after last scanned or to NULL. */
2971 #define INIT_AND_WITHP \
2972 assert(!and_withp); \
2973 Newx(and_withp,1,struct regnode_charclass_class); \
2974 SAVEFREEPV(and_withp)
2976 /* this is a chain of data about sub patterns we are processing that
2977 need to be handled separately/specially in study_chunk. Its so
2978 we can simulate recursion without losing state. */
2980 typedef struct scan_frame {
2981 regnode *last; /* last node to process in this frame */
2982 regnode *next; /* next node to process when last is reached */
2983 struct scan_frame *prev; /*previous frame*/
2984 I32 stop; /* what stopparen do we use */
2988 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2991 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2992 I32 *minlenp, I32 *deltap,
2997 struct regnode_charclass_class *and_withp,
2998 U32 flags, U32 depth)
2999 /* scanp: Start here (read-write). */
3000 /* deltap: Write maxlen-minlen here. */
3001 /* last: Stop before this one. */
3002 /* data: string data about the pattern */
3003 /* stopparen: treat close N as END */
3004 /* recursed: which subroutines have we recursed into */
3005 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3008 I32 min = 0; /* There must be at least this number of characters to match */
3010 regnode *scan = *scanp, *next;
3012 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3013 int is_inf_internal = 0; /* The studied chunk is infinite */
3014 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3015 scan_data_t data_fake;
3016 SV *re_trie_maxbuff = NULL;
3017 regnode *first_non_open = scan;
3018 I32 stopmin = I32_MAX;
3019 scan_frame *frame = NULL;
3020 GET_RE_DEBUG_FLAGS_DECL;
3022 PERL_ARGS_ASSERT_STUDY_CHUNK;
3025 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3029 while (first_non_open && OP(first_non_open) == OPEN)
3030 first_non_open=regnext(first_non_open);
3035 while ( scan && OP(scan) != END && scan < last ){
3036 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3037 node length to get a real minimum (because
3038 the folded version may be shorter) */
3039 bool has_exactf_sharp_s = FALSE;
3040 /* Peephole optimizer: */
3041 DEBUG_STUDYDATA("Peep:", data,depth);
3042 DEBUG_PEEP("Peep",scan,depth);
3044 /* Its not clear to khw or hv why this is done here, and not in the
3045 * clauses that deal with EXACT nodes. khw's guess is that it's
3046 * because of a previous design */
3047 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3049 /* Follow the next-chain of the current node and optimize
3050 away all the NOTHINGs from it. */
3051 if (OP(scan) != CURLYX) {
3052 const int max = (reg_off_by_arg[OP(scan)]
3054 /* I32 may be smaller than U16 on CRAYs! */
3055 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3056 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3060 /* Skip NOTHING and LONGJMP. */
3061 while ((n = regnext(n))
3062 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3063 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3064 && off + noff < max)
3066 if (reg_off_by_arg[OP(scan)])
3069 NEXT_OFF(scan) = off;
3074 /* The principal pseudo-switch. Cannot be a switch, since we
3075 look into several different things. */
3076 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3077 || OP(scan) == IFTHEN) {
3078 next = regnext(scan);
3080 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3082 if (OP(next) == code || code == IFTHEN) {
3083 /* NOTE - There is similar code to this block below for handling
3084 TRIE nodes on a re-study. If you change stuff here check there
3086 I32 max1 = 0, min1 = I32_MAX, num = 0;
3087 struct regnode_charclass_class accum;
3088 regnode * const startbranch=scan;
3090 if (flags & SCF_DO_SUBSTR)
3091 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3092 if (flags & SCF_DO_STCLASS)
3093 cl_init_zero(pRExC_state, &accum);
3095 while (OP(scan) == code) {
3096 I32 deltanext, minnext, f = 0, fake;
3097 struct regnode_charclass_class this_class;
3100 data_fake.flags = 0;
3102 data_fake.whilem_c = data->whilem_c;
3103 data_fake.last_closep = data->last_closep;
3106 data_fake.last_closep = &fake;
3108 data_fake.pos_delta = delta;
3109 next = regnext(scan);
3110 scan = NEXTOPER(scan);
3112 scan = NEXTOPER(scan);
3113 if (flags & SCF_DO_STCLASS) {
3114 cl_init(pRExC_state, &this_class);
3115 data_fake.start_class = &this_class;
3116 f = SCF_DO_STCLASS_AND;
3118 if (flags & SCF_WHILEM_VISITED_POS)
3119 f |= SCF_WHILEM_VISITED_POS;
3121 /* we suppose the run is continuous, last=next...*/
3122 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3124 stopparen, recursed, NULL, f,depth+1);
3127 if (max1 < minnext + deltanext)
3128 max1 = minnext + deltanext;
3129 if (deltanext == I32_MAX)
3130 is_inf = is_inf_internal = 1;
3132 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3134 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3135 if ( stopmin > minnext)
3136 stopmin = min + min1;
3137 flags &= ~SCF_DO_SUBSTR;
3139 data->flags |= SCF_SEEN_ACCEPT;
3142 if (data_fake.flags & SF_HAS_EVAL)
3143 data->flags |= SF_HAS_EVAL;
3144 data->whilem_c = data_fake.whilem_c;
3146 if (flags & SCF_DO_STCLASS)
3147 cl_or(pRExC_state, &accum, &this_class);
3149 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3151 if (flags & SCF_DO_SUBSTR) {
3152 data->pos_min += min1;
3153 data->pos_delta += max1 - min1;
3154 if (max1 != min1 || is_inf)
3155 data->longest = &(data->longest_float);
3158 delta += max1 - min1;
3159 if (flags & SCF_DO_STCLASS_OR) {
3160 cl_or(pRExC_state, data->start_class, &accum);
3162 cl_and(data->start_class, and_withp);
3163 flags &= ~SCF_DO_STCLASS;
3166 else if (flags & SCF_DO_STCLASS_AND) {
3168 cl_and(data->start_class, &accum);
3169 flags &= ~SCF_DO_STCLASS;
3172 /* Switch to OR mode: cache the old value of
3173 * data->start_class */
3175 StructCopy(data->start_class, and_withp,
3176 struct regnode_charclass_class);
3177 flags &= ~SCF_DO_STCLASS_AND;
3178 StructCopy(&accum, data->start_class,
3179 struct regnode_charclass_class);
3180 flags |= SCF_DO_STCLASS_OR;
3181 SET_SSC_EOS(data->start_class);
3185 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3188 Assuming this was/is a branch we are dealing with: 'scan' now
3189 points at the item that follows the branch sequence, whatever
3190 it is. We now start at the beginning of the sequence and look
3197 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3199 If we can find such a subsequence we need to turn the first
3200 element into a trie and then add the subsequent branch exact
3201 strings to the trie.
3205 1. patterns where the whole set of branches can be converted.
3207 2. patterns where only a subset can be converted.
3209 In case 1 we can replace the whole set with a single regop
3210 for the trie. In case 2 we need to keep the start and end
3213 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3214 becomes BRANCH TRIE; BRANCH X;
3216 There is an additional case, that being where there is a
3217 common prefix, which gets split out into an EXACT like node
3218 preceding the TRIE node.
3220 If x(1..n)==tail then we can do a simple trie, if not we make
3221 a "jump" trie, such that when we match the appropriate word
3222 we "jump" to the appropriate tail node. Essentially we turn
3223 a nested if into a case structure of sorts.
3228 if (!re_trie_maxbuff) {
3229 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3230 if (!SvIOK(re_trie_maxbuff))
3231 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3233 if ( SvIV(re_trie_maxbuff)>=0 ) {
3235 regnode *first = (regnode *)NULL;
3236 regnode *last = (regnode *)NULL;
3237 regnode *tail = scan;
3242 SV * const mysv = sv_newmortal(); /* for dumping */
3244 /* var tail is used because there may be a TAIL
3245 regop in the way. Ie, the exacts will point to the
3246 thing following the TAIL, but the last branch will
3247 point at the TAIL. So we advance tail. If we
3248 have nested (?:) we may have to move through several
3252 while ( OP( tail ) == TAIL ) {
3253 /* this is the TAIL generated by (?:) */
3254 tail = regnext( tail );
3258 DEBUG_TRIE_COMPILE_r({
3259 regprop(RExC_rx, mysv, tail );
3260 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3261 (int)depth * 2 + 2, "",
3262 "Looking for TRIE'able sequences. Tail node is: ",
3263 SvPV_nolen_const( mysv )
3269 Step through the branches
3270 cur represents each branch,
3271 noper is the first thing to be matched as part of that branch
3272 noper_next is the regnext() of that node.
3274 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3275 via a "jump trie" but we also support building with NOJUMPTRIE,
3276 which restricts the trie logic to structures like /FOO|BAR/.
3278 If noper is a trieable nodetype then the branch is a possible optimization
3279 target. If we are building under NOJUMPTRIE then we require that noper_next
3280 is the same as scan (our current position in the regex program).
3282 Once we have two or more consecutive such branches we can create a
3283 trie of the EXACT's contents and stitch it in place into the program.
3285 If the sequence represents all of the branches in the alternation we
3286 replace the entire thing with a single TRIE node.
3288 Otherwise when it is a subsequence we need to stitch it in place and
3289 replace only the relevant branches. This means the first branch has
3290 to remain as it is used by the alternation logic, and its next pointer,
3291 and needs to be repointed at the item on the branch chain following
3292 the last branch we have optimized away.
3294 This could be either a BRANCH, in which case the subsequence is internal,
3295 or it could be the item following the branch sequence in which case the
3296 subsequence is at the end (which does not necessarily mean the first node
3297 is the start of the alternation).
3299 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3302 ----------------+-----------
3306 EXACTFU_SS | EXACTFU
3307 EXACTFU_TRICKYFOLD | EXACTFU
3312 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3313 ( EXACT == (X) ) ? EXACT : \
3314 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3317 /* dont use tail as the end marker for this traverse */
3318 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3319 regnode * const noper = NEXTOPER( cur );
3320 U8 noper_type = OP( noper );
3321 U8 noper_trietype = TRIE_TYPE( noper_type );
3322 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3323 regnode * const noper_next = regnext( noper );
3324 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3325 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3328 DEBUG_TRIE_COMPILE_r({
3329 regprop(RExC_rx, mysv, cur);
3330 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3331 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3333 regprop(RExC_rx, mysv, noper);
3334 PerlIO_printf( Perl_debug_log, " -> %s",
3335 SvPV_nolen_const(mysv));
3338 regprop(RExC_rx, mysv, noper_next );
3339 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3340 SvPV_nolen_const(mysv));
3342 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3343 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3344 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3348 /* Is noper a trieable nodetype that can be merged with the
3349 * current trie (if there is one)? */
3353 ( noper_trietype == NOTHING)
3354 || ( trietype == NOTHING )
3355 || ( trietype == noper_trietype )
3358 && noper_next == tail
3362 /* Handle mergable triable node
3363 * Either we are the first node in a new trieable sequence,
3364 * in which case we do some bookkeeping, otherwise we update
3365 * the end pointer. */
3368 if ( noper_trietype == NOTHING ) {
3369 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3370 regnode * const noper_next = regnext( noper );
3371 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3372 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3375 if ( noper_next_trietype ) {
3376 trietype = noper_next_trietype;
3377 } else if (noper_next_type) {
3378 /* a NOTHING regop is 1 regop wide. We need at least two
3379 * for a trie so we can't merge this in */
3383 trietype = noper_trietype;
3386 if ( trietype == NOTHING )
3387 trietype = noper_trietype;
3392 } /* end handle mergable triable node */
3394 /* handle unmergable node -
3395 * noper may either be a triable node which can not be tried
3396 * together with the current trie, or a non triable node */
3398 /* If last is set and trietype is not NOTHING then we have found
3399 * at least two triable branch sequences in a row of a similar
3400 * trietype so we can turn them into a trie. If/when we
3401 * allow NOTHING to start a trie sequence this condition will be
3402 * required, and it isn't expensive so we leave it in for now. */
3403 if ( trietype && trietype != NOTHING )
3404 make_trie( pRExC_state,
3405 startbranch, first, cur, tail, count,
3406 trietype, depth+1 );
3407 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3411 && noper_next == tail
3414 /* noper is triable, so we can start a new trie sequence */
3417 trietype = noper_trietype;
3419 /* if we already saw a first but the current node is not triable then we have
3420 * to reset the first information. */
3425 } /* end handle unmergable node */
3426 } /* loop over branches */
3427 DEBUG_TRIE_COMPILE_r({
3428 regprop(RExC_rx, mysv, cur);
3429 PerlIO_printf( Perl_debug_log,
3430 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3431 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3434 if ( last && trietype ) {
3435 if ( trietype != NOTHING ) {
3436 /* the last branch of the sequence was part of a trie,
3437 * so we have to construct it here outside of the loop
3439 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3440 #ifdef TRIE_STUDY_OPT
3441 if ( ((made == MADE_EXACT_TRIE &&
3442 startbranch == first)
3443 || ( first_non_open == first )) &&
3445 flags |= SCF_TRIE_RESTUDY;
3446 if ( startbranch == first
3449 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3454 /* at this point we know whatever we have is a NOTHING sequence/branch
3455 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3457 if ( startbranch == first ) {
3459 /* the entire thing is a NOTHING sequence, something like this:
3460 * (?:|) So we can turn it into a plain NOTHING op. */
3461 DEBUG_TRIE_COMPILE_r({
3462 regprop(RExC_rx, mysv, cur);
3463 PerlIO_printf( Perl_debug_log,
3464 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3465 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3468 OP(startbranch)= NOTHING;
3469 NEXT_OFF(startbranch)= tail - startbranch;
3470 for ( opt= startbranch + 1; opt < tail ; opt++ )
3474 } /* end if ( last) */
3475 } /* TRIE_MAXBUF is non zero */
3480 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3481 scan = NEXTOPER(NEXTOPER(scan));
3482 } else /* single branch is optimized. */
3483 scan = NEXTOPER(scan);
3485 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3486 scan_frame *newframe = NULL;
3491 if (OP(scan) != SUSPEND) {
3492 /* set the pointer */
3493 if (OP(scan) == GOSUB) {
3495 RExC_recurse[ARG2L(scan)] = scan;
3496 start = RExC_open_parens[paren-1];
3497 end = RExC_close_parens[paren-1];
3500 start = RExC_rxi->program + 1;
3504 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3505 SAVEFREEPV(recursed);
3507 if (!PAREN_TEST(recursed,paren+1)) {
3508 PAREN_SET(recursed,paren+1);
3509 Newx(newframe,1,scan_frame);
3511 if (flags & SCF_DO_SUBSTR) {
3512 SCAN_COMMIT(pRExC_state,data,minlenp);
3513 data->longest = &(data->longest_float);
3515 is_inf = is_inf_internal = 1;
3516 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3517 cl_anything(pRExC_state, data->start_class);
3518 flags &= ~SCF_DO_STCLASS;
3521 Newx(newframe,1,scan_frame);
3524 end = regnext(scan);
3529 SAVEFREEPV(newframe);
3530 newframe->next = regnext(scan);
3531 newframe->last = last;
3532 newframe->stop = stopparen;
3533 newframe->prev = frame;
3543 else if (OP(scan) == EXACT) {
3544 I32 l = STR_LEN(scan);
3547 const U8 * const s = (U8*)STRING(scan);
3548 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3549 l = utf8_length(s, s + l);
3551 uc = *((U8*)STRING(scan));
3554 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3555 /* The code below prefers earlier match for fixed
3556 offset, later match for variable offset. */
3557 if (data->last_end == -1) { /* Update the start info. */
3558 data->last_start_min = data->pos_min;
3559 data->last_start_max = is_inf
3560 ? I32_MAX : data->pos_min + data->pos_delta;
3562 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3564 SvUTF8_on(data->last_found);
3566 SV * const sv = data->last_found;
3567 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3568 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3569 if (mg && mg->mg_len >= 0)
3570 mg->mg_len += utf8_length((U8*)STRING(scan),
3571 (U8*)STRING(scan)+STR_LEN(scan));
3573 data->last_end = data->pos_min + l;
3574 data->pos_min += l; /* As in the first entry. */
3575 data->flags &= ~SF_BEFORE_EOL;
3577 if (flags & SCF_DO_STCLASS_AND) {
3578 /* Check whether it is compatible with what we know already! */
3582 /* If compatible, we or it in below. It is compatible if is
3583 * in the bitmp and either 1) its bit or its fold is set, or 2)
3584 * it's for a locale. Even if there isn't unicode semantics
3585 * here, at runtime there may be because of matching against a
3586 * utf8 string, so accept a possible false positive for
3587 * latin1-range folds */
3589 (!(data->start_class->flags & ANYOF_LOCALE)
3590 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3591 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3592 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3597 ANYOF_CLASS_ZERO(data->start_class);
3598 ANYOF_BITMAP_ZERO(data->start_class);
3600 ANYOF_BITMAP_SET(data->start_class, uc);
3601 else if (uc >= 0x100) {
3604 /* Some Unicode code points fold to the Latin1 range; as
3605 * XXX temporary code, instead of figuring out if this is
3606 * one, just assume it is and set all the start class bits
3607 * that could be some such above 255 code point's fold
3608 * which will generate fals positives. As the code
3609 * elsewhere that does compute the fold settles down, it
3610 * can be extracted out and re-used here */
3611 for (i = 0; i < 256; i++){
3612 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3613 ANYOF_BITMAP_SET(data->start_class, i);
3617 CLEAR_SSC_EOS(data->start_class);
3619 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3621 else if (flags & SCF_DO_STCLASS_OR) {
3622 /* false positive possible if the class is case-folded */
3624 ANYOF_BITMAP_SET(data->start_class, uc);
3626 data->start_class->flags |= ANYOF_UNICODE_ALL;
3627 CLEAR_SSC_EOS(data->start_class);
3628 cl_and(data->start_class, and_withp);
3630 flags &= ~SCF_DO_STCLASS;
3632 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3633 I32 l = STR_LEN(scan);
3634 UV uc = *((U8*)STRING(scan));
3636 /* Search for fixed substrings supports EXACT only. */
3637 if (flags & SCF_DO_SUBSTR) {
3639 SCAN_COMMIT(pRExC_state, data, minlenp);
3642 const U8 * const s = (U8 *)STRING(scan);
3643 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3644 l = utf8_length(s, s + l);
3646 if (has_exactf_sharp_s) {
3647 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3649 min += l - min_subtract;
3651 delta += min_subtract;
3652 if (flags & SCF_DO_SUBSTR) {
3653 data->pos_min += l - min_subtract;
3654 if (data->pos_min < 0) {
3657 data->pos_delta += min_subtract;
3659 data->longest = &(data->longest_float);
3662 if (flags & SCF_DO_STCLASS_AND) {
3663 /* Check whether it is compatible with what we know already! */
3666 (!(data->start_class->flags & ANYOF_LOCALE)
3667 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3668 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3672 ANYOF_CLASS_ZERO(data->start_class);
3673 ANYOF_BITMAP_ZERO(data->start_class);
3675 ANYOF_BITMAP_SET(data->start_class, uc);
3676 CLEAR_SSC_EOS(data->start_class);
3677 if (OP(scan) == EXACTFL) {
3678 /* XXX This set is probably no longer necessary, and
3679 * probably wrong as LOCALE now is on in the initial
3681 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3685 /* Also set the other member of the fold pair. In case
3686 * that unicode semantics is called for at runtime, use
3687 * the full latin1 fold. (Can't do this for locale,
3688 * because not known until runtime) */
3689 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3691 /* All other (EXACTFL handled above) folds except under
3692 * /iaa that include s, S, and sharp_s also may include
3694 if (OP(scan) != EXACTFA) {
3695 if (uc == 's' || uc == 'S') {
3696 ANYOF_BITMAP_SET(data->start_class,
3697 LATIN_SMALL_LETTER_SHARP_S);
3699 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3700 ANYOF_BITMAP_SET(data->start_class, 's');
3701 ANYOF_BITMAP_SET(data->start_class, 'S');
3706 else if (uc >= 0x100) {
3708 for (i = 0; i < 256; i++){
3709 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3710 ANYOF_BITMAP_SET(data->start_class, i);
3715 else if (flags & SCF_DO_STCLASS_OR) {
3716 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3717 /* false positive possible if the class is case-folded.
3718 Assume that the locale settings are the same... */
3720 ANYOF_BITMAP_SET(data->start_class, uc);
3721 if (OP(scan) != EXACTFL) {
3723 /* And set the other member of the fold pair, but
3724 * can't do that in locale because not known until
3726 ANYOF_BITMAP_SET(data->start_class,
3727 PL_fold_latin1[uc]);
3729 /* All folds except under /iaa that include s, S,
3730 * and sharp_s also may include the others */
3731 if (OP(scan) != EXACTFA) {
3732 if (uc == 's' || uc == 'S') {
3733 ANYOF_BITMAP_SET(data->start_class,
3734 LATIN_SMALL_LETTER_SHARP_S);
3736 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3737 ANYOF_BITMAP_SET(data->start_class, 's');
3738 ANYOF_BITMAP_SET(data->start_class, 'S');
3743 CLEAR_SSC_EOS(data->start_class);
3745 cl_and(data->start_class, and_withp);
3747 flags &= ~SCF_DO_STCLASS;
3749 else if (REGNODE_VARIES(OP(scan))) {
3750 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3751 I32 f = flags, pos_before = 0;
3752 regnode * const oscan = scan;
3753 struct regnode_charclass_class this_class;
3754 struct regnode_charclass_class *oclass = NULL;
3755 I32 next_is_eval = 0;
3757 switch (PL_regkind[OP(scan)]) {
3758 case WHILEM: /* End of (?:...)* . */
3759 scan = NEXTOPER(scan);
3762 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3763 next = NEXTOPER(scan);
3764 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3766 maxcount = REG_INFTY;
3767 next = regnext(scan);
3768 scan = NEXTOPER(scan);
3772 if (flags & SCF_DO_SUBSTR)
3777 if (flags & SCF_DO_STCLASS) {
3779 maxcount = REG_INFTY;
3780 next = regnext(scan);
3781 scan = NEXTOPER(scan);
3784 is_inf = is_inf_internal = 1;
3785 scan = regnext(scan);
3786 if (flags & SCF_DO_SUBSTR) {
3787 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3788 data->longest = &(data->longest_float);
3790 goto optimize_curly_tail;
3792 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3793 && (scan->flags == stopparen))
3798 mincount = ARG1(scan);
3799 maxcount = ARG2(scan);
3801 next = regnext(scan);
3802 if (OP(scan) == CURLYX) {
3803 I32 lp = (data ? *(data->last_closep) : 0);
3804 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3806 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3807 next_is_eval = (OP(scan) == EVAL);
3809 if (flags & SCF_DO_SUBSTR) {
3810 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3811 pos_before = data->pos_min;
3815 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3817 data->flags |= SF_IS_INF;
3819 if (flags & SCF_DO_STCLASS) {
3820 cl_init(pRExC_state, &this_class);
3821 oclass = data->start_class;
3822 data->start_class = &this_class;
3823 f |= SCF_DO_STCLASS_AND;
3824 f &= ~SCF_DO_STCLASS_OR;
3826 /* Exclude from super-linear cache processing any {n,m}
3827 regops for which the combination of input pos and regex
3828 pos is not enough information to determine if a match
3831 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3832 regex pos at the \s*, the prospects for a match depend not
3833 only on the input position but also on how many (bar\s*)
3834 repeats into the {4,8} we are. */
3835 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3836 f &= ~SCF_WHILEM_VISITED_POS;
3838 /* This will finish on WHILEM, setting scan, or on NULL: */
3839 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3840 last, data, stopparen, recursed, NULL,
3842 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3844 if (flags & SCF_DO_STCLASS)
3845 data->start_class = oclass;
3846 if (mincount == 0 || minnext == 0) {
3847 if (flags & SCF_DO_STCLASS_OR) {
3848 cl_or(pRExC_state, data->start_class, &this_class);
3850 else if (flags & SCF_DO_STCLASS_AND) {
3851 /* Switch to OR mode: cache the old value of
3852 * data->start_class */
3854 StructCopy(data->start_class, and_withp,
3855 struct regnode_charclass_class);
3856 flags &= ~SCF_DO_STCLASS_AND;
3857 StructCopy(&this_class, data->start_class,
3858 struct regnode_charclass_class);
3859 flags |= SCF_DO_STCLASS_OR;
3860 SET_SSC_EOS(data->start_class);
3862 } else { /* Non-zero len */
3863 if (flags & SCF_DO_STCLASS_OR) {
3864 cl_or(pRExC_state, data->start_class, &this_class);
3865 cl_and(data->start_class, and_withp);
3867 else if (flags & SCF_DO_STCLASS_AND)
3868 cl_and(data->start_class, &this_class);
3869 flags &= ~SCF_DO_STCLASS;
3871 if (!scan) /* It was not CURLYX, but CURLY. */
3873 if ( /* ? quantifier ok, except for (?{ ... }) */
3874 (next_is_eval || !(mincount == 0 && maxcount == 1))
3875 && (minnext == 0) && (deltanext == 0)
3876 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3877 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3879 /* Fatal warnings may leak the regexp without this: */
3880 SAVEFREESV(RExC_rx_sv);
3881 ckWARNreg(RExC_parse,
3882 "Quantifier unexpected on zero-length expression");
3883 (void)ReREFCNT_inc(RExC_rx_sv);
3886 min += minnext * mincount;
3887 is_inf_internal |= ((maxcount == REG_INFTY
3888 && (minnext + deltanext) > 0)
3889 || deltanext == I32_MAX);
3890 is_inf |= is_inf_internal;
3891 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3893 /* Try powerful optimization CURLYX => CURLYN. */
3894 if ( OP(oscan) == CURLYX && data
3895 && data->flags & SF_IN_PAR
3896 && !(data->flags & SF_HAS_EVAL)
3897 && !deltanext && minnext == 1 ) {
3898 /* Try to optimize to CURLYN. */
3899 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3900 regnode * const nxt1 = nxt;
3907 if (!REGNODE_SIMPLE(OP(nxt))
3908 && !(PL_regkind[OP(nxt)] == EXACT
3909 && STR_LEN(nxt) == 1))
3915 if (OP(nxt) != CLOSE)
3917 if (RExC_open_parens) {
3918 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3919 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3921 /* Now we know that nxt2 is the only contents: */
3922 oscan->flags = (U8)ARG(nxt);
3924 OP(nxt1) = NOTHING; /* was OPEN. */
3927 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3928 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3929 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3930 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3931 OP(nxt + 1) = OPTIMIZED; /* was count. */
3932 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3937 /* Try optimization CURLYX => CURLYM. */
3938 if ( OP(oscan) == CURLYX && data
3939 && !(data->flags & SF_HAS_PAR)
3940 && !(data->flags & SF_HAS_EVAL)
3941 && !deltanext /* atom is fixed width */
3942 && minnext != 0 /* CURLYM can't handle zero width */
3943 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3945 /* XXXX How to optimize if data == 0? */
3946 /* Optimize to a simpler form. */
3947 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3951 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3952 && (OP(nxt2) != WHILEM))
3954 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3955 /* Need to optimize away parenths. */
3956 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3957 /* Set the parenth number. */
3958 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3960 oscan->flags = (U8)ARG(nxt);
3961 if (RExC_open_parens) {
3962 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3963 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3965 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3966 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3969 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3970 OP(nxt + 1) = OPTIMIZED; /* was count. */
3971 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3972 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3975 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3976 regnode *nnxt = regnext(nxt1);
3978 if (reg_off_by_arg[OP(nxt1)])
3979 ARG_SET(nxt1, nxt2 - nxt1);
3980 else if (nxt2 - nxt1 < U16_MAX)
3981 NEXT_OFF(nxt1) = nxt2 - nxt1;
3983 OP(nxt) = NOTHING; /* Cannot beautify */
3988 /* Optimize again: */
3989 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3990 NULL, stopparen, recursed, NULL, 0,depth+1);
3995 else if ((OP(oscan) == CURLYX)
3996 && (flags & SCF_WHILEM_VISITED_POS)
3997 /* See the comment on a similar expression above.
3998 However, this time it's not a subexpression
3999 we care about, but the expression itself. */
4000 && (maxcount == REG_INFTY)
4001 && data && ++data->whilem_c < 16) {
4002 /* This stays as CURLYX, we can put the count/of pair. */
4003 /* Find WHILEM (as in regexec.c) */
4004 regnode *nxt = oscan + NEXT_OFF(oscan);
4006 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4008 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4009 | (RExC_whilem_seen << 4)); /* On WHILEM */
4011 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4013 if (flags & SCF_DO_SUBSTR) {
4014 SV *last_str = NULL;
4015 int counted = mincount != 0;
4017 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4018 #if defined(SPARC64_GCC_WORKAROUND)
4021 const char *s = NULL;
4024 if (pos_before >= data->last_start_min)
4027 b = data->last_start_min;
4030 s = SvPV_const(data->last_found, l);
4031 old = b - data->last_start_min;
4034 I32 b = pos_before >= data->last_start_min
4035 ? pos_before : data->last_start_min;
4037 const char * const s = SvPV_const(data->last_found, l);
4038 I32 old = b - data->last_start_min;
4042 old = utf8_hop((U8*)s, old) - (U8*)s;
4044 /* Get the added string: */
4045 last_str = newSVpvn_utf8(s + old, l, UTF);
4046 if (deltanext == 0 && pos_before == b) {
4047 /* What was added is a constant string */
4049 SvGROW(last_str, (mincount * l) + 1);
4050 repeatcpy(SvPVX(last_str) + l,
4051 SvPVX_const(last_str), l, mincount - 1);
4052 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4053 /* Add additional parts. */
4054 SvCUR_set(data->last_found,
4055 SvCUR(data->last_found) - l);
4056 sv_catsv(data->last_found, last_str);
4058 SV * sv = data->last_found;
4060 SvUTF8(sv) && SvMAGICAL(sv) ?
4061 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4062 if (mg && mg->mg_len >= 0)
4063 mg->mg_len += CHR_SVLEN(last_str) - l;
4065 data->last_end += l * (mincount - 1);
4068 /* start offset must point into the last copy */
4069 data->last_start_min += minnext * (mincount - 1);
4070 data->last_start_max += is_inf ? I32_MAX
4071 : (maxcount - 1) * (minnext + data->pos_delta);
4074 /* It is counted once already... */
4075 data->pos_min += minnext * (mincount - counted);
4076 data->pos_delta += - counted * deltanext +
4077 (minnext + deltanext) * maxcount - minnext * mincount;
4078 if (mincount != maxcount) {
4079 /* Cannot extend fixed substrings found inside
4081 SCAN_COMMIT(pRExC_state,data,minlenp);
4082 if (mincount && last_str) {
4083 SV * const sv = data->last_found;
4084 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4085 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4089 sv_setsv(sv, last_str);
4090 data->last_end = data->pos_min;
4091 data->last_start_min =
4092 data->pos_min - CHR_SVLEN(last_str);
4093 data->last_start_max = is_inf
4095 : data->pos_min + data->pos_delta
4096 - CHR_SVLEN(last_str);
4098 data->longest = &(data->longest_float);
4100 SvREFCNT_dec(last_str);
4102 if (data && (fl & SF_HAS_EVAL))
4103 data->flags |= SF_HAS_EVAL;
4104 optimize_curly_tail:
4105 if (OP(oscan) != CURLYX) {
4106 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4108 NEXT_OFF(oscan) += NEXT_OFF(next);
4111 default: /* REF, ANYOFV, and CLUMP only? */
4112 if (flags & SCF_DO_SUBSTR) {
4113 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4114 data->longest = &(data->longest_float);
4116 is_inf = is_inf_internal = 1;
4117 if (flags & SCF_DO_STCLASS_OR)
4118 cl_anything(pRExC_state, data->start_class);
4119 flags &= ~SCF_DO_STCLASS;
4123 else if (OP(scan) == LNBREAK) {
4124 if (flags & SCF_DO_STCLASS) {
4126 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4127 if (flags & SCF_DO_STCLASS_AND) {
4128 for (value = 0; value < 256; value++)
4129 if (!is_VERTWS_cp(value))
4130 ANYOF_BITMAP_CLEAR(data->start_class, value);
4133 for (value = 0; value < 256; value++)
4134 if (is_VERTWS_cp(value))
4135 ANYOF_BITMAP_SET(data->start_class, value);
4137 if (flags & SCF_DO_STCLASS_OR)
4138 cl_and(data->start_class, and_withp);
4139 flags &= ~SCF_DO_STCLASS;
4142 delta++; /* Because of the 2 char string cr-lf */
4143 if (flags & SCF_DO_SUBSTR) {
4144 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4146 data->pos_delta += 1;
4147 data->longest = &(data->longest_float);
4150 else if (REGNODE_SIMPLE(OP(scan))) {
4153 if (flags & SCF_DO_SUBSTR) {
4154 SCAN_COMMIT(pRExC_state,data,minlenp);
4158 if (flags & SCF_DO_STCLASS) {
4160 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4162 /* Some of the logic below assumes that switching
4163 locale on will only add false positives. */
4164 switch (PL_regkind[OP(scan)]) {
4170 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4173 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4174 cl_anything(pRExC_state, data->start_class);
4177 if (OP(scan) == SANY)
4179 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4180 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4181 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4182 cl_anything(pRExC_state, data->start_class);
4184 if (flags & SCF_DO_STCLASS_AND || !value)
4185 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4188 if (flags & SCF_DO_STCLASS_AND)
4189 cl_and(data->start_class,
4190 (struct regnode_charclass_class*)scan);
4192 cl_or(pRExC_state, data->start_class,
4193 (struct regnode_charclass_class*)scan);
4201 classnum = FLAGS(scan);
4202 if (flags & SCF_DO_STCLASS_AND) {
4203 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4204 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4205 for (value = 0; value < loop_max; value++) {
4206 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4207 ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4213 if (data->start_class->flags & ANYOF_LOCALE) {
4214 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4218 /* Even if under locale, set the bits for non-locale
4219 * in case it isn't a true locale-node. This will
4220 * create false positives if it truly is locale */
4221 for (value = 0; value < loop_max; value++) {
4222 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4223 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4235 classnum = FLAGS(scan);
4236 if (flags & SCF_DO_STCLASS_AND) {
4237 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4238 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4239 for (value = 0; value < loop_max; value++) {
4240 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4241 ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4247 if (data->start_class->flags & ANYOF_LOCALE) {
4248 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4252 /* Even if under locale, set the bits for non-locale in
4253 * case it isn't a true locale-node. This will create
4254 * false positives if it truly is locale */
4255 for (value = 0; value < loop_max; value++) {
4256 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4257 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4260 if (PL_regkind[OP(scan)] == NPOSIXD) {
4261 data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4267 if (flags & SCF_DO_STCLASS_OR)
4268 cl_and(data->start_class, and_withp);
4269 flags &= ~SCF_DO_STCLASS;
4272 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4273 data->flags |= (OP(scan) == MEOL
4276 SCAN_COMMIT(pRExC_state, data, minlenp);
4279 else if ( PL_regkind[OP(scan)] == BRANCHJ
4280 /* Lookbehind, or need to calculate parens/evals/stclass: */
4281 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4282 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4283 if ( OP(scan) == UNLESSM &&
4285 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4286 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4289 regnode *upto= regnext(scan);
4291 SV * const mysv_val=sv_newmortal();
4292 DEBUG_STUDYDATA("OPFAIL",data,depth);
4294 /*DEBUG_PARSE_MSG("opfail");*/
4295 regprop(RExC_rx, mysv_val, upto);
4296 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4297 SvPV_nolen_const(mysv_val),
4298 (IV)REG_NODE_NUM(upto),
4303 NEXT_OFF(scan) = upto - scan;
4304 for (opt= scan + 1; opt < upto ; opt++)
4305 OP(opt) = OPTIMIZED;
4309 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4310 || OP(scan) == UNLESSM )
4312 /* Negative Lookahead/lookbehind
4313 In this case we can't do fixed string optimisation.
4316 I32 deltanext, minnext, fake = 0;
4318 struct regnode_charclass_class intrnl;
4321 data_fake.flags = 0;
4323 data_fake.whilem_c = data->whilem_c;
4324 data_fake.last_closep = data->last_closep;
4327 data_fake.last_closep = &fake;
4328 data_fake.pos_delta = delta;
4329 if ( flags & SCF_DO_STCLASS && !scan->flags
4330 && OP(scan) == IFMATCH ) { /* Lookahead */
4331 cl_init(pRExC_state, &intrnl);
4332 data_fake.start_class = &intrnl;
4333 f |= SCF_DO_STCLASS_AND;
4335 if (flags & SCF_WHILEM_VISITED_POS)
4336 f |= SCF_WHILEM_VISITED_POS;
4337 next = regnext(scan);
4338 nscan = NEXTOPER(NEXTOPER(scan));
4339 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4340 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4343 FAIL("Variable length lookbehind not implemented");
4345 else if (minnext > (I32)U8_MAX) {
4346 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4348 scan->flags = (U8)minnext;
4351 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4353 if (data_fake.flags & SF_HAS_EVAL)
4354 data->flags |= SF_HAS_EVAL;
4355 data->whilem_c = data_fake.whilem_c;
4357 if (f & SCF_DO_STCLASS_AND) {
4358 if (flags & SCF_DO_STCLASS_OR) {
4359 /* OR before, AND after: ideally we would recurse with
4360 * data_fake to get the AND applied by study of the
4361 * remainder of the pattern, and then derecurse;
4362 * *** HACK *** for now just treat as "no information".
4363 * See [perl #56690].
4365 cl_init(pRExC_state, data->start_class);
4367 /* AND before and after: combine and continue */
4368 const int was = TEST_SSC_EOS(data->start_class);
4370 cl_and(data->start_class, &intrnl);
4372 SET_SSC_EOS(data->start_class);
4376 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4378 /* Positive Lookahead/lookbehind
4379 In this case we can do fixed string optimisation,
4380 but we must be careful about it. Note in the case of
4381 lookbehind the positions will be offset by the minimum
4382 length of the pattern, something we won't know about
4383 until after the recurse.
4385 I32 deltanext, fake = 0;
4387 struct regnode_charclass_class intrnl;
4389 /* We use SAVEFREEPV so that when the full compile
4390 is finished perl will clean up the allocated
4391 minlens when it's all done. This way we don't
4392 have to worry about freeing them when we know
4393 they wont be used, which would be a pain.
4396 Newx( minnextp, 1, I32 );
4397 SAVEFREEPV(minnextp);
4400 StructCopy(data, &data_fake, scan_data_t);
4401 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4404 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4405 data_fake.last_found=newSVsv(data->last_found);
4409 data_fake.last_closep = &fake;
4410 data_fake.flags = 0;
4411 data_fake.pos_delta = delta;
4413 data_fake.flags |= SF_IS_INF;
4414 if ( flags & SCF_DO_STCLASS && !scan->flags
4415 && OP(scan) == IFMATCH ) { /* Lookahead */
4416 cl_init(pRExC_state, &intrnl);
4417 data_fake.start_class = &intrnl;
4418 f |= SCF_DO_STCLASS_AND;
4420 if (flags & SCF_WHILEM_VISITED_POS)
4421 f |= SCF_WHILEM_VISITED_POS;
4422 next = regnext(scan);
4423 nscan = NEXTOPER(NEXTOPER(scan));
4425 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4426 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4429 FAIL("Variable length lookbehind not implemented");
4431 else if (*minnextp > (I32)U8_MAX) {
4432 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4434 scan->flags = (U8)*minnextp;
4439 if (f & SCF_DO_STCLASS_AND) {
4440 const int was = TEST_SSC_EOS(data.start_class);
4442 cl_and(data->start_class, &intrnl);
4444 SET_SSC_EOS(data->start_class);
4447 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4449 if (data_fake.flags & SF_HAS_EVAL)
4450 data->flags |= SF_HAS_EVAL;
4451 data->whilem_c = data_fake.whilem_c;
4452 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4453 if (RExC_rx->minlen<*minnextp)
4454 RExC_rx->minlen=*minnextp;
4455 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4456 SvREFCNT_dec_NN(data_fake.last_found);
4458 if ( data_fake.minlen_fixed != minlenp )
4460 data->offset_fixed= data_fake.offset_fixed;
4461 data->minlen_fixed= data_fake.minlen_fixed;
4462 data->lookbehind_fixed+= scan->flags;
4464 if ( data_fake.minlen_float != minlenp )
4466 data->minlen_float= data_fake.minlen_float;
4467 data->offset_float_min=data_fake.offset_float_min;
4468 data->offset_float_max=data_fake.offset_float_max;
4469 data->lookbehind_float+= scan->flags;
4476 else if (OP(scan) == OPEN) {
4477 if (stopparen != (I32)ARG(scan))
4480 else if (OP(scan) == CLOSE) {
4481 if (stopparen == (I32)ARG(scan)) {
4484 if ((I32)ARG(scan) == is_par) {
4485 next = regnext(scan);
4487 if ( next && (OP(next) != WHILEM) && next < last)
4488 is_par = 0; /* Disable optimization */
4491 *(data->last_closep) = ARG(scan);
4493 else if (OP(scan) == EVAL) {
4495 data->flags |= SF_HAS_EVAL;
4497 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4498 if (flags & SCF_DO_SUBSTR) {
4499 SCAN_COMMIT(pRExC_state,data,minlenp);
4500 flags &= ~SCF_DO_SUBSTR;
4502 if (data && OP(scan)==ACCEPT) {
4503 data->flags |= SCF_SEEN_ACCEPT;
4508 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4510 if (flags & SCF_DO_SUBSTR) {
4511 SCAN_COMMIT(pRExC_state,data,minlenp);
4512 data->longest = &(data->longest_float);
4514 is_inf = is_inf_internal = 1;
4515 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4516 cl_anything(pRExC_state, data->start_class);
4517 flags &= ~SCF_DO_STCLASS;
4519 else if (OP(scan) == GPOS) {
4520 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4521 !(delta || is_inf || (data && data->pos_delta)))
4523 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4524 RExC_rx->extflags |= RXf_ANCH_GPOS;
4525 if (RExC_rx->gofs < (U32)min)
4526 RExC_rx->gofs = min;
4528 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4532 #ifdef TRIE_STUDY_OPT
4533 #ifdef FULL_TRIE_STUDY
4534 else if (PL_regkind[OP(scan)] == TRIE) {
4535 /* NOTE - There is similar code to this block above for handling
4536 BRANCH nodes on the initial study. If you change stuff here
4538 regnode *trie_node= scan;
4539 regnode *tail= regnext(scan);
4540 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4541 I32 max1 = 0, min1 = I32_MAX;
4542 struct regnode_charclass_class accum;
4544 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4545 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4546 if (flags & SCF_DO_STCLASS)
4547 cl_init_zero(pRExC_state, &accum);
4553 const regnode *nextbranch= NULL;
4556 for ( word=1 ; word <= trie->wordcount ; word++)
4558 I32 deltanext=0, minnext=0, f = 0, fake;
4559 struct regnode_charclass_class this_class;
4561 data_fake.flags = 0;
4563 data_fake.whilem_c = data->whilem_c;
4564 data_fake.last_closep = data->last_closep;
4567 data_fake.last_closep = &fake;
4568 data_fake.pos_delta = delta;
4569 if (flags & SCF_DO_STCLASS) {
4570 cl_init(pRExC_state, &this_class);
4571 data_fake.start_class = &this_class;
4572 f = SCF_DO_STCLASS_AND;
4574 if (flags & SCF_WHILEM_VISITED_POS)
4575 f |= SCF_WHILEM_VISITED_POS;
4577 if (trie->jump[word]) {
4579 nextbranch = trie_node + trie->jump[0];
4580 scan= trie_node + trie->jump[word];
4581 /* We go from the jump point to the branch that follows
4582 it. Note this means we need the vestigal unused branches
4583 even though they arent otherwise used.
4585 minnext = study_chunk(pRExC_state, &scan, minlenp,
4586 &deltanext, (regnode *)nextbranch, &data_fake,
4587 stopparen, recursed, NULL, f,depth+1);
4589 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4590 nextbranch= regnext((regnode*)nextbranch);
4592 if (min1 > (I32)(minnext + trie->minlen))
4593 min1 = minnext + trie->minlen;
4594 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4595 max1 = minnext + deltanext + trie->maxlen;
4596 if (deltanext == I32_MAX)
4597 is_inf = is_inf_internal = 1;
4599 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4601 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4602 if ( stopmin > min + min1)
4603 stopmin = min + min1;
4604 flags &= ~SCF_DO_SUBSTR;
4606 data->flags |= SCF_SEEN_ACCEPT;
4609 if (data_fake.flags & SF_HAS_EVAL)
4610 data->flags |= SF_HAS_EVAL;
4611 data->whilem_c = data_fake.whilem_c;
4613 if (flags & SCF_DO_STCLASS)
4614 cl_or(pRExC_state, &accum, &this_class);
4617 if (flags & SCF_DO_SUBSTR) {
4618 data->pos_min += min1;
4619 data->pos_delta += max1 - min1;
4620 if (max1 != min1 || is_inf)
4621 data->longest = &(data->longest_float);
4624 delta += max1 - min1;
4625 if (flags & SCF_DO_STCLASS_OR) {
4626 cl_or(pRExC_state, data->start_class, &accum);
4628 cl_and(data->start_class, and_withp);
4629 flags &= ~SCF_DO_STCLASS;
4632 else if (flags & SCF_DO_STCLASS_AND) {
4634 cl_and(data->start_class, &accum);
4635 flags &= ~SCF_DO_STCLASS;
4638 /* Switch to OR mode: cache the old value of
4639 * data->start_class */
4641 StructCopy(data->start_class, and_withp,
4642 struct regnode_charclass_class);
4643 flags &= ~SCF_DO_STCLASS_AND;
4644 StructCopy(&accum, data->start_class,
4645 struct regnode_charclass_class);
4646 flags |= SCF_DO_STCLASS_OR;
4647 SET_SSC_EOS(data->start_class);
4654 else if (PL_regkind[OP(scan)] == TRIE) {
4655 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4658 min += trie->minlen;
4659 delta += (trie->maxlen - trie->minlen);
4660 flags &= ~SCF_DO_STCLASS; /* xxx */
4661 if (flags & SCF_DO_SUBSTR) {
4662 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4663 data->pos_min += trie->minlen;
4664 data->pos_delta += (trie->maxlen - trie->minlen);
4665 if (trie->maxlen != trie->minlen)
4666 data->longest = &(data->longest_float);
4668 if (trie->jump) /* no more substrings -- for now /grr*/
4669 flags &= ~SCF_DO_SUBSTR;
4671 #endif /* old or new */
4672 #endif /* TRIE_STUDY_OPT */
4674 /* Else: zero-length, ignore. */
4675 scan = regnext(scan);
4680 stopparen = frame->stop;
4681 frame = frame->prev;
4682 goto fake_study_recurse;
4687 DEBUG_STUDYDATA("pre-fin:",data,depth);
4690 *deltap = is_inf_internal ? I32_MAX : delta;
4691 if (flags & SCF_DO_SUBSTR && is_inf)
4692 data->pos_delta = I32_MAX - data->pos_min;
4693 if (is_par > (I32)U8_MAX)
4695 if (is_par && pars==1 && data) {
4696 data->flags |= SF_IN_PAR;
4697 data->flags &= ~SF_HAS_PAR;
4699 else if (pars && data) {
4700 data->flags |= SF_HAS_PAR;
4701 data->flags &= ~SF_IN_PAR;
4703 if (flags & SCF_DO_STCLASS_OR)
4704 cl_and(data->start_class, and_withp);
4705 if (flags & SCF_TRIE_RESTUDY)
4706 data->flags |= SCF_TRIE_RESTUDY;
4708 DEBUG_STUDYDATA("post-fin:",data,depth);
4710 return min < stopmin ? min : stopmin;
4714 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4716 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4718 PERL_ARGS_ASSERT_ADD_DATA;
4720 Renewc(RExC_rxi->data,
4721 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4722 char, struct reg_data);
4724 Renew(RExC_rxi->data->what, count + n, U8);
4726 Newx(RExC_rxi->data->what, n, U8);
4727 RExC_rxi->data->count = count + n;
4728 Copy(s, RExC_rxi->data->what + count, n, U8);
4732 /*XXX: todo make this not included in a non debugging perl */
4733 #ifndef PERL_IN_XSUB_RE
4735 Perl_reginitcolors(pTHX)
4738 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4740 char *t = savepv(s);
4744 t = strchr(t, '\t');
4750 PL_colors[i] = t = (char *)"";
4755 PL_colors[i++] = (char *)"";
4762 #ifdef TRIE_STUDY_OPT
4763 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4766 (data.flags & SCF_TRIE_RESTUDY) \
4774 #define CHECK_RESTUDY_GOTO_butfirst
4778 * pregcomp - compile a regular expression into internal code
4780 * Decides which engine's compiler to call based on the hint currently in
4784 #ifndef PERL_IN_XSUB_RE
4786 /* return the currently in-scope regex engine (or the default if none) */
4788 regexp_engine const *
4789 Perl_current_re_engine(pTHX)
4793 if (IN_PERL_COMPILETIME) {
4794 HV * const table = GvHV(PL_hintgv);
4798 return &PL_core_reg_engine;
4799 ptr = hv_fetchs(table, "regcomp", FALSE);
4800 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4801 return &PL_core_reg_engine;
4802 return INT2PTR(regexp_engine*,SvIV(*ptr));
4806 if (!PL_curcop->cop_hints_hash)
4807 return &PL_core_reg_engine;
4808 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4809 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4810 return &PL_core_reg_engine;
4811 return INT2PTR(regexp_engine*,SvIV(ptr));
4817 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4820 regexp_engine const *eng = current_re_engine();
4821 GET_RE_DEBUG_FLAGS_DECL;
4823 PERL_ARGS_ASSERT_PREGCOMP;
4825 /* Dispatch a request to compile a regexp to correct regexp engine. */
4827 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4830 return CALLREGCOMP_ENG(eng, pattern, flags);
4834 /* public(ish) entry point for the perl core's own regex compiling code.
4835 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4836 * pattern rather than a list of OPs, and uses the internal engine rather
4837 * than the current one */
4840 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4842 SV *pat = pattern; /* defeat constness! */
4843 PERL_ARGS_ASSERT_RE_COMPILE;
4844 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4845 #ifdef PERL_IN_XSUB_RE
4848 &PL_core_reg_engine,
4850 NULL, NULL, rx_flags, 0);
4853 /* see if there are any run-time code blocks in the pattern.
4854 * False positives are allowed */
4857 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4858 U32 pm_flags, char *pat, STRLEN plen)
4863 /* avoid infinitely recursing when we recompile the pattern parcelled up
4864 * as qr'...'. A single constant qr// string can't have have any
4865 * run-time component in it, and thus, no runtime code. (A non-qr
4866 * string, however, can, e.g. $x =~ '(?{})') */
4867 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4870 for (s = 0; s < plen; s++) {
4871 if (n < pRExC_state->num_code_blocks
4872 && s == pRExC_state->code_blocks[n].start)
4874 s = pRExC_state->code_blocks[n].end;
4878 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4880 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
4882 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
4889 /* Handle run-time code blocks. We will already have compiled any direct
4890 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4891 * copy of it, but with any literal code blocks blanked out and
4892 * appropriate chars escaped; then feed it into
4894 * eval "qr'modified_pattern'"
4898 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4902 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4904 * After eval_sv()-ing that, grab any new code blocks from the returned qr
4905 * and merge them with any code blocks of the original regexp.
4907 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4908 * instead, just save the qr and return FALSE; this tells our caller that
4909 * the original pattern needs upgrading to utf8.
4913 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4914 char *pat, STRLEN plen)
4918 GET_RE_DEBUG_FLAGS_DECL;
4920 if (pRExC_state->runtime_code_qr) {
4921 /* this is the second time we've been called; this should
4922 * only happen if the main pattern got upgraded to utf8
4923 * during compilation; re-use the qr we compiled first time
4924 * round (which should be utf8 too)
4926 qr = pRExC_state->runtime_code_qr;
4927 pRExC_state->runtime_code_qr = NULL;
4928 assert(RExC_utf8 && SvUTF8(qr));
4934 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4938 /* determine how many extra chars we need for ' and \ escaping */
4939 for (s = 0; s < plen; s++) {
4940 if (pat[s] == '\'' || pat[s] == '\\')
4944 Newx(newpat, newlen, char);
4946 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4948 for (s = 0; s < plen; s++) {
4949 if (n < pRExC_state->num_code_blocks
4950 && s == pRExC_state->code_blocks[n].start)
4952 /* blank out literal code block */
4953 assert(pat[s] == '(');
4954 while (s <= pRExC_state->code_blocks[n].end) {
4962 if (pat[s] == '\'' || pat[s] == '\\')
4967 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4971 PerlIO_printf(Perl_debug_log,
4972 "%sre-parsing pattern for runtime code:%s %s\n",
4973 PL_colors[4],PL_colors[5],newpat);
4976 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
4982 PUSHSTACKi(PERLSI_REQUIRE);
4983 /* this causes the toker to collapse \\ into \ when parsing
4984 * qr''; normally only q'' does this. It also alters hints
4986 PL_reg_state.re_reparsing = TRUE;
4987 eval_sv(sv, G_SCALAR);
4988 SvREFCNT_dec_NN(sv);
4993 SV * const errsv = ERRSV;
4994 if (SvTRUE_NN(errsv))
4996 Safefree(pRExC_state->code_blocks);
4997 /* use croak_sv ? */
4998 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5001 assert(SvROK(qr_ref));
5003 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5004 /* the leaving below frees the tmp qr_ref.
5005 * Give qr a life of its own */
5013 if (!RExC_utf8 && SvUTF8(qr)) {
5014 /* first time through; the pattern got upgraded; save the
5015 * qr for the next time through */
5016 assert(!pRExC_state->runtime_code_qr);
5017 pRExC_state->runtime_code_qr = qr;
5022 /* extract any code blocks within the returned qr// */
5025 /* merge the main (r1) and run-time (r2) code blocks into one */
5027 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5028 struct reg_code_block *new_block, *dst;
5029 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5032 if (!r2->num_code_blocks) /* we guessed wrong */
5034 SvREFCNT_dec_NN(qr);
5039 r1->num_code_blocks + r2->num_code_blocks,
5040 struct reg_code_block);
5043 while ( i1 < r1->num_code_blocks
5044 || i2 < r2->num_code_blocks)
5046 struct reg_code_block *src;
5049 if (i1 == r1->num_code_blocks) {
5050 src = &r2->code_blocks[i2++];
5053 else if (i2 == r2->num_code_blocks)
5054 src = &r1->code_blocks[i1++];
5055 else if ( r1->code_blocks[i1].start
5056 < r2->code_blocks[i2].start)
5058 src = &r1->code_blocks[i1++];
5059 assert(src->end < r2->code_blocks[i2].start);
5062 assert( r1->code_blocks[i1].start
5063 > r2->code_blocks[i2].start);
5064 src = &r2->code_blocks[i2++];
5066 assert(src->end < r1->code_blocks[i1].start);
5069 assert(pat[src->start] == '(');
5070 assert(pat[src->end] == ')');
5071 dst->start = src->start;
5072 dst->end = src->end;
5073 dst->block = src->block;
5074 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5078 r1->num_code_blocks += r2->num_code_blocks;
5079 Safefree(r1->code_blocks);
5080 r1->code_blocks = new_block;
5083 SvREFCNT_dec_NN(qr);
5089 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5091 /* This is the common code for setting up the floating and fixed length
5092 * string data extracted from Perlre_op_compile() below. Returns a boolean
5093 * as to whether succeeded or not */
5097 if (! (longest_length
5098 || (eol /* Can't have SEOL and MULTI */
5099 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5101 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5102 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5107 /* copy the information about the longest from the reg_scan_data
5108 over to the program. */
5109 if (SvUTF8(sv_longest)) {
5110 *rx_utf8 = sv_longest;
5113 *rx_substr = sv_longest;
5116 /* end_shift is how many chars that must be matched that
5117 follow this item. We calculate it ahead of time as once the
5118 lookbehind offset is added in we lose the ability to correctly
5120 ml = minlen ? *(minlen) : (I32)longest_length;
5121 *rx_end_shift = ml - offset
5122 - longest_length + (SvTAIL(sv_longest) != 0)
5125 t = (eol/* Can't have SEOL and MULTI */
5126 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5127 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5133 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5134 * regular expression into internal code.
5135 * The pattern may be passed either as:
5136 * a list of SVs (patternp plus pat_count)
5137 * a list of OPs (expr)
5138 * If both are passed, the SV list is used, but the OP list indicates
5139 * which SVs are actually pre-compiled code blocks
5141 * The SVs in the list have magic and qr overloading applied to them (and
5142 * the list may be modified in-place with replacement SVs in the latter
5145 * If the pattern hasn't changed from old_re, then old_re will be
5148 * eng is the current engine. If that engine has an op_comp method, then
5149 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5150 * do the initial concatenation of arguments and pass on to the external
5153 * If is_bare_re is not null, set it to a boolean indicating whether the
5154 * arg list reduced (after overloading) to a single bare regex which has
5155 * been returned (i.e. /$qr/).
5157 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5159 * pm_flags contains the PMf_* flags, typically based on those from the
5160 * pm_flags field of the related PMOP. Currently we're only interested in
5161 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5163 * We can't allocate space until we know how big the compiled form will be,
5164 * but we can't compile it (and thus know how big it is) until we've got a
5165 * place to put the code. So we cheat: we compile it twice, once with code
5166 * generation turned off and size counting turned on, and once "for real".
5167 * This also means that we don't allocate space until we are sure that the
5168 * thing really will compile successfully, and we never have to move the
5169 * code and thus invalidate pointers into it. (Note that it has to be in
5170 * one piece because free() must be able to free it all.) [NB: not true in perl]
5172 * Beware that the optimization-preparation code in here knows about some
5173 * of the structure of the compiled regexp. [I'll say.]
5177 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5178 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5179 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5184 regexp_internal *ri;
5193 SV * VOL code_blocksv = NULL;
5195 /* these are all flags - maybe they should be turned
5196 * into a single int with different bit masks */
5197 I32 sawlookahead = 0;
5200 bool used_setjump = FALSE;
5201 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5202 bool code_is_utf8 = 0;
5203 bool VOL recompile = 0;
5204 bool runtime_code = 0;
5208 RExC_state_t RExC_state;
5209 RExC_state_t * const pRExC_state = &RExC_state;
5210 #ifdef TRIE_STUDY_OPT
5212 RExC_state_t copyRExC_state;
5214 GET_RE_DEBUG_FLAGS_DECL;
5216 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5218 DEBUG_r(if (!PL_colorset) reginitcolors());
5220 #ifndef PERL_IN_XSUB_RE
5221 /* Initialize these here instead of as-needed, as is quick and avoids
5222 * having to test them each time otherwise */
5223 if (! PL_AboveLatin1) {
5224 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5225 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5226 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5228 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5229 = _new_invlist_C_array(L1PosixAlnum_invlist);
5230 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5231 = _new_invlist_C_array(PosixAlnum_invlist);
5233 PL_L1Posix_ptrs[_CC_ALPHA]
5234 = _new_invlist_C_array(L1PosixAlpha_invlist);
5235 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5237 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5238 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5240 /* Cased is the same as Alpha in the ASCII range */
5241 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5242 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5244 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5245 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5247 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5248 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5250 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5251 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5253 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5254 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5256 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5257 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5259 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5260 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5262 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5263 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5264 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5265 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5267 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5268 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5270 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5272 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5273 PL_L1Posix_ptrs[_CC_WORDCHAR]
5274 = _new_invlist_C_array(L1PosixWord_invlist);
5276 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5277 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5279 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5283 pRExC_state->code_blocks = NULL;
5284 pRExC_state->num_code_blocks = 0;
5287 *is_bare_re = FALSE;
5289 if (expr && (expr->op_type == OP_LIST ||
5290 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5292 /* is the source UTF8, and how many code blocks are there? */
5296 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5297 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5299 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5300 /* count of DO blocks */
5304 pRExC_state->num_code_blocks = ncode;
5305 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5310 /* handle a list of SVs */
5314 /* apply magic and RE overloading to each arg */
5315 for (svp = patternp; svp < patternp + pat_count; svp++) {
5318 if (SvROK(rx) && SvAMAGIC(rx)) {
5319 SV *sv = AMG_CALLunary(rx, regexp_amg);
5323 if (SvTYPE(sv) != SVt_REGEXP)
5324 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5330 if (pat_count > 1) {
5331 /* concat multiple args and find any code block indexes */
5336 STRLEN orig_patlen = 0;
5338 if (pRExC_state->num_code_blocks) {
5339 o = cLISTOPx(expr)->op_first;
5340 assert( o->op_type == OP_PUSHMARK
5341 || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5342 || o->op_type == OP_PADRANGE);
5346 pat = newSVpvn("", 0);
5349 /* determine if the pattern is going to be utf8 (needed
5350 * in advance to align code block indices correctly).
5351 * XXX This could fail to be detected for an arg with
5352 * overloading but not concat overloading; but the main effect
5353 * in this obscure case is to need a 'use re eval' for a
5354 * literal code block */
5355 for (svp = patternp; svp < patternp + pat_count; svp++) {
5362 for (svp = patternp; svp < patternp + pat_count; svp++) {
5363 SV *sv, *msv = *svp;
5366 /* we make the assumption here that each op in the list of
5367 * op_siblings maps to one SV pushed onto the stack,
5368 * except for code blocks, with have both an OP_NULL and
5370 * This allows us to match up the list of SVs against the
5371 * list of OPs to find the next code block.
5373 * Note that PUSHMARK PADSV PADSV ..
5375 * PADRANGE NULL NULL ..
5376 * so the alignment still works. */
5378 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5379 assert(n < pRExC_state->num_code_blocks);
5380 pRExC_state->code_blocks[n].start = SvCUR(pat);
5381 pRExC_state->code_blocks[n].block = o;
5382 pRExC_state->code_blocks[n].src_regex = NULL;
5385 o = o->op_sibling; /* skip CONST */
5391 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5392 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5395 /* overloading involved: all bets are off over literal
5396 * code. Pretend we haven't seen it */
5397 pRExC_state->num_code_blocks -= n;
5403 while (SvAMAGIC(msv)
5404 && (sv = AMG_CALLunary(msv, string_amg))
5408 && SvRV(msv) == SvRV(sv))
5413 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5415 orig_patlen = SvCUR(pat);
5416 sv_catsv_nomg(pat, msv);
5419 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5422 /* extract any code blocks within any embedded qr//'s */
5423 if (rx && SvTYPE(rx) == SVt_REGEXP
5424 && RX_ENGINE((REGEXP*)rx)->op_comp)
5427 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5428 if (ri->num_code_blocks) {
5430 /* the presence of an embedded qr// with code means
5431 * we should always recompile: the text of the
5432 * qr// may not have changed, but it may be a
5433 * different closure than last time */
5435 Renew(pRExC_state->code_blocks,
5436 pRExC_state->num_code_blocks + ri->num_code_blocks,
5437 struct reg_code_block);
5438 pRExC_state->num_code_blocks += ri->num_code_blocks;
5439 for (i=0; i < ri->num_code_blocks; i++) {
5440 struct reg_code_block *src, *dst;
5441 STRLEN offset = orig_patlen
5442 + ReANY((REGEXP *)rx)->pre_prefix;
5443 assert(n < pRExC_state->num_code_blocks);
5444 src = &ri->code_blocks[i];
5445 dst = &pRExC_state->code_blocks[n];
5446 dst->start = src->start + offset;
5447 dst->end = src->end + offset;
5448 dst->block = src->block;
5449 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5463 while (SvAMAGIC(pat)
5464 && (sv = AMG_CALLunary(pat, string_amg))
5472 /* handle bare regex: foo =~ $re */
5477 if (SvTYPE(re) == SVt_REGEXP) {
5481 Safefree(pRExC_state->code_blocks);
5487 /* not a list of SVs, so must be a list of OPs */
5489 if (expr->op_type == OP_LIST) {
5494 pat = newSVpvn("", 0);
5499 /* given a list of CONSTs and DO blocks in expr, append all
5500 * the CONSTs to pat, and record the start and end of each
5501 * code block in code_blocks[] (each DO{} op is followed by an
5502 * OP_CONST containing the corresponding literal '(?{...})
5505 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5506 if (o->op_type == OP_CONST) {
5507 sv_catsv(pat, cSVOPo_sv);
5509 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5513 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5514 assert(i+1 < pRExC_state->num_code_blocks);
5515 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5516 pRExC_state->code_blocks[i].block = o;
5517 pRExC_state->code_blocks[i].src_regex = NULL;
5523 assert(expr->op_type == OP_CONST);
5524 pat = cSVOPx_sv(expr);
5528 exp = SvPV_nomg(pat, plen);
5530 if (!eng->op_comp) {
5531 if ((SvUTF8(pat) && IN_BYTES)
5532 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5534 /* make a temporary copy; either to convert to bytes,
5535 * or to avoid repeating get-magic / overloaded stringify */
5536 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5537 (IN_BYTES ? 0 : SvUTF8(pat)));
5539 Safefree(pRExC_state->code_blocks);
5540 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5543 /* ignore the utf8ness if the pattern is 0 length */
5544 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5545 RExC_uni_semantics = 0;
5546 RExC_contains_locale = 0;
5547 pRExC_state->runtime_code_qr = NULL;
5549 /****************** LONG JUMP TARGET HERE***********************/
5550 /* Longjmp back to here if have to switch in midstream to utf8 */
5551 if (! RExC_orig_utf8) {
5552 JMPENV_PUSH(jump_ret);
5553 used_setjump = TRUE;
5556 if (jump_ret == 0) { /* First time through */
5560 SV *dsv= sv_newmortal();
5561 RE_PV_QUOTED_DECL(s, RExC_utf8,
5562 dsv, exp, plen, 60);
5563 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5564 PL_colors[4],PL_colors[5],s);
5567 else { /* longjumped back */
5570 STRLEN s = 0, d = 0;
5573 /* If the cause for the longjmp was other than changing to utf8, pop
5574 * our own setjmp, and longjmp to the correct handler */
5575 if (jump_ret != UTF8_LONGJMP) {
5577 JMPENV_JUMP(jump_ret);
5582 /* It's possible to write a regexp in ascii that represents Unicode
5583 codepoints outside of the byte range, such as via \x{100}. If we
5584 detect such a sequence we have to convert the entire pattern to utf8
5585 and then recompile, as our sizing calculation will have been based
5586 on 1 byte == 1 character, but we will need to use utf8 to encode
5587 at least some part of the pattern, and therefore must convert the whole
5590 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5591 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5593 /* upgrade pattern to UTF8, and if there are code blocks,
5594 * recalculate the indices.
5595 * This is essentially an unrolled Perl_bytes_to_utf8() */
5597 src = (U8*)SvPV_nomg(pat, plen);
5598 Newx(dst, plen * 2 + 1, U8);
5601 const UV uv = NATIVE_TO_ASCII(src[s]);
5602 if (UNI_IS_INVARIANT(uv))
5603 dst[d] = (U8)UTF_TO_NATIVE(uv);
5605 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5606 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5608 if (n < pRExC_state->num_code_blocks) {
5609 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5610 pRExC_state->code_blocks[n].start = d;
5611 assert(dst[d] == '(');
5614 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5615 pRExC_state->code_blocks[n].end = d;
5616 assert(dst[d] == ')');
5629 RExC_orig_utf8 = RExC_utf8 = 1;
5632 /* return old regex if pattern hasn't changed */
5636 && !!RX_UTF8(old_re) == !!RExC_utf8
5637 && RX_PRECOMP(old_re)
5638 && RX_PRELEN(old_re) == plen
5639 && memEQ(RX_PRECOMP(old_re), exp, plen))
5641 /* with runtime code, always recompile */
5642 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5644 if (!runtime_code) {
5648 Safefree(pRExC_state->code_blocks);
5652 else if ((pm_flags & PMf_USE_RE_EVAL)
5653 /* this second condition covers the non-regex literal case,
5654 * i.e. $foo =~ '(?{})'. */
5655 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5656 && (PL_hints & HINT_RE_EVAL))
5658 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5661 #ifdef TRIE_STUDY_OPT
5665 rx_flags = orig_rx_flags;
5667 if (initial_charset == REGEX_LOCALE_CHARSET) {
5668 RExC_contains_locale = 1;
5670 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5672 /* Set to use unicode semantics if the pattern is in utf8 and has the
5673 * 'depends' charset specified, as it means unicode when utf8 */
5674 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5678 RExC_flags = rx_flags;
5679 RExC_pm_flags = pm_flags;
5682 if (TAINTING_get && TAINT_get)
5683 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5685 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5686 /* whoops, we have a non-utf8 pattern, whilst run-time code
5687 * got compiled as utf8. Try again with a utf8 pattern */
5688 JMPENV_JUMP(UTF8_LONGJMP);
5691 assert(!pRExC_state->runtime_code_qr);
5696 RExC_in_lookbehind = 0;
5697 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5699 RExC_override_recoding = 0;
5700 RExC_in_multi_char_class = 0;
5702 /* First pass: determine size, legality. */
5710 RExC_emit = &PL_regdummy;
5711 RExC_whilem_seen = 0;
5712 RExC_open_parens = NULL;
5713 RExC_close_parens = NULL;
5715 RExC_paren_names = NULL;
5717 RExC_paren_name_list = NULL;
5719 RExC_recurse = NULL;
5720 RExC_recurse_count = 0;
5721 pRExC_state->code_index = 0;
5723 #if 0 /* REGC() is (currently) a NOP at the first pass.
5724 * Clever compilers notice this and complain. --jhi */
5725 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5728 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5730 RExC_lastparse=NULL;
5732 /* reg may croak on us, not giving us a chance to free
5733 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5734 need it to survive as long as the regexp (qr/(?{})/).
5735 We must check that code_blocksv is not already set, because we may
5736 have longjmped back. */
5737 if (pRExC_state->code_blocks && !code_blocksv) {
5738 code_blocksv = newSV_type(SVt_PV);
5739 SAVEFREESV(code_blocksv);
5740 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5741 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5743 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5744 RExC_precomp = NULL;
5748 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5750 /* Here, finished first pass. Get rid of any added setjmp */
5756 PerlIO_printf(Perl_debug_log,
5757 "Required size %"IVdf" nodes\n"
5758 "Starting second pass (creation)\n",
5761 RExC_lastparse=NULL;
5764 /* The first pass could have found things that force Unicode semantics */
5765 if ((RExC_utf8 || RExC_uni_semantics)
5766 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5768 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5771 /* Small enough for pointer-storage convention?
5772 If extralen==0, this means that we will not need long jumps. */
5773 if (RExC_size >= 0x10000L && RExC_extralen)
5774 RExC_size += RExC_extralen;
5777 if (RExC_whilem_seen > 15)
5778 RExC_whilem_seen = 15;
5780 /* Allocate space and zero-initialize. Note, the two step process
5781 of zeroing when in debug mode, thus anything assigned has to
5782 happen after that */
5783 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5785 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5786 char, regexp_internal);
5787 if ( r == NULL || ri == NULL )
5788 FAIL("Regexp out of space");
5790 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5791 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5793 /* bulk initialize base fields with 0. */
5794 Zero(ri, sizeof(regexp_internal), char);
5797 /* non-zero initialization begins here */
5800 r->extflags = rx_flags;
5801 if (pm_flags & PMf_IS_QR) {
5802 ri->code_blocks = pRExC_state->code_blocks;
5803 ri->num_code_blocks = pRExC_state->num_code_blocks;
5808 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5809 if (pRExC_state->code_blocks[n].src_regex)
5810 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5811 SAVEFREEPV(pRExC_state->code_blocks);
5815 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5816 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5818 /* The caret is output if there are any defaults: if not all the STD
5819 * flags are set, or if no character set specifier is needed */
5821 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5823 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5824 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5825 >> RXf_PMf_STD_PMMOD_SHIFT);
5826 const char *fptr = STD_PAT_MODS; /*"msix"*/
5828 /* Allocate for the worst case, which is all the std flags are turned
5829 * on. If more precision is desired, we could do a population count of
5830 * the flags set. This could be done with a small lookup table, or by
5831 * shifting, masking and adding, or even, when available, assembly
5832 * language for a machine-language population count.
5833 * We never output a minus, as all those are defaults, so are
5834 * covered by the caret */
5835 const STRLEN wraplen = plen + has_p + has_runon
5836 + has_default /* If needs a caret */
5838 /* If needs a character set specifier */
5839 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5840 + (sizeof(STD_PAT_MODS) - 1)
5841 + (sizeof("(?:)") - 1);
5843 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5844 r->xpv_len_u.xpvlenu_pv = p;
5846 SvFLAGS(rx) |= SVf_UTF8;
5849 /* If a default, cover it using the caret */
5851 *p++= DEFAULT_PAT_MOD;
5855 const char* const name = get_regex_charset_name(r->extflags, &len);
5856 Copy(name, p, len, char);
5860 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5863 while((ch = *fptr++)) {
5871 Copy(RExC_precomp, p, plen, char);
5872 assert ((RX_WRAPPED(rx) - p) < 16);
5873 r->pre_prefix = p - RX_WRAPPED(rx);
5879 SvCUR_set(rx, p - RX_WRAPPED(rx));
5883 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5885 if (RExC_seen & REG_SEEN_RECURSE) {
5886 Newxz(RExC_open_parens, RExC_npar,regnode *);
5887 SAVEFREEPV(RExC_open_parens);
5888 Newxz(RExC_close_parens,RExC_npar,regnode *);
5889 SAVEFREEPV(RExC_close_parens);
5892 /* Useful during FAIL. */
5893 #ifdef RE_TRACK_PATTERN_OFFSETS
5894 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5895 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5896 "%s %"UVuf" bytes for offset annotations.\n",
5897 ri->u.offsets ? "Got" : "Couldn't get",
5898 (UV)((2*RExC_size+1) * sizeof(U32))));
5900 SetProgLen(ri,RExC_size);
5905 /* Second pass: emit code. */
5906 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5907 RExC_pm_flags = pm_flags;
5912 RExC_emit_start = ri->program;
5913 RExC_emit = ri->program;
5914 RExC_emit_bound = ri->program + RExC_size + 1;
5915 pRExC_state->code_index = 0;
5917 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5918 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5922 /* XXXX To minimize changes to RE engine we always allocate
5923 3-units-long substrs field. */
5924 Newx(r->substrs, 1, struct reg_substr_data);
5925 if (RExC_recurse_count) {
5926 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5927 SAVEFREEPV(RExC_recurse);
5931 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5932 Zero(r->substrs, 1, struct reg_substr_data);
5934 #ifdef TRIE_STUDY_OPT
5936 StructCopy(&zero_scan_data, &data, scan_data_t);
5937 copyRExC_state = RExC_state;
5940 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5942 RExC_state = copyRExC_state;
5943 if (seen & REG_TOP_LEVEL_BRANCHES)
5944 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5946 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5947 StructCopy(&zero_scan_data, &data, scan_data_t);
5950 StructCopy(&zero_scan_data, &data, scan_data_t);
5953 /* Dig out information for optimizations. */
5954 r->extflags = RExC_flags; /* was pm_op */
5955 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5958 SvUTF8_on(rx); /* Unicode in it? */
5959 ri->regstclass = NULL;
5960 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5961 r->intflags |= PREGf_NAUGHTY;
5962 scan = ri->program + 1; /* First BRANCH. */
5964 /* testing for BRANCH here tells us whether there is "must appear"
5965 data in the pattern. If there is then we can use it for optimisations */
5966 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5968 STRLEN longest_float_length, longest_fixed_length;
5969 struct regnode_charclass_class ch_class; /* pointed to by data */
5971 I32 last_close = 0; /* pointed to by data */
5972 regnode *first= scan;
5973 regnode *first_next= regnext(first);
5975 * Skip introductions and multiplicators >= 1
5976 * so that we can extract the 'meat' of the pattern that must
5977 * match in the large if() sequence following.
5978 * NOTE that EXACT is NOT covered here, as it is normally
5979 * picked up by the optimiser separately.
5981 * This is unfortunate as the optimiser isnt handling lookahead
5982 * properly currently.
5985 while ((OP(first) == OPEN && (sawopen = 1)) ||
5986 /* An OR of *one* alternative - should not happen now. */
5987 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5988 /* for now we can't handle lookbehind IFMATCH*/
5989 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5990 (OP(first) == PLUS) ||
5991 (OP(first) == MINMOD) ||
5992 /* An {n,m} with n>0 */
5993 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5994 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5997 * the only op that could be a regnode is PLUS, all the rest
5998 * will be regnode_1 or regnode_2.
6001 if (OP(first) == PLUS)
6004 first += regarglen[OP(first)];
6006 first = NEXTOPER(first);
6007 first_next= regnext(first);
6010 /* Starting-point info. */
6012 DEBUG_PEEP("first:",first,0);
6013 /* Ignore EXACT as we deal with it later. */
6014 if (PL_regkind[OP(first)] == EXACT) {
6015 if (OP(first) == EXACT)
6016 NOOP; /* Empty, get anchored substr later. */
6018 ri->regstclass = first;
6021 else if (PL_regkind[OP(first)] == TRIE &&
6022 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6025 /* this can happen only on restudy */
6026 if ( OP(first) == TRIE ) {
6027 struct regnode_1 *trieop = (struct regnode_1 *)
6028 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6029 StructCopy(first,trieop,struct regnode_1);
6030 trie_op=(regnode *)trieop;
6032 struct regnode_charclass *trieop = (struct regnode_charclass *)
6033 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6034 StructCopy(first,trieop,struct regnode_charclass);
6035 trie_op=(regnode *)trieop;
6038 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6039 ri->regstclass = trie_op;
6042 else if (REGNODE_SIMPLE(OP(first)))
6043 ri->regstclass = first;
6044 else if (PL_regkind[OP(first)] == BOUND ||
6045 PL_regkind[OP(first)] == NBOUND)
6046 ri->regstclass = first;
6047 else if (PL_regkind[OP(first)] == BOL) {
6048 r->extflags |= (OP(first) == MBOL
6050 : (OP(first) == SBOL
6053 first = NEXTOPER(first);
6056 else if (OP(first) == GPOS) {
6057 r->extflags |= RXf_ANCH_GPOS;
6058 first = NEXTOPER(first);
6061 else if ((!sawopen || !RExC_sawback) &&
6062 (OP(first) == STAR &&
6063 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6064 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6066 /* turn .* into ^.* with an implied $*=1 */
6068 (OP(NEXTOPER(first)) == REG_ANY)
6071 r->extflags |= type;
6072 r->intflags |= PREGf_IMPLICIT;
6073 first = NEXTOPER(first);
6076 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6077 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6078 /* x+ must match at the 1st pos of run of x's */
6079 r->intflags |= PREGf_SKIP;
6081 /* Scan is after the zeroth branch, first is atomic matcher. */
6082 #ifdef TRIE_STUDY_OPT
6085 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6086 (IV)(first - scan + 1))
6090 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6091 (IV)(first - scan + 1))
6097 * If there's something expensive in the r.e., find the
6098 * longest literal string that must appear and make it the
6099 * regmust. Resolve ties in favor of later strings, since
6100 * the regstart check works with the beginning of the r.e.
6101 * and avoiding duplication strengthens checking. Not a
6102 * strong reason, but sufficient in the absence of others.
6103 * [Now we resolve ties in favor of the earlier string if
6104 * it happens that c_offset_min has been invalidated, since the
6105 * earlier string may buy us something the later one won't.]
6108 data.longest_fixed = newSVpvs("");
6109 data.longest_float = newSVpvs("");
6110 data.last_found = newSVpvs("");
6111 data.longest = &(data.longest_fixed);
6112 ENTER_with_name("study_chunk");
6113 SAVEFREESV(data.longest_fixed);
6114 SAVEFREESV(data.longest_float);
6115 SAVEFREESV(data.last_found);
6117 if (!ri->regstclass) {
6118 cl_init(pRExC_state, &ch_class);
6119 data.start_class = &ch_class;
6120 stclass_flag = SCF_DO_STCLASS_AND;
6121 } else /* XXXX Check for BOUND? */
6123 data.last_closep = &last_close;
6125 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6126 &data, -1, NULL, NULL,
6127 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6130 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6133 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6134 && data.last_start_min == 0 && data.last_end > 0
6135 && !RExC_seen_zerolen
6136 && !(RExC_seen & REG_SEEN_VERBARG)
6137 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6138 r->extflags |= RXf_CHECK_ALL;
6139 scan_commit(pRExC_state, &data,&minlen,0);
6141 longest_float_length = CHR_SVLEN(data.longest_float);
6143 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6144 && data.offset_fixed == data.offset_float_min
6145 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6146 && S_setup_longest (aTHX_ pRExC_state,
6150 &(r->float_end_shift),
6151 data.lookbehind_float,
6152 data.offset_float_min,
6154 longest_float_length,
6155 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6156 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6158 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6159 r->float_max_offset = data.offset_float_max;
6160 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6161 r->float_max_offset -= data.lookbehind_float;
6162 SvREFCNT_inc_simple_void_NN(data.longest_float);
6165 r->float_substr = r->float_utf8 = NULL;
6166 longest_float_length = 0;
6169 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6171 if (S_setup_longest (aTHX_ pRExC_state,
6173 &(r->anchored_utf8),
6174 &(r->anchored_substr),
6175 &(r->anchored_end_shift),
6176 data.lookbehind_fixed,
6179 longest_fixed_length,
6180 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6181 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6183 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6184 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6187 r->anchored_substr = r->anchored_utf8 = NULL;
6188 longest_fixed_length = 0;
6190 LEAVE_with_name("study_chunk");
6193 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6194 ri->regstclass = NULL;
6196 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6198 && ! TEST_SSC_EOS(data.start_class)
6199 && !cl_is_anything(data.start_class))
6201 const U32 n = add_data(pRExC_state, 1, "f");
6202 OP(data.start_class) = ANYOF_SYNTHETIC;
6204 Newx(RExC_rxi->data->data[n], 1,
6205 struct regnode_charclass_class);
6206 StructCopy(data.start_class,
6207 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6208 struct regnode_charclass_class);
6209 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6210 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6211 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6212 regprop(r, sv, (regnode*)data.start_class);
6213 PerlIO_printf(Perl_debug_log,
6214 "synthetic stclass \"%s\".\n",
6215 SvPVX_const(sv));});
6218 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6219 if (longest_fixed_length > longest_float_length) {
6220 r->check_end_shift = r->anchored_end_shift;
6221 r->check_substr = r->anchored_substr;
6222 r->check_utf8 = r->anchored_utf8;
6223 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6224 if (r->extflags & RXf_ANCH_SINGLE)
6225 r->extflags |= RXf_NOSCAN;
6228 r->check_end_shift = r->float_end_shift;
6229 r->check_substr = r->float_substr;
6230 r->check_utf8 = r->float_utf8;
6231 r->check_offset_min = r->float_min_offset;
6232 r->check_offset_max = r->float_max_offset;
6234 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6235 This should be changed ASAP! */
6236 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6237 r->extflags |= RXf_USE_INTUIT;
6238 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6239 r->extflags |= RXf_INTUIT_TAIL;
6241 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6242 if ( (STRLEN)minlen < longest_float_length )
6243 minlen= longest_float_length;
6244 if ( (STRLEN)minlen < longest_fixed_length )
6245 minlen= longest_fixed_length;
6249 /* Several toplevels. Best we can is to set minlen. */
6251 struct regnode_charclass_class ch_class;
6254 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6256 scan = ri->program + 1;
6257 cl_init(pRExC_state, &ch_class);
6258 data.start_class = &ch_class;
6259 data.last_closep = &last_close;
6262 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6263 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6265 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6267 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6268 = r->float_substr = r->float_utf8 = NULL;
6270 if (! TEST_SSC_EOS(data.start_class)
6271 && !cl_is_anything(data.start_class))
6273 const U32 n = add_data(pRExC_state, 1, "f");
6274 OP(data.start_class) = ANYOF_SYNTHETIC;
6276 Newx(RExC_rxi->data->data[n], 1,
6277 struct regnode_charclass_class);
6278 StructCopy(data.start_class,
6279 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6280 struct regnode_charclass_class);
6281 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6282 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6283 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6284 regprop(r, sv, (regnode*)data.start_class);
6285 PerlIO_printf(Perl_debug_log,
6286 "synthetic stclass \"%s\".\n",
6287 SvPVX_const(sv));});
6291 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6292 the "real" pattern. */
6294 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6295 (IV)minlen, (IV)r->minlen);
6297 r->minlenret = minlen;
6298 if (r->minlen < minlen)
6301 if (RExC_seen & REG_SEEN_GPOS)
6302 r->extflags |= RXf_GPOS_SEEN;
6303 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6304 r->extflags |= RXf_LOOKBEHIND_SEEN;
6305 if (pRExC_state->num_code_blocks)
6306 r->extflags |= RXf_EVAL_SEEN;
6307 if (RExC_seen & REG_SEEN_CANY)
6308 r->extflags |= RXf_CANY_SEEN;
6309 if (RExC_seen & REG_SEEN_VERBARG)
6311 r->intflags |= PREGf_VERBARG_SEEN;
6312 r->extflags |= RXf_MODIFIES_VARS;
6314 if (RExC_seen & REG_SEEN_CUTGROUP)
6315 r->intflags |= PREGf_CUTGROUP_SEEN;
6316 if (pm_flags & PMf_USE_RE_EVAL)
6317 r->intflags |= PREGf_USE_RE_EVAL;
6318 if (RExC_paren_names)
6319 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6321 RXp_PAREN_NAMES(r) = NULL;
6323 #ifdef STUPID_PATTERN_CHECKS
6324 if (RX_PRELEN(rx) == 0)
6325 r->extflags |= RXf_NULL;
6326 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6327 r->extflags |= RXf_WHITE;
6328 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6329 r->extflags |= RXf_START_ONLY;
6332 regnode *first = ri->program + 1;
6335 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6336 r->extflags |= RXf_NULL;
6337 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6338 r->extflags |= RXf_START_ONLY;
6339 else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6340 && OP(regnext(first)) == END)
6341 r->extflags |= RXf_WHITE;
6345 if (RExC_paren_names) {
6346 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6347 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6350 ri->name_list_idx = 0;
6352 if (RExC_recurse_count) {
6353 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6354 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6355 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6358 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6359 /* assume we don't need to swap parens around before we match */
6362 PerlIO_printf(Perl_debug_log,"Final program:\n");
6365 #ifdef RE_TRACK_PATTERN_OFFSETS
6366 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6367 const U32 len = ri->u.offsets[0];
6369 GET_RE_DEBUG_FLAGS_DECL;
6370 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6371 for (i = 1; i <= len; i++) {
6372 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6373 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6374 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6376 PerlIO_printf(Perl_debug_log, "\n");
6381 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6382 * by setting the regexp SV to readonly-only instead. If the
6383 * pattern's been recompiled, the USEDness should remain. */
6384 if (old_re && SvREADONLY(old_re))
6392 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6395 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6397 PERL_UNUSED_ARG(value);
6399 if (flags & RXapif_FETCH) {
6400 return reg_named_buff_fetch(rx, key, flags);
6401 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6402 Perl_croak_no_modify();
6404 } else if (flags & RXapif_EXISTS) {
6405 return reg_named_buff_exists(rx, key, flags)
6408 } else if (flags & RXapif_REGNAMES) {
6409 return reg_named_buff_all(rx, flags);
6410 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6411 return reg_named_buff_scalar(rx, flags);
6413 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6419 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6422 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6423 PERL_UNUSED_ARG(lastkey);
6425 if (flags & RXapif_FIRSTKEY)
6426 return reg_named_buff_firstkey(rx, flags);
6427 else if (flags & RXapif_NEXTKEY)
6428 return reg_named_buff_nextkey(rx, flags);
6430 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6436 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6439 AV *retarray = NULL;
6441 struct regexp *const rx = ReANY(r);
6443 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6445 if (flags & RXapif_ALL)
6448 if (rx && RXp_PAREN_NAMES(rx)) {
6449 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6452 SV* sv_dat=HeVAL(he_str);
6453 I32 *nums=(I32*)SvPVX(sv_dat);
6454 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6455 if ((I32)(rx->nparens) >= nums[i]
6456 && rx->offs[nums[i]].start != -1
6457 && rx->offs[nums[i]].end != -1)
6460 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6465 ret = newSVsv(&PL_sv_undef);
6468 av_push(retarray, ret);
6471 return newRV_noinc(MUTABLE_SV(retarray));
6478 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6481 struct regexp *const rx = ReANY(r);
6483 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6485 if (rx && RXp_PAREN_NAMES(rx)) {
6486 if (flags & RXapif_ALL) {
6487 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6489 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6491 SvREFCNT_dec_NN(sv);
6503 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6505 struct regexp *const rx = ReANY(r);
6507 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6509 if ( rx && RXp_PAREN_NAMES(rx) ) {
6510 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6512 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6519 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6521 struct regexp *const rx = ReANY(r);
6522 GET_RE_DEBUG_FLAGS_DECL;
6524 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6526 if (rx && RXp_PAREN_NAMES(rx)) {
6527 HV *hv = RXp_PAREN_NAMES(rx);
6529 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6532 SV* sv_dat = HeVAL(temphe);
6533 I32 *nums = (I32*)SvPVX(sv_dat);
6534 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6535 if ((I32)(rx->lastparen) >= nums[i] &&
6536 rx->offs[nums[i]].start != -1 &&
6537 rx->offs[nums[i]].end != -1)
6543 if (parno || flags & RXapif_ALL) {
6544 return newSVhek(HeKEY_hek(temphe));
6552 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6557 struct regexp *const rx = ReANY(r);
6559 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6561 if (rx && RXp_PAREN_NAMES(rx)) {
6562 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6563 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6564 } else if (flags & RXapif_ONE) {
6565 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6566 av = MUTABLE_AV(SvRV(ret));
6567 length = av_len(av);
6568 SvREFCNT_dec_NN(ret);
6569 return newSViv(length + 1);
6571 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6575 return &PL_sv_undef;
6579 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6581 struct regexp *const rx = ReANY(r);
6584 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6586 if (rx && RXp_PAREN_NAMES(rx)) {
6587 HV *hv= RXp_PAREN_NAMES(rx);
6589 (void)hv_iterinit(hv);
6590 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6593 SV* sv_dat = HeVAL(temphe);
6594 I32 *nums = (I32*)SvPVX(sv_dat);
6595 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6596 if ((I32)(rx->lastparen) >= nums[i] &&
6597 rx->offs[nums[i]].start != -1 &&
6598 rx->offs[nums[i]].end != -1)
6604 if (parno || flags & RXapif_ALL) {
6605 av_push(av, newSVhek(HeKEY_hek(temphe)));
6610 return newRV_noinc(MUTABLE_SV(av));
6614 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6617 struct regexp *const rx = ReANY(r);
6623 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6625 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6626 || n == RX_BUFF_IDX_CARET_FULLMATCH
6627 || n == RX_BUFF_IDX_CARET_POSTMATCH
6629 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6636 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6637 /* no need to distinguish between them any more */
6638 n = RX_BUFF_IDX_FULLMATCH;
6640 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6641 && rx->offs[0].start != -1)
6643 /* $`, ${^PREMATCH} */
6644 i = rx->offs[0].start;
6648 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6649 && rx->offs[0].end != -1)
6651 /* $', ${^POSTMATCH} */
6652 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6653 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6656 if ( 0 <= n && n <= (I32)rx->nparens &&
6657 (s1 = rx->offs[n].start) != -1 &&
6658 (t1 = rx->offs[n].end) != -1)
6660 /* $&, ${^MATCH}, $1 ... */
6662 s = rx->subbeg + s1 - rx->suboffset;
6667 assert(s >= rx->subbeg);
6668 assert(rx->sublen >= (s - rx->subbeg) + i );
6670 #if NO_TAINT_SUPPORT
6671 sv_setpvn(sv, s, i);
6673 const int oldtainted = TAINT_get;
6675 sv_setpvn(sv, s, i);
6676 TAINT_set(oldtainted);
6678 if ( (rx->extflags & RXf_CANY_SEEN)
6679 ? (RXp_MATCH_UTF8(rx)
6680 && (!i || is_utf8_string((U8*)s, i)))
6681 : (RXp_MATCH_UTF8(rx)) )
6688 if (RXp_MATCH_TAINTED(rx)) {
6689 if (SvTYPE(sv) >= SVt_PVMG) {
6690 MAGIC* const mg = SvMAGIC(sv);
6693 SvMAGIC_set(sv, mg->mg_moremagic);
6695 if ((mgt = SvMAGIC(sv))) {
6696 mg->mg_moremagic = mgt;
6697 SvMAGIC_set(sv, mg);
6708 sv_setsv(sv,&PL_sv_undef);
6714 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6715 SV const * const value)
6717 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6719 PERL_UNUSED_ARG(rx);
6720 PERL_UNUSED_ARG(paren);
6721 PERL_UNUSED_ARG(value);
6724 Perl_croak_no_modify();
6728 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6731 struct regexp *const rx = ReANY(r);
6735 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6737 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6739 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6740 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6744 case RX_BUFF_IDX_PREMATCH: /* $` */
6745 if (rx->offs[0].start != -1) {
6746 i = rx->offs[0].start;
6755 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6756 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6758 case RX_BUFF_IDX_POSTMATCH: /* $' */
6759 if (rx->offs[0].end != -1) {
6760 i = rx->sublen - rx->offs[0].end;
6762 s1 = rx->offs[0].end;
6769 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6770 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6774 /* $& / ${^MATCH}, $1, $2, ... */
6776 if (paren <= (I32)rx->nparens &&
6777 (s1 = rx->offs[paren].start) != -1 &&
6778 (t1 = rx->offs[paren].end) != -1)
6784 if (ckWARN(WARN_UNINITIALIZED))
6785 report_uninit((const SV *)sv);
6790 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6791 const char * const s = rx->subbeg - rx->suboffset + s1;
6796 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6803 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6805 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6806 PERL_UNUSED_ARG(rx);
6810 return newSVpvs("Regexp");
6813 /* Scans the name of a named buffer from the pattern.
6814 * If flags is REG_RSN_RETURN_NULL returns null.
6815 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6816 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6817 * to the parsed name as looked up in the RExC_paren_names hash.
6818 * If there is an error throws a vFAIL().. type exception.
6821 #define REG_RSN_RETURN_NULL 0
6822 #define REG_RSN_RETURN_NAME 1
6823 #define REG_RSN_RETURN_DATA 2
6826 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6828 char *name_start = RExC_parse;
6830 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6832 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6833 /* skip IDFIRST by using do...while */
6836 RExC_parse += UTF8SKIP(RExC_parse);
6837 } while (isWORDCHAR_utf8((U8*)RExC_parse));
6841 } while (isWORDCHAR(*RExC_parse));
6843 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6844 vFAIL("Group name must start with a non-digit word character");
6848 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6849 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6850 if ( flags == REG_RSN_RETURN_NAME)
6852 else if (flags==REG_RSN_RETURN_DATA) {
6855 if ( ! sv_name ) /* should not happen*/
6856 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6857 if (RExC_paren_names)
6858 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6860 sv_dat = HeVAL(he_str);
6862 vFAIL("Reference to nonexistent named group");
6866 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6867 (unsigned long) flags);
6869 assert(0); /* NOT REACHED */
6874 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6875 int rem=(int)(RExC_end - RExC_parse); \
6884 if (RExC_lastparse!=RExC_parse) \
6885 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6888 iscut ? "..." : "<" \
6891 PerlIO_printf(Perl_debug_log,"%16s",""); \
6894 num = RExC_size + 1; \
6896 num=REG_NODE_NUM(RExC_emit); \
6897 if (RExC_lastnum!=num) \
6898 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6900 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6901 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6902 (int)((depth*2)), "", \
6906 RExC_lastparse=RExC_parse; \
6911 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6912 DEBUG_PARSE_MSG((funcname)); \
6913 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6915 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6916 DEBUG_PARSE_MSG((funcname)); \
6917 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6920 /* This section of code defines the inversion list object and its methods. The
6921 * interfaces are highly subject to change, so as much as possible is static to
6922 * this file. An inversion list is here implemented as a malloc'd C UV array
6923 * with some added info that is placed as UVs at the beginning in a header
6924 * portion. An inversion list for Unicode is an array of code points, sorted
6925 * by ordinal number. The zeroth element is the first code point in the list.
6926 * The 1th element is the first element beyond that not in the list. In other
6927 * words, the first range is
6928 * invlist[0]..(invlist[1]-1)
6929 * The other ranges follow. Thus every element whose index is divisible by two
6930 * marks the beginning of a range that is in the list, and every element not
6931 * divisible by two marks the beginning of a range not in the list. A single
6932 * element inversion list that contains the single code point N generally
6933 * consists of two elements
6936 * (The exception is when N is the highest representable value on the
6937 * machine, in which case the list containing just it would be a single
6938 * element, itself. By extension, if the last range in the list extends to
6939 * infinity, then the first element of that range will be in the inversion list
6940 * at a position that is divisible by two, and is the final element in the
6942 * Taking the complement (inverting) an inversion list is quite simple, if the
6943 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6944 * This implementation reserves an element at the beginning of each inversion
6945 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
6946 * actual beginning of the list is either that element if 0, or the next one if
6949 * More about inversion lists can be found in "Unicode Demystified"
6950 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6951 * More will be coming when functionality is added later.
6953 * The inversion list data structure is currently implemented as an SV pointing
6954 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6955 * array of UV whose memory management is automatically handled by the existing
6956 * facilities for SV's.
6958 * Some of the methods should always be private to the implementation, and some
6959 * should eventually be made public */
6961 /* The header definitions are in F<inline_invlist.c> */
6962 #define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
6963 #define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
6965 #define INVLIST_INITIAL_LEN 10
6967 PERL_STATIC_INLINE UV*
6968 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6970 /* Returns a pointer to the first element in the inversion list's array.
6971 * This is called upon initialization of an inversion list. Where the
6972 * array begins depends on whether the list has the code point U+0000
6973 * in it or not. The other parameter tells it whether the code that
6974 * follows this call is about to put a 0 in the inversion list or not.
6975 * The first element is either the element with 0, if 0, or the next one,
6978 UV* zero = get_invlist_zero_addr(invlist);
6980 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6983 assert(! *_get_invlist_len_addr(invlist));
6985 /* 1^1 = 0; 1^0 = 1 */
6986 *zero = 1 ^ will_have_0;
6987 return zero + *zero;
6990 PERL_STATIC_INLINE UV*
6991 S_invlist_array(pTHX_ SV* const invlist)
6993 /* Returns the pointer to the inversion list's array. Every time the
6994 * length changes, this needs to be called in case malloc or realloc moved
6997 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6999 /* Must not be empty. If these fail, you probably didn't check for <len>
7000 * being non-zero before trying to get the array */
7001 assert(*_get_invlist_len_addr(invlist));
7002 assert(*get_invlist_zero_addr(invlist) == 0
7003 || *get_invlist_zero_addr(invlist) == 1);
7005 /* The array begins either at the element reserved for zero if the
7006 * list contains 0 (that element will be set to 0), or otherwise the next
7007 * element (in which case the reserved element will be set to 1). */
7008 return (UV *) (get_invlist_zero_addr(invlist)
7009 + *get_invlist_zero_addr(invlist));
7012 PERL_STATIC_INLINE void
7013 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7015 /* Sets the current number of elements stored in the inversion list */
7017 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7019 *_get_invlist_len_addr(invlist) = len;
7021 assert(len <= SvLEN(invlist));
7023 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7024 /* If the list contains U+0000, that element is part of the header,
7025 * and should not be counted as part of the array. It will contain
7026 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7028 * SvCUR_set(invlist,
7029 * TO_INTERNAL_SIZE(len
7030 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7031 * But, this is only valid if len is not 0. The consequences of not doing
7032 * this is that the memory allocation code may think that 1 more UV is
7033 * being used than actually is, and so might do an unnecessary grow. That
7034 * seems worth not bothering to make this the precise amount.
7036 * Note that when inverting, SvCUR shouldn't change */
7039 PERL_STATIC_INLINE IV*
7040 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7042 /* Return the address of the UV that is reserved to hold the cached index
7045 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7047 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7050 PERL_STATIC_INLINE IV
7051 S_invlist_previous_index(pTHX_ SV* const invlist)
7053 /* Returns cached index of previous search */
7055 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7057 return *get_invlist_previous_index_addr(invlist);
7060 PERL_STATIC_INLINE void
7061 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7063 /* Caches <index> for later retrieval */
7065 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7067 assert(index == 0 || index < (int) _invlist_len(invlist));
7069 *get_invlist_previous_index_addr(invlist) = index;
7072 PERL_STATIC_INLINE UV
7073 S_invlist_max(pTHX_ SV* const invlist)
7075 /* Returns the maximum number of elements storable in the inversion list's
7076 * array, without having to realloc() */
7078 PERL_ARGS_ASSERT_INVLIST_MAX;
7080 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7081 ? _invlist_len(invlist)
7082 : FROM_INTERNAL_SIZE(SvLEN(invlist));
7085 PERL_STATIC_INLINE UV*
7086 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7088 /* Return the address of the UV that is reserved to hold 0 if the inversion
7089 * list contains 0. This has to be the last element of the heading, as the
7090 * list proper starts with either it if 0, or the next element if not.
7091 * (But we force it to contain either 0 or 1) */
7093 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7095 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7098 #ifndef PERL_IN_XSUB_RE
7100 Perl__new_invlist(pTHX_ IV initial_size)
7103 /* Return a pointer to a newly constructed inversion list, with enough
7104 * space to store 'initial_size' elements. If that number is negative, a
7105 * system default is used instead */
7109 if (initial_size < 0) {
7110 initial_size = INVLIST_INITIAL_LEN;
7113 /* Allocate the initial space */
7114 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7115 invlist_set_len(new_list, 0);
7117 /* Force iterinit() to be used to get iteration to work */
7118 *get_invlist_iter_addr(new_list) = UV_MAX;
7120 /* This should force a segfault if a method doesn't initialize this
7122 *get_invlist_zero_addr(new_list) = UV_MAX;
7124 *get_invlist_previous_index_addr(new_list) = 0;
7125 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7126 #if HEADER_LENGTH != 5
7127 # error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7135 S__new_invlist_C_array(pTHX_ UV* list)
7137 /* Return a pointer to a newly constructed inversion list, initialized to
7138 * point to <list>, which has to be in the exact correct inversion list
7139 * form, including internal fields. Thus this is a dangerous routine that
7140 * should not be used in the wrong hands */
7142 SV* invlist = newSV_type(SVt_PV);
7144 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7146 SvPV_set(invlist, (char *) list);
7147 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7148 shouldn't touch it */
7149 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7151 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7152 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7155 /* Initialize the iteration pointer.
7156 * XXX This could be done at compile time in charclass_invlists.h, but I
7157 * (khw) am not confident that the suffixes for specifying the C constant
7158 * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured
7159 * to use 64 bits; might need a Configure probe */
7160 invlist_iterfinish(invlist);
7166 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7168 /* Grow the maximum size of an inversion list */
7170 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7172 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7175 PERL_STATIC_INLINE void
7176 S_invlist_trim(pTHX_ SV* const invlist)
7178 PERL_ARGS_ASSERT_INVLIST_TRIM;
7180 /* Change the length of the inversion list to how many entries it currently
7183 SvPV_shrink_to_cur((SV *) invlist);
7186 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7189 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7191 /* Subject to change or removal. Append the range from 'start' to 'end' at
7192 * the end of the inversion list. The range must be above any existing
7196 UV max = invlist_max(invlist);
7197 UV len = _invlist_len(invlist);
7199 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7201 if (len == 0) { /* Empty lists must be initialized */
7202 array = _invlist_array_init(invlist, start == 0);
7205 /* Here, the existing list is non-empty. The current max entry in the
7206 * list is generally the first value not in the set, except when the
7207 * set extends to the end of permissible values, in which case it is
7208 * the first entry in that final set, and so this call is an attempt to
7209 * append out-of-order */
7211 UV final_element = len - 1;
7212 array = invlist_array(invlist);
7213 if (array[final_element] > start
7214 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7216 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7217 array[final_element], start,
7218 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7221 /* Here, it is a legal append. If the new range begins with the first
7222 * value not in the set, it is extending the set, so the new first
7223 * value not in the set is one greater than the newly extended range.
7225 if (array[final_element] == start) {
7226 if (end != UV_MAX) {
7227 array[final_element] = end + 1;
7230 /* But if the end is the maximum representable on the machine,
7231 * just let the range that this would extend to have no end */
7232 invlist_set_len(invlist, len - 1);
7238 /* Here the new range doesn't extend any existing set. Add it */
7240 len += 2; /* Includes an element each for the start and end of range */
7242 /* If overflows the existing space, extend, which may cause the array to be
7245 invlist_extend(invlist, len);
7246 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7247 failure in invlist_array() */
7248 array = invlist_array(invlist);
7251 invlist_set_len(invlist, len);
7254 /* The next item on the list starts the range, the one after that is
7255 * one past the new range. */
7256 array[len - 2] = start;
7257 if (end != UV_MAX) {
7258 array[len - 1] = end + 1;
7261 /* But if the end is the maximum representable on the machine, just let
7262 * the range have no end */
7263 invlist_set_len(invlist, len - 1);
7267 #ifndef PERL_IN_XSUB_RE
7270 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7272 /* Searches the inversion list for the entry that contains the input code
7273 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7274 * return value is the index into the list's array of the range that
7279 IV high = _invlist_len(invlist);
7280 const IV highest_element = high - 1;
7283 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7285 /* If list is empty, return failure. */
7290 /* (We can't get the array unless we know the list is non-empty) */
7291 array = invlist_array(invlist);
7293 mid = invlist_previous_index(invlist);
7294 assert(mid >=0 && mid <= highest_element);
7296 /* <mid> contains the cache of the result of the previous call to this
7297 * function (0 the first time). See if this call is for the same result,
7298 * or if it is for mid-1. This is under the theory that calls to this
7299 * function will often be for related code points that are near each other.
7300 * And benchmarks show that caching gives better results. We also test
7301 * here if the code point is within the bounds of the list. These tests
7302 * replace others that would have had to be made anyway to make sure that
7303 * the array bounds were not exceeded, and these give us extra information
7304 * at the same time */
7305 if (cp >= array[mid]) {
7306 if (cp >= array[highest_element]) {
7307 return highest_element;
7310 /* Here, array[mid] <= cp < array[highest_element]. This means that
7311 * the final element is not the answer, so can exclude it; it also
7312 * means that <mid> is not the final element, so can refer to 'mid + 1'
7314 if (cp < array[mid + 1]) {
7320 else { /* cp < aray[mid] */
7321 if (cp < array[0]) { /* Fail if outside the array */
7325 if (cp >= array[mid - 1]) {
7330 /* Binary search. What we are looking for is <i> such that
7331 * array[i] <= cp < array[i+1]
7332 * The loop below converges on the i+1. Note that there may not be an
7333 * (i+1)th element in the array, and things work nonetheless */
7334 while (low < high) {
7335 mid = (low + high) / 2;
7336 assert(mid <= highest_element);
7337 if (array[mid] <= cp) { /* cp >= array[mid] */
7340 /* We could do this extra test to exit the loop early.
7341 if (cp < array[low]) {
7346 else { /* cp < array[mid] */
7353 invlist_set_previous_index(invlist, high);
7358 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7360 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7361 * but is used when the swash has an inversion list. This makes this much
7362 * faster, as it uses a binary search instead of a linear one. This is
7363 * intimately tied to that function, and perhaps should be in utf8.c,
7364 * except it is intimately tied to inversion lists as well. It assumes
7365 * that <swatch> is all 0's on input */
7368 const IV len = _invlist_len(invlist);
7372 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7374 if (len == 0) { /* Empty inversion list */
7378 array = invlist_array(invlist);
7380 /* Find which element it is */
7381 i = _invlist_search(invlist, start);
7383 /* We populate from <start> to <end> */
7384 while (current < end) {
7387 /* The inversion list gives the results for every possible code point
7388 * after the first one in the list. Only those ranges whose index is
7389 * even are ones that the inversion list matches. For the odd ones,
7390 * and if the initial code point is not in the list, we have to skip
7391 * forward to the next element */
7392 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7394 if (i >= len) { /* Finished if beyond the end of the array */
7398 if (current >= end) { /* Finished if beyond the end of what we
7400 if (LIKELY(end < UV_MAX)) {
7404 /* We get here when the upper bound is the maximum
7405 * representable on the machine, and we are looking for just
7406 * that code point. Have to special case it */
7408 goto join_end_of_list;
7411 assert(current >= start);
7413 /* The current range ends one below the next one, except don't go past
7416 upper = (i < len && array[i] < end) ? array[i] : end;
7418 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7419 * for each code point in it */
7420 for (; current < upper; current++) {
7421 const STRLEN offset = (STRLEN)(current - start);
7422 swatch[offset >> 3] |= 1 << (offset & 7);
7427 /* Quit if at the end of the list */
7430 /* But first, have to deal with the highest possible code point on
7431 * the platform. The previous code assumes that <end> is one
7432 * beyond where we want to populate, but that is impossible at the
7433 * platform's infinity, so have to handle it specially */
7434 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7436 const STRLEN offset = (STRLEN)(end - start);
7437 swatch[offset >> 3] |= 1 << (offset & 7);
7442 /* Advance to the next range, which will be for code points not in the
7451 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7453 /* Take the union of two inversion lists and point <output> to it. *output
7454 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7455 * the reference count to that list will be decremented. The first list,
7456 * <a>, may be NULL, in which case a copy of the second list is returned.
7457 * If <complement_b> is TRUE, the union is taken of the complement
7458 * (inversion) of <b> instead of b itself.
7460 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7461 * Richard Gillam, published by Addison-Wesley, and explained at some
7462 * length there. The preface says to incorporate its examples into your
7463 * code at your own risk.
7465 * The algorithm is like a merge sort.
7467 * XXX A potential performance improvement is to keep track as we go along
7468 * if only one of the inputs contributes to the result, meaning the other
7469 * is a subset of that one. In that case, we can skip the final copy and
7470 * return the larger of the input lists, but then outside code might need
7471 * to keep track of whether to free the input list or not */
7473 UV* array_a; /* a's array */
7475 UV len_a; /* length of a's array */
7478 SV* u; /* the resulting union */
7482 UV i_a = 0; /* current index into a's array */
7486 /* running count, as explained in the algorithm source book; items are
7487 * stopped accumulating and are output when the count changes to/from 0.
7488 * The count is incremented when we start a range that's in the set, and
7489 * decremented when we start a range that's not in the set. So its range
7490 * is 0 to 2. Only when the count is zero is something not in the set.
7494 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7497 /* If either one is empty, the union is the other one */
7498 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7505 *output = invlist_clone(b);
7507 _invlist_invert(*output);
7509 } /* else *output already = b; */
7512 else if ((len_b = _invlist_len(b)) == 0) {
7517 /* The complement of an empty list is a list that has everything in it,
7518 * so the union with <a> includes everything too */
7523 *output = _new_invlist(1);
7524 _append_range_to_invlist(*output, 0, UV_MAX);
7526 else if (*output != a) {
7527 *output = invlist_clone(a);
7529 /* else *output already = a; */
7533 /* Here both lists exist and are non-empty */
7534 array_a = invlist_array(a);
7535 array_b = invlist_array(b);
7537 /* If are to take the union of 'a' with the complement of b, set it
7538 * up so are looking at b's complement. */
7541 /* To complement, we invert: if the first element is 0, remove it. To
7542 * do this, we just pretend the array starts one later, and clear the
7543 * flag as we don't have to do anything else later */
7544 if (array_b[0] == 0) {
7547 complement_b = FALSE;
7551 /* But if the first element is not zero, we unshift a 0 before the
7552 * array. The data structure reserves a space for that 0 (which
7553 * should be a '1' right now), so physical shifting is unneeded,
7554 * but temporarily change that element to 0. Before exiting the
7555 * routine, we must restore the element to '1' */
7562 /* Size the union for the worst case: that the sets are completely
7564 u = _new_invlist(len_a + len_b);
7566 /* Will contain U+0000 if either component does */
7567 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7568 || (len_b > 0 && array_b[0] == 0));
7570 /* Go through each list item by item, stopping when exhausted one of
7572 while (i_a < len_a && i_b < len_b) {
7573 UV cp; /* The element to potentially add to the union's array */
7574 bool cp_in_set; /* is it in the the input list's set or not */
7576 /* We need to take one or the other of the two inputs for the union.
7577 * Since we are merging two sorted lists, we take the smaller of the
7578 * next items. In case of a tie, we take the one that is in its set
7579 * first. If we took one not in the set first, it would decrement the
7580 * count, possibly to 0 which would cause it to be output as ending the
7581 * range, and the next time through we would take the same number, and
7582 * output it again as beginning the next range. By doing it the
7583 * opposite way, there is no possibility that the count will be
7584 * momentarily decremented to 0, and thus the two adjoining ranges will
7585 * be seamlessly merged. (In a tie and both are in the set or both not
7586 * in the set, it doesn't matter which we take first.) */
7587 if (array_a[i_a] < array_b[i_b]
7588 || (array_a[i_a] == array_b[i_b]
7589 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7591 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7595 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7596 cp = array_b[i_b++];
7599 /* Here, have chosen which of the two inputs to look at. Only output
7600 * if the running count changes to/from 0, which marks the
7601 * beginning/end of a range in that's in the set */
7604 array_u[i_u++] = cp;
7611 array_u[i_u++] = cp;
7616 /* Here, we are finished going through at least one of the lists, which
7617 * means there is something remaining in at most one. We check if the list
7618 * that hasn't been exhausted is positioned such that we are in the middle
7619 * of a range in its set or not. (i_a and i_b point to the element beyond
7620 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7621 * is potentially more to output.
7622 * There are four cases:
7623 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7624 * in the union is entirely from the non-exhausted set.
7625 * 2) Both were in their sets, count is 2. Nothing further should
7626 * be output, as everything that remains will be in the exhausted
7627 * list's set, hence in the union; decrementing to 1 but not 0 insures
7629 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7630 * Nothing further should be output because the union includes
7631 * everything from the exhausted set. Not decrementing ensures that.
7632 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7633 * decrementing to 0 insures that we look at the remainder of the
7634 * non-exhausted set */
7635 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7636 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7641 /* The final length is what we've output so far, plus what else is about to
7642 * be output. (If 'count' is non-zero, then the input list we exhausted
7643 * has everything remaining up to the machine's limit in its set, and hence
7644 * in the union, so there will be no further output. */
7647 /* At most one of the subexpressions will be non-zero */
7648 len_u += (len_a - i_a) + (len_b - i_b);
7651 /* Set result to final length, which can change the pointer to array_u, so
7653 if (len_u != _invlist_len(u)) {
7654 invlist_set_len(u, len_u);
7656 array_u = invlist_array(u);
7659 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7660 * the other) ended with everything above it not in its set. That means
7661 * that the remaining part of the union is precisely the same as the
7662 * non-exhausted list, so can just copy it unchanged. (If both list were
7663 * exhausted at the same time, then the operations below will be both 0.)
7666 IV copy_count; /* At most one will have a non-zero copy count */
7667 if ((copy_count = len_a - i_a) > 0) {
7668 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7670 else if ((copy_count = len_b - i_b) > 0) {
7671 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7675 /* If we've changed b, restore it */
7680 /* We may be removing a reference to one of the inputs */
7681 if (a == *output || b == *output) {
7682 assert(! invlist_is_iterating(*output));
7683 SvREFCNT_dec_NN(*output);
7691 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7693 /* Take the intersection of two inversion lists and point <i> to it. *i
7694 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7695 * the reference count to that list will be decremented.
7696 * If <complement_b> is TRUE, the result will be the intersection of <a>
7697 * and the complement (or inversion) of <b> instead of <b> directly.
7699 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7700 * Richard Gillam, published by Addison-Wesley, and explained at some
7701 * length there. The preface says to incorporate its examples into your
7702 * code at your own risk. In fact, it had bugs
7704 * The algorithm is like a merge sort, and is essentially the same as the
7708 UV* array_a; /* a's array */
7710 UV len_a; /* length of a's array */
7713 SV* r; /* the resulting intersection */
7717 UV i_a = 0; /* current index into a's array */
7721 /* running count, as explained in the algorithm source book; items are
7722 * stopped accumulating and are output when the count changes to/from 2.
7723 * The count is incremented when we start a range that's in the set, and
7724 * decremented when we start a range that's not in the set. So its range
7725 * is 0 to 2. Only when the count is 2 is something in the intersection.
7729 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7732 /* Special case if either one is empty */
7733 len_a = _invlist_len(a);
7734 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7736 if (len_a != 0 && complement_b) {
7738 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7739 * be empty. Here, also we are using 'b's complement, which hence
7740 * must be every possible code point. Thus the intersection is
7743 *i = invlist_clone(a);
7749 /* else *i is already 'a' */
7753 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7754 * intersection must be empty */
7761 *i = _new_invlist(0);
7765 /* Here both lists exist and are non-empty */
7766 array_a = invlist_array(a);
7767 array_b = invlist_array(b);
7769 /* If are to take the intersection of 'a' with the complement of b, set it
7770 * up so are looking at b's complement. */
7773 /* To complement, we invert: if the first element is 0, remove it. To
7774 * do this, we just pretend the array starts one later, and clear the
7775 * flag as we don't have to do anything else later */
7776 if (array_b[0] == 0) {
7779 complement_b = FALSE;
7783 /* But if the first element is not zero, we unshift a 0 before the
7784 * array. The data structure reserves a space for that 0 (which
7785 * should be a '1' right now), so physical shifting is unneeded,
7786 * but temporarily change that element to 0. Before exiting the
7787 * routine, we must restore the element to '1' */
7794 /* Size the intersection for the worst case: that the intersection ends up
7795 * fragmenting everything to be completely disjoint */
7796 r= _new_invlist(len_a + len_b);
7798 /* Will contain U+0000 iff both components do */
7799 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7800 && len_b > 0 && array_b[0] == 0);
7802 /* Go through each list item by item, stopping when exhausted one of
7804 while (i_a < len_a && i_b < len_b) {
7805 UV cp; /* The element to potentially add to the intersection's
7807 bool cp_in_set; /* Is it in the input list's set or not */
7809 /* We need to take one or the other of the two inputs for the
7810 * intersection. Since we are merging two sorted lists, we take the
7811 * smaller of the next items. In case of a tie, we take the one that
7812 * is not in its set first (a difference from the union algorithm). If
7813 * we took one in the set first, it would increment the count, possibly
7814 * to 2 which would cause it to be output as starting a range in the
7815 * intersection, and the next time through we would take that same
7816 * number, and output it again as ending the set. By doing it the
7817 * opposite of this, there is no possibility that the count will be
7818 * momentarily incremented to 2. (In a tie and both are in the set or
7819 * both not in the set, it doesn't matter which we take first.) */
7820 if (array_a[i_a] < array_b[i_b]
7821 || (array_a[i_a] == array_b[i_b]
7822 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7824 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7828 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7832 /* Here, have chosen which of the two inputs to look at. Only output
7833 * if the running count changes to/from 2, which marks the
7834 * beginning/end of a range that's in the intersection */
7838 array_r[i_r++] = cp;
7843 array_r[i_r++] = cp;
7849 /* Here, we are finished going through at least one of the lists, which
7850 * means there is something remaining in at most one. We check if the list
7851 * that has been exhausted is positioned such that we are in the middle
7852 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7853 * the ones we care about.) There are four cases:
7854 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7855 * nothing left in the intersection.
7856 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7857 * above 2. What should be output is exactly that which is in the
7858 * non-exhausted set, as everything it has is also in the intersection
7859 * set, and everything it doesn't have can't be in the intersection
7860 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7861 * gets incremented to 2. Like the previous case, the intersection is
7862 * everything that remains in the non-exhausted set.
7863 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7864 * remains 1. And the intersection has nothing more. */
7865 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7866 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7871 /* The final length is what we've output so far plus what else is in the
7872 * intersection. At most one of the subexpressions below will be non-zero */
7875 len_r += (len_a - i_a) + (len_b - i_b);
7878 /* Set result to final length, which can change the pointer to array_r, so
7880 if (len_r != _invlist_len(r)) {
7881 invlist_set_len(r, len_r);
7883 array_r = invlist_array(r);
7886 /* Finish outputting any remaining */
7887 if (count >= 2) { /* At most one will have a non-zero copy count */
7889 if ((copy_count = len_a - i_a) > 0) {
7890 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7892 else if ((copy_count = len_b - i_b) > 0) {
7893 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7897 /* If we've changed b, restore it */
7902 /* We may be removing a reference to one of the inputs */
7903 if (a == *i || b == *i) {
7904 assert(! invlist_is_iterating(*i));
7905 SvREFCNT_dec_NN(*i);
7913 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7915 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7916 * set. A pointer to the inversion list is returned. This may actually be
7917 * a new list, in which case the passed in one has been destroyed. The
7918 * passed in inversion list can be NULL, in which case a new one is created
7919 * with just the one range in it */
7924 if (invlist == NULL) {
7925 invlist = _new_invlist(2);
7929 len = _invlist_len(invlist);
7932 /* If comes after the final entry actually in the list, can just append it
7935 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
7936 && start >= invlist_array(invlist)[len - 1]))
7938 _append_range_to_invlist(invlist, start, end);
7942 /* Here, can't just append things, create and return a new inversion list
7943 * which is the union of this range and the existing inversion list */
7944 range_invlist = _new_invlist(2);
7945 _append_range_to_invlist(range_invlist, start, end);
7947 _invlist_union(invlist, range_invlist, &invlist);
7949 /* The temporary can be freed */
7950 SvREFCNT_dec_NN(range_invlist);
7957 PERL_STATIC_INLINE SV*
7958 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7959 return _add_range_to_invlist(invlist, cp, cp);
7962 #ifndef PERL_IN_XSUB_RE
7964 Perl__invlist_invert(pTHX_ SV* const invlist)
7966 /* Complement the input inversion list. This adds a 0 if the list didn't
7967 * have a zero; removes it otherwise. As described above, the data
7968 * structure is set up so that this is very efficient */
7970 UV* len_pos = _get_invlist_len_addr(invlist);
7972 PERL_ARGS_ASSERT__INVLIST_INVERT;
7974 assert(! invlist_is_iterating(invlist));
7976 /* The inverse of matching nothing is matching everything */
7977 if (*len_pos == 0) {
7978 _append_range_to_invlist(invlist, 0, UV_MAX);
7982 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7983 * zero element was a 0, so it is being removed, so the length decrements
7984 * by 1; and vice-versa. SvCUR is unaffected */
7985 if (*get_invlist_zero_addr(invlist) ^= 1) {
7994 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7996 /* Complement the input inversion list (which must be a Unicode property,
7997 * all of which don't match above the Unicode maximum code point.) And
7998 * Perl has chosen to not have the inversion match above that either. This
7999 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8005 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8007 _invlist_invert(invlist);
8009 len = _invlist_len(invlist);
8011 if (len != 0) { /* If empty do nothing */
8012 array = invlist_array(invlist);
8013 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8014 /* Add 0x110000. First, grow if necessary */
8016 if (invlist_max(invlist) < len) {
8017 invlist_extend(invlist, len);
8018 array = invlist_array(invlist);
8020 invlist_set_len(invlist, len);
8021 array[len - 1] = PERL_UNICODE_MAX + 1;
8023 else { /* Remove the 0x110000 */
8024 invlist_set_len(invlist, len - 1);
8032 PERL_STATIC_INLINE SV*
8033 S_invlist_clone(pTHX_ SV* const invlist)
8036 /* Return a new inversion list that is a copy of the input one, which is
8039 /* Need to allocate extra space to accommodate Perl's addition of a
8040 * trailing NUL to SvPV's, since it thinks they are always strings */
8041 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8042 STRLEN length = SvCUR(invlist);
8044 PERL_ARGS_ASSERT_INVLIST_CLONE;
8046 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8047 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8052 PERL_STATIC_INLINE UV*
8053 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8055 /* Return the address of the UV that contains the current iteration
8058 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8060 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8063 PERL_STATIC_INLINE UV*
8064 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8066 /* Return the address of the UV that contains the version id. */
8068 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8070 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8073 PERL_STATIC_INLINE void
8074 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8076 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8078 *get_invlist_iter_addr(invlist) = 0;
8081 PERL_STATIC_INLINE void
8082 S_invlist_iterfinish(pTHX_ SV* invlist)
8084 /* Terminate iterator for invlist. This is to catch development errors.
8085 * Any iteration that is interrupted before completed should call this
8086 * function. Functions that add code points anywhere else but to the end
8087 * of an inversion list assert that they are not in the middle of an
8088 * iteration. If they were, the addition would make the iteration
8089 * problematical: if the iteration hadn't reached the place where things
8090 * were being added, it would be ok */
8092 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8094 *get_invlist_iter_addr(invlist) = UV_MAX;
8098 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8100 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8101 * This call sets in <*start> and <*end>, the next range in <invlist>.
8102 * Returns <TRUE> if successful and the next call will return the next
8103 * range; <FALSE> if was already at the end of the list. If the latter,
8104 * <*start> and <*end> are unchanged, and the next call to this function
8105 * will start over at the beginning of the list */
8107 UV* pos = get_invlist_iter_addr(invlist);
8108 UV len = _invlist_len(invlist);
8111 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8114 *pos = UV_MAX; /* Force iterinit() to be required next time */
8118 array = invlist_array(invlist);
8120 *start = array[(*pos)++];
8126 *end = array[(*pos)++] - 1;
8132 PERL_STATIC_INLINE bool
8133 S_invlist_is_iterating(pTHX_ SV* const invlist)
8135 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8137 return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8140 PERL_STATIC_INLINE UV
8141 S_invlist_highest(pTHX_ SV* const invlist)
8143 /* Returns the highest code point that matches an inversion list. This API
8144 * has an ambiguity, as it returns 0 under either the highest is actually
8145 * 0, or if the list is empty. If this distinction matters to you, check
8146 * for emptiness before calling this function */
8148 UV len = _invlist_len(invlist);
8151 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8157 array = invlist_array(invlist);
8159 /* The last element in the array in the inversion list always starts a
8160 * range that goes to infinity. That range may be for code points that are
8161 * matched in the inversion list, or it may be for ones that aren't
8162 * matched. In the latter case, the highest code point in the set is one
8163 * less than the beginning of this range; otherwise it is the final element
8164 * of this range: infinity */
8165 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8167 : array[len - 1] - 1;
8170 #ifndef PERL_IN_XSUB_RE
8172 Perl__invlist_contents(pTHX_ SV* const invlist)
8174 /* Get the contents of an inversion list into a string SV so that they can
8175 * be printed out. It uses the format traditionally done for debug tracing
8179 SV* output = newSVpvs("\n");
8181 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8183 assert(! invlist_is_iterating(invlist));
8185 invlist_iterinit(invlist);
8186 while (invlist_iternext(invlist, &start, &end)) {
8187 if (end == UV_MAX) {
8188 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8190 else if (end != start) {
8191 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8195 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8203 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8205 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8207 /* Dumps out the ranges in an inversion list. The string 'header'
8208 * if present is output on a line before the first range */
8212 PERL_ARGS_ASSERT__INVLIST_DUMP;
8214 if (header && strlen(header)) {
8215 PerlIO_printf(Perl_debug_log, "%s\n", header);
8217 if (invlist_is_iterating(invlist)) {
8218 PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8222 invlist_iterinit(invlist);
8223 while (invlist_iternext(invlist, &start, &end)) {
8224 if (end == UV_MAX) {
8225 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8227 else if (end != start) {
8228 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8232 PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8240 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8242 /* Return a boolean as to if the two passed in inversion lists are
8243 * identical. The final argument, if TRUE, says to take the complement of
8244 * the second inversion list before doing the comparison */
8246 UV* array_a = invlist_array(a);
8247 UV* array_b = invlist_array(b);
8248 UV len_a = _invlist_len(a);
8249 UV len_b = _invlist_len(b);
8251 UV i = 0; /* current index into the arrays */
8252 bool retval = TRUE; /* Assume are identical until proven otherwise */
8254 PERL_ARGS_ASSERT__INVLISTEQ;
8256 /* If are to compare 'a' with the complement of b, set it
8257 * up so are looking at b's complement. */
8260 /* The complement of nothing is everything, so <a> would have to have
8261 * just one element, starting at zero (ending at infinity) */
8263 return (len_a == 1 && array_a[0] == 0);
8265 else if (array_b[0] == 0) {
8267 /* Otherwise, to complement, we invert. Here, the first element is
8268 * 0, just remove it. To do this, we just pretend the array starts
8269 * one later, and clear the flag as we don't have to do anything
8274 complement_b = FALSE;
8278 /* But if the first element is not zero, we unshift a 0 before the
8279 * array. The data structure reserves a space for that 0 (which
8280 * should be a '1' right now), so physical shifting is unneeded,
8281 * but temporarily change that element to 0. Before exiting the
8282 * routine, we must restore the element to '1' */
8289 /* Make sure that the lengths are the same, as well as the final element
8290 * before looping through the remainder. (Thus we test the length, final,
8291 * and first elements right off the bat) */
8292 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8295 else for (i = 0; i < len_a - 1; i++) {
8296 if (array_a[i] != array_b[i]) {
8309 #undef HEADER_LENGTH
8310 #undef INVLIST_INITIAL_LENGTH
8311 #undef TO_INTERNAL_SIZE
8312 #undef FROM_INTERNAL_SIZE
8313 #undef INVLIST_LEN_OFFSET
8314 #undef INVLIST_ZERO_OFFSET
8315 #undef INVLIST_ITER_OFFSET
8316 #undef INVLIST_VERSION_ID
8317 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8319 /* End of inversion list object */
8322 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8324 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8325 * constructs, and updates RExC_flags with them. On input, RExC_parse
8326 * should point to the first flag; it is updated on output to point to the
8327 * final ')' or ':'. There needs to be at least one flag, or this will
8330 /* for (?g), (?gc), and (?o) warnings; warning
8331 about (?c) will warn about (?g) -- japhy */
8333 #define WASTED_O 0x01
8334 #define WASTED_G 0x02
8335 #define WASTED_C 0x04
8336 #define WASTED_GC (0x02|0x04)
8337 I32 wastedflags = 0x00;
8338 U32 posflags = 0, negflags = 0;
8339 U32 *flagsp = &posflags;
8340 char has_charset_modifier = '\0';
8342 bool has_use_defaults = FALSE;
8343 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8345 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8347 /* '^' as an initial flag sets certain defaults */
8348 if (UCHARAT(RExC_parse) == '^') {
8350 has_use_defaults = TRUE;
8351 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8352 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8353 ? REGEX_UNICODE_CHARSET
8354 : REGEX_DEPENDS_CHARSET);
8357 cs = get_regex_charset(RExC_flags);
8358 if (cs == REGEX_DEPENDS_CHARSET
8359 && (RExC_utf8 || RExC_uni_semantics))
8361 cs = REGEX_UNICODE_CHARSET;
8364 while (*RExC_parse) {
8365 /* && strchr("iogcmsx", *RExC_parse) */
8366 /* (?g), (?gc) and (?o) are useless here
8367 and must be globally applied -- japhy */
8368 switch (*RExC_parse) {
8370 /* Code for the imsx flags */
8371 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8373 case LOCALE_PAT_MOD:
8374 if (has_charset_modifier) {
8375 goto excess_modifier;
8377 else if (flagsp == &negflags) {
8380 cs = REGEX_LOCALE_CHARSET;
8381 has_charset_modifier = LOCALE_PAT_MOD;
8382 RExC_contains_locale = 1;
8384 case UNICODE_PAT_MOD:
8385 if (has_charset_modifier) {
8386 goto excess_modifier;
8388 else if (flagsp == &negflags) {
8391 cs = REGEX_UNICODE_CHARSET;
8392 has_charset_modifier = UNICODE_PAT_MOD;
8394 case ASCII_RESTRICT_PAT_MOD:
8395 if (flagsp == &negflags) {
8398 if (has_charset_modifier) {
8399 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8400 goto excess_modifier;
8402 /* Doubled modifier implies more restricted */
8403 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8406 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8408 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8410 case DEPENDS_PAT_MOD:
8411 if (has_use_defaults) {
8412 goto fail_modifiers;
8414 else if (flagsp == &negflags) {
8417 else if (has_charset_modifier) {
8418 goto excess_modifier;
8421 /* The dual charset means unicode semantics if the
8422 * pattern (or target, not known until runtime) are
8423 * utf8, or something in the pattern indicates unicode
8425 cs = (RExC_utf8 || RExC_uni_semantics)
8426 ? REGEX_UNICODE_CHARSET
8427 : REGEX_DEPENDS_CHARSET;
8428 has_charset_modifier = DEPENDS_PAT_MOD;
8432 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8433 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8435 else if (has_charset_modifier == *(RExC_parse - 1)) {
8436 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8439 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8444 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8446 case ONCE_PAT_MOD: /* 'o' */
8447 case GLOBAL_PAT_MOD: /* 'g' */
8448 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8449 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8450 if (! (wastedflags & wflagbit) ) {
8451 wastedflags |= wflagbit;
8454 "Useless (%s%c) - %suse /%c modifier",
8455 flagsp == &negflags ? "?-" : "?",
8457 flagsp == &negflags ? "don't " : "",
8464 case CONTINUE_PAT_MOD: /* 'c' */
8465 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8466 if (! (wastedflags & WASTED_C) ) {
8467 wastedflags |= WASTED_GC;
8470 "Useless (%sc) - %suse /gc modifier",
8471 flagsp == &negflags ? "?-" : "?",
8472 flagsp == &negflags ? "don't " : ""
8477 case KEEPCOPY_PAT_MOD: /* 'p' */
8478 if (flagsp == &negflags) {
8480 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8482 *flagsp |= RXf_PMf_KEEPCOPY;
8486 /* A flag is a default iff it is following a minus, so
8487 * if there is a minus, it means will be trying to
8488 * re-specify a default which is an error */
8489 if (has_use_defaults || flagsp == &negflags) {
8490 goto fail_modifiers;
8493 wastedflags = 0; /* reset so (?g-c) warns twice */
8497 RExC_flags |= posflags;
8498 RExC_flags &= ~negflags;
8499 set_regex_charset(&RExC_flags, cs);
8505 vFAIL3("Sequence (%.*s...) not recognized",
8506 RExC_parse-seqstart, seqstart);
8515 - reg - regular expression, i.e. main body or parenthesized thing
8517 * Caller must absorb opening parenthesis.
8519 * Combining parenthesis handling with the base level of regular expression
8520 * is a trifle forced, but the need to tie the tails of the branches to what
8521 * follows makes it hard to avoid.
8523 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8525 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8527 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8531 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8532 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8535 regnode *ret; /* Will be the head of the group. */
8538 regnode *ender = NULL;
8541 U32 oregflags = RExC_flags;
8542 bool have_branch = 0;
8544 I32 freeze_paren = 0;
8545 I32 after_freeze = 0;
8547 char * parse_start = RExC_parse; /* MJD */
8548 char * const oregcomp_parse = RExC_parse;
8550 GET_RE_DEBUG_FLAGS_DECL;
8552 PERL_ARGS_ASSERT_REG;
8553 DEBUG_PARSE("reg ");
8555 *flagp = 0; /* Tentatively. */
8558 /* Make an OPEN node, if parenthesized. */
8560 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8561 char *start_verb = RExC_parse;
8562 STRLEN verb_len = 0;
8563 char *start_arg = NULL;
8564 unsigned char op = 0;
8566 int internal_argval = 0; /* internal_argval is only useful if !argok */
8567 while ( *RExC_parse && *RExC_parse != ')' ) {
8568 if ( *RExC_parse == ':' ) {
8569 start_arg = RExC_parse + 1;
8575 verb_len = RExC_parse - start_verb;
8578 while ( *RExC_parse && *RExC_parse != ')' )
8580 if ( *RExC_parse != ')' )
8581 vFAIL("Unterminated verb pattern argument");
8582 if ( RExC_parse == start_arg )
8585 if ( *RExC_parse != ')' )
8586 vFAIL("Unterminated verb pattern");
8589 switch ( *start_verb ) {
8590 case 'A': /* (*ACCEPT) */
8591 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8593 internal_argval = RExC_nestroot;
8596 case 'C': /* (*COMMIT) */
8597 if ( memEQs(start_verb,verb_len,"COMMIT") )
8600 case 'F': /* (*FAIL) */
8601 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8606 case ':': /* (*:NAME) */
8607 case 'M': /* (*MARK:NAME) */
8608 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8613 case 'P': /* (*PRUNE) */
8614 if ( memEQs(start_verb,verb_len,"PRUNE") )
8617 case 'S': /* (*SKIP) */
8618 if ( memEQs(start_verb,verb_len,"SKIP") )
8621 case 'T': /* (*THEN) */
8622 /* [19:06] <TimToady> :: is then */
8623 if ( memEQs(start_verb,verb_len,"THEN") ) {
8625 RExC_seen |= REG_SEEN_CUTGROUP;
8631 vFAIL3("Unknown verb pattern '%.*s'",
8632 verb_len, start_verb);
8635 if ( start_arg && internal_argval ) {
8636 vFAIL3("Verb pattern '%.*s' may not have an argument",
8637 verb_len, start_verb);
8638 } else if ( argok < 0 && !start_arg ) {
8639 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8640 verb_len, start_verb);
8642 ret = reganode(pRExC_state, op, internal_argval);
8643 if ( ! internal_argval && ! SIZE_ONLY ) {
8645 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8646 ARG(ret) = add_data( pRExC_state, 1, "S" );
8647 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8654 if (!internal_argval)
8655 RExC_seen |= REG_SEEN_VERBARG;
8656 } else if ( start_arg ) {
8657 vFAIL3("Verb pattern '%.*s' may not have an argument",
8658 verb_len, start_verb);
8660 ret = reg_node(pRExC_state, op);
8662 nextchar(pRExC_state);
8665 if (*RExC_parse == '?') { /* (?...) */
8666 bool is_logical = 0;
8667 const char * const seqstart = RExC_parse;
8670 paren = *RExC_parse++;
8671 ret = NULL; /* For look-ahead/behind. */
8674 case 'P': /* (?P...) variants for those used to PCRE/Python */
8675 paren = *RExC_parse++;
8676 if ( paren == '<') /* (?P<...>) named capture */
8678 else if (paren == '>') { /* (?P>name) named recursion */
8679 goto named_recursion;
8681 else if (paren == '=') { /* (?P=...) named backref */
8682 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8683 you change this make sure you change that */
8684 char* name_start = RExC_parse;
8686 SV *sv_dat = reg_scan_name(pRExC_state,
8687 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8688 if (RExC_parse == name_start || *RExC_parse != ')')
8689 vFAIL2("Sequence %.3s... not terminated",parse_start);
8692 num = add_data( pRExC_state, 1, "S" );
8693 RExC_rxi->data->data[num]=(void*)sv_dat;
8694 SvREFCNT_inc_simple_void(sv_dat);
8697 ret = reganode(pRExC_state,
8700 : (ASCII_FOLD_RESTRICTED)
8702 : (AT_LEAST_UNI_SEMANTICS)
8710 Set_Node_Offset(ret, parse_start+1);
8711 Set_Node_Cur_Length(ret); /* MJD */
8713 nextchar(pRExC_state);
8717 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8719 case '<': /* (?<...) */
8720 if (*RExC_parse == '!')
8722 else if (*RExC_parse != '=')
8728 case '\'': /* (?'...') */
8729 name_start= RExC_parse;
8730 svname = reg_scan_name(pRExC_state,
8731 SIZE_ONLY ? /* reverse test from the others */
8732 REG_RSN_RETURN_NAME :
8733 REG_RSN_RETURN_NULL);
8734 if (RExC_parse == name_start) {
8736 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8739 if (*RExC_parse != paren)
8740 vFAIL2("Sequence (?%c... not terminated",
8741 paren=='>' ? '<' : paren);
8745 if (!svname) /* shouldn't happen */
8747 "panic: reg_scan_name returned NULL");
8748 if (!RExC_paren_names) {
8749 RExC_paren_names= newHV();
8750 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8752 RExC_paren_name_list= newAV();
8753 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8756 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8758 sv_dat = HeVAL(he_str);
8760 /* croak baby croak */
8762 "panic: paren_name hash element allocation failed");
8763 } else if ( SvPOK(sv_dat) ) {
8764 /* (?|...) can mean we have dupes so scan to check
8765 its already been stored. Maybe a flag indicating
8766 we are inside such a construct would be useful,
8767 but the arrays are likely to be quite small, so
8768 for now we punt -- dmq */
8769 IV count = SvIV(sv_dat);
8770 I32 *pv = (I32*)SvPVX(sv_dat);
8772 for ( i = 0 ; i < count ; i++ ) {
8773 if ( pv[i] == RExC_npar ) {
8779 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8780 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8781 pv[count] = RExC_npar;
8782 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8785 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8786 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8788 SvIV_set(sv_dat, 1);
8791 /* Yes this does cause a memory leak in debugging Perls */
8792 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8793 SvREFCNT_dec_NN(svname);
8796 /*sv_dump(sv_dat);*/
8798 nextchar(pRExC_state);
8800 goto capturing_parens;
8802 RExC_seen |= REG_SEEN_LOOKBEHIND;
8803 RExC_in_lookbehind++;
8805 case '=': /* (?=...) */
8806 RExC_seen_zerolen++;
8808 case '!': /* (?!...) */
8809 RExC_seen_zerolen++;
8810 if (*RExC_parse == ')') {
8811 ret=reg_node(pRExC_state, OPFAIL);
8812 nextchar(pRExC_state);
8816 case '|': /* (?|...) */
8817 /* branch reset, behave like a (?:...) except that
8818 buffers in alternations share the same numbers */
8820 after_freeze = freeze_paren = RExC_npar;
8822 case ':': /* (?:...) */
8823 case '>': /* (?>...) */
8825 case '$': /* (?$...) */
8826 case '@': /* (?@...) */
8827 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8829 case '#': /* (?#...) */
8830 while (*RExC_parse && *RExC_parse != ')')
8832 if (*RExC_parse != ')')
8833 FAIL("Sequence (?#... not terminated");
8834 nextchar(pRExC_state);
8837 case '0' : /* (?0) */
8838 case 'R' : /* (?R) */
8839 if (*RExC_parse != ')')
8840 FAIL("Sequence (?R) not terminated");
8841 ret = reg_node(pRExC_state, GOSTART);
8842 *flagp |= POSTPONED;
8843 nextchar(pRExC_state);
8846 { /* named and numeric backreferences */
8848 case '&': /* (?&NAME) */
8849 parse_start = RExC_parse - 1;
8852 SV *sv_dat = reg_scan_name(pRExC_state,
8853 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8854 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8856 goto gen_recurse_regop;
8857 assert(0); /* NOT REACHED */
8859 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8861 vFAIL("Illegal pattern");
8863 goto parse_recursion;
8865 case '-': /* (?-1) */
8866 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8867 RExC_parse--; /* rewind to let it be handled later */
8871 case '1': case '2': case '3': case '4': /* (?1) */
8872 case '5': case '6': case '7': case '8': case '9':
8875 num = atoi(RExC_parse);
8876 parse_start = RExC_parse - 1; /* MJD */
8877 if (*RExC_parse == '-')
8879 while (isDIGIT(*RExC_parse))
8881 if (*RExC_parse!=')')
8882 vFAIL("Expecting close bracket");
8885 if ( paren == '-' ) {
8887 Diagram of capture buffer numbering.
8888 Top line is the normal capture buffer numbers
8889 Bottom line is the negative indexing as from
8893 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8897 num = RExC_npar + num;
8900 vFAIL("Reference to nonexistent group");
8902 } else if ( paren == '+' ) {
8903 num = RExC_npar + num - 1;
8906 ret = reganode(pRExC_state, GOSUB, num);
8908 if (num > (I32)RExC_rx->nparens) {
8910 vFAIL("Reference to nonexistent group");
8912 ARG2L_SET( ret, RExC_recurse_count++);
8914 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8915 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8919 RExC_seen |= REG_SEEN_RECURSE;
8920 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8921 Set_Node_Offset(ret, parse_start); /* MJD */
8923 *flagp |= POSTPONED;
8924 nextchar(pRExC_state);
8926 } /* named and numeric backreferences */
8927 assert(0); /* NOT REACHED */
8929 case '?': /* (??...) */
8931 if (*RExC_parse != '{') {
8933 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8936 *flagp |= POSTPONED;
8937 paren = *RExC_parse++;
8939 case '{': /* (?{...}) */
8942 struct reg_code_block *cb;
8944 RExC_seen_zerolen++;
8946 if ( !pRExC_state->num_code_blocks
8947 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8948 || pRExC_state->code_blocks[pRExC_state->code_index].start
8949 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8952 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8953 FAIL("panic: Sequence (?{...}): no code block found\n");
8954 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8956 /* this is a pre-compiled code block (?{...}) */
8957 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8958 RExC_parse = RExC_start + cb->end;
8961 if (cb->src_regex) {
8962 n = add_data(pRExC_state, 2, "rl");
8963 RExC_rxi->data->data[n] =
8964 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8965 RExC_rxi->data->data[n+1] = (void*)o;
8968 n = add_data(pRExC_state, 1,
8969 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8970 RExC_rxi->data->data[n] = (void*)o;
8973 pRExC_state->code_index++;
8974 nextchar(pRExC_state);
8978 ret = reg_node(pRExC_state, LOGICAL);
8979 eval = reganode(pRExC_state, EVAL, n);
8982 /* for later propagation into (??{}) return value */
8983 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8985 REGTAIL(pRExC_state, ret, eval);
8986 /* deal with the length of this later - MJD */
8989 ret = reganode(pRExC_state, EVAL, n);
8990 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8991 Set_Node_Offset(ret, parse_start);
8994 case '(': /* (?(?{...})...) and (?(?=...)...) */
8997 if (RExC_parse[0] == '?') { /* (?(?...)) */
8998 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8999 || RExC_parse[1] == '<'
9000 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9003 ret = reg_node(pRExC_state, LOGICAL);
9006 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
9010 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9011 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9013 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9014 char *name_start= RExC_parse++;
9016 SV *sv_dat=reg_scan_name(pRExC_state,
9017 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9018 if (RExC_parse == name_start || *RExC_parse != ch)
9019 vFAIL2("Sequence (?(%c... not terminated",
9020 (ch == '>' ? '<' : ch));
9023 num = add_data( pRExC_state, 1, "S" );
9024 RExC_rxi->data->data[num]=(void*)sv_dat;
9025 SvREFCNT_inc_simple_void(sv_dat);
9027 ret = reganode(pRExC_state,NGROUPP,num);
9028 goto insert_if_check_paren;
9030 else if (RExC_parse[0] == 'D' &&
9031 RExC_parse[1] == 'E' &&
9032 RExC_parse[2] == 'F' &&
9033 RExC_parse[3] == 'I' &&
9034 RExC_parse[4] == 'N' &&
9035 RExC_parse[5] == 'E')
9037 ret = reganode(pRExC_state,DEFINEP,0);
9040 goto insert_if_check_paren;
9042 else if (RExC_parse[0] == 'R') {
9045 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9046 parno = atoi(RExC_parse++);
9047 while (isDIGIT(*RExC_parse))
9049 } else if (RExC_parse[0] == '&') {
9052 sv_dat = reg_scan_name(pRExC_state,
9053 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9054 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9056 ret = reganode(pRExC_state,INSUBP,parno);
9057 goto insert_if_check_paren;
9059 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9062 parno = atoi(RExC_parse++);
9064 while (isDIGIT(*RExC_parse))
9066 ret = reganode(pRExC_state, GROUPP, parno);
9068 insert_if_check_paren:
9069 if ((c = *nextchar(pRExC_state)) != ')')
9070 vFAIL("Switch condition not recognized");
9072 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9073 br = regbranch(pRExC_state, &flags, 1,depth+1);
9075 br = reganode(pRExC_state, LONGJMP, 0);
9077 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9078 c = *nextchar(pRExC_state);
9083 vFAIL("(?(DEFINE)....) does not allow branches");
9084 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9085 regbranch(pRExC_state, &flags, 1,depth+1);
9086 REGTAIL(pRExC_state, ret, lastbr);
9089 c = *nextchar(pRExC_state);
9094 vFAIL("Switch (?(condition)... contains too many branches");
9095 ender = reg_node(pRExC_state, TAIL);
9096 REGTAIL(pRExC_state, br, ender);
9098 REGTAIL(pRExC_state, lastbr, ender);
9099 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9102 REGTAIL(pRExC_state, ret, ender);
9103 RExC_size++; /* XXX WHY do we need this?!!
9104 For large programs it seems to be required
9105 but I can't figure out why. -- dmq*/
9109 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9112 case '[': /* (?[ ... ]) */
9113 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9116 RExC_parse--; /* for vFAIL to print correctly */
9117 vFAIL("Sequence (? incomplete");
9119 default: /* e.g., (?i) */
9122 parse_lparen_question_flags(pRExC_state);
9123 if (UCHARAT(RExC_parse) != ':') {
9124 nextchar(pRExC_state);
9129 nextchar(pRExC_state);
9139 ret = reganode(pRExC_state, OPEN, parno);
9142 RExC_nestroot = parno;
9143 if (RExC_seen & REG_SEEN_RECURSE
9144 && !RExC_open_parens[parno-1])
9146 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9147 "Setting open paren #%"IVdf" to %d\n",
9148 (IV)parno, REG_NODE_NUM(ret)));
9149 RExC_open_parens[parno-1]= ret;
9152 Set_Node_Length(ret, 1); /* MJD */
9153 Set_Node_Offset(ret, RExC_parse); /* MJD */
9161 /* Pick up the branches, linking them together. */
9162 parse_start = RExC_parse; /* MJD */
9163 br = regbranch(pRExC_state, &flags, 1,depth+1);
9165 /* branch_len = (paren != 0); */
9169 if (*RExC_parse == '|') {
9170 if (!SIZE_ONLY && RExC_extralen) {
9171 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9174 reginsert(pRExC_state, BRANCH, br, depth+1);
9175 Set_Node_Length(br, paren != 0);
9176 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9180 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9182 else if (paren == ':') {
9183 *flagp |= flags&SIMPLE;
9185 if (is_open) { /* Starts with OPEN. */
9186 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9188 else if (paren != '?') /* Not Conditional */
9190 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9192 while (*RExC_parse == '|') {
9193 if (!SIZE_ONLY && RExC_extralen) {
9194 ender = reganode(pRExC_state, LONGJMP,0);
9195 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9198 RExC_extralen += 2; /* Account for LONGJMP. */
9199 nextchar(pRExC_state);
9201 if (RExC_npar > after_freeze)
9202 after_freeze = RExC_npar;
9203 RExC_npar = freeze_paren;
9205 br = regbranch(pRExC_state, &flags, 0, depth+1);
9209 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9211 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9214 if (have_branch || paren != ':') {
9215 /* Make a closing node, and hook it on the end. */
9218 ender = reg_node(pRExC_state, TAIL);
9221 ender = reganode(pRExC_state, CLOSE, parno);
9222 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9223 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9224 "Setting close paren #%"IVdf" to %d\n",
9225 (IV)parno, REG_NODE_NUM(ender)));
9226 RExC_close_parens[parno-1]= ender;
9227 if (RExC_nestroot == parno)
9230 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9231 Set_Node_Length(ender,1); /* MJD */
9237 *flagp &= ~HASWIDTH;
9240 ender = reg_node(pRExC_state, SUCCEED);
9243 ender = reg_node(pRExC_state, END);
9245 assert(!RExC_opend); /* there can only be one! */
9250 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9251 SV * const mysv_val1=sv_newmortal();
9252 SV * const mysv_val2=sv_newmortal();
9253 DEBUG_PARSE_MSG("lsbr");
9254 regprop(RExC_rx, mysv_val1, lastbr);
9255 regprop(RExC_rx, mysv_val2, ender);
9256 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9257 SvPV_nolen_const(mysv_val1),
9258 (IV)REG_NODE_NUM(lastbr),
9259 SvPV_nolen_const(mysv_val2),
9260 (IV)REG_NODE_NUM(ender),
9261 (IV)(ender - lastbr)
9264 REGTAIL(pRExC_state, lastbr, ender);
9266 if (have_branch && !SIZE_ONLY) {
9269 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9271 /* Hook the tails of the branches to the closing node. */
9272 for (br = ret; br; br = regnext(br)) {
9273 const U8 op = PL_regkind[OP(br)];
9275 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9276 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9279 else if (op == BRANCHJ) {
9280 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9281 /* for now we always disable this optimisation * /
9282 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9288 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9289 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9290 SV * const mysv_val1=sv_newmortal();
9291 SV * const mysv_val2=sv_newmortal();
9292 DEBUG_PARSE_MSG("NADA");
9293 regprop(RExC_rx, mysv_val1, ret);
9294 regprop(RExC_rx, mysv_val2, ender);
9295 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9296 SvPV_nolen_const(mysv_val1),
9297 (IV)REG_NODE_NUM(ret),
9298 SvPV_nolen_const(mysv_val2),
9299 (IV)REG_NODE_NUM(ender),
9304 if (OP(ender) == TAIL) {
9309 for ( opt= br + 1; opt < ender ; opt++ )
9311 NEXT_OFF(br)= ender - br;
9319 static const char parens[] = "=!<,>";
9321 if (paren && (p = strchr(parens, paren))) {
9322 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9323 int flag = (p - parens) > 1;
9326 node = SUSPEND, flag = 0;
9327 reginsert(pRExC_state, node,ret, depth+1);
9328 Set_Node_Cur_Length(ret);
9329 Set_Node_Offset(ret, parse_start + 1);
9331 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9335 /* Check for proper termination. */
9337 RExC_flags = oregflags;
9338 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9339 RExC_parse = oregcomp_parse;
9340 vFAIL("Unmatched (");
9343 else if (!paren && RExC_parse < RExC_end) {
9344 if (*RExC_parse == ')') {
9346 vFAIL("Unmatched )");
9349 FAIL("Junk on end of regexp"); /* "Can't happen". */
9350 assert(0); /* NOTREACHED */
9353 if (RExC_in_lookbehind) {
9354 RExC_in_lookbehind--;
9356 if (after_freeze > RExC_npar)
9357 RExC_npar = after_freeze;
9362 - regbranch - one alternative of an | operator
9364 * Implements the concatenation operator.
9367 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9371 regnode *chain = NULL;
9373 I32 flags = 0, c = 0;
9374 GET_RE_DEBUG_FLAGS_DECL;
9376 PERL_ARGS_ASSERT_REGBRANCH;
9378 DEBUG_PARSE("brnc");
9383 if (!SIZE_ONLY && RExC_extralen)
9384 ret = reganode(pRExC_state, BRANCHJ,0);
9386 ret = reg_node(pRExC_state, BRANCH);
9387 Set_Node_Length(ret, 1);
9391 if (!first && SIZE_ONLY)
9392 RExC_extralen += 1; /* BRANCHJ */
9394 *flagp = WORST; /* Tentatively. */
9397 nextchar(pRExC_state);
9398 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9400 latest = regpiece(pRExC_state, &flags,depth+1);
9401 if (latest == NULL) {
9402 if (flags & TRYAGAIN)
9406 else if (ret == NULL)
9408 *flagp |= flags&(HASWIDTH|POSTPONED);
9409 if (chain == NULL) /* First piece. */
9410 *flagp |= flags&SPSTART;
9413 REGTAIL(pRExC_state, chain, latest);
9418 if (chain == NULL) { /* Loop ran zero times. */
9419 chain = reg_node(pRExC_state, NOTHING);
9424 *flagp |= flags&SIMPLE;
9431 - regpiece - something followed by possible [*+?]
9433 * Note that the branching code sequences used for ? and the general cases
9434 * of * and + are somewhat optimized: they use the same NOTHING node as
9435 * both the endmarker for their branch list and the body of the last branch.
9436 * It might seem that this node could be dispensed with entirely, but the
9437 * endmarker role is not redundant.
9440 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9447 const char * const origparse = RExC_parse;
9449 I32 max = REG_INFTY;
9450 #ifdef RE_TRACK_PATTERN_OFFSETS
9453 const char *maxpos = NULL;
9455 /* Save the original in case we change the emitted regop to a FAIL. */
9456 regnode * const orig_emit = RExC_emit;
9458 GET_RE_DEBUG_FLAGS_DECL;
9460 PERL_ARGS_ASSERT_REGPIECE;
9462 DEBUG_PARSE("piec");
9464 ret = regatom(pRExC_state, &flags,depth+1);
9466 if (flags & TRYAGAIN)
9473 if (op == '{' && regcurly(RExC_parse, FALSE)) {
9475 #ifdef RE_TRACK_PATTERN_OFFSETS
9476 parse_start = RExC_parse; /* MJD */
9478 next = RExC_parse + 1;
9479 while (isDIGIT(*next) || *next == ',') {
9488 if (*next == '}') { /* got one */
9492 min = atoi(RExC_parse);
9496 maxpos = RExC_parse;
9498 if (!max && *maxpos != '0')
9499 max = REG_INFTY; /* meaning "infinity" */
9500 else if (max >= REG_INFTY)
9501 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9503 nextchar(pRExC_state);
9504 if (max < min) { /* If can't match, warn and optimize to fail
9507 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9509 /* We can't back off the size because we have to reserve
9510 * enough space for all the things we are about to throw
9511 * away, but we can shrink it by the ammount we are about
9513 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9516 RExC_emit = orig_emit;
9518 ret = reg_node(pRExC_state, OPFAIL);
9521 else if (max == 0) { /* replace {0} with a nothing node */
9523 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9526 RExC_emit = orig_emit;
9528 ret = reg_node(pRExC_state, NOTHING);
9533 if ((flags&SIMPLE)) {
9534 RExC_naughty += 2 + RExC_naughty / 2;
9535 reginsert(pRExC_state, CURLY, ret, depth+1);
9536 Set_Node_Offset(ret, parse_start+1); /* MJD */
9537 Set_Node_Cur_Length(ret);
9540 regnode * const w = reg_node(pRExC_state, WHILEM);
9543 REGTAIL(pRExC_state, ret, w);
9544 if (!SIZE_ONLY && RExC_extralen) {
9545 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9546 reginsert(pRExC_state, NOTHING,ret, depth+1);
9547 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9549 reginsert(pRExC_state, CURLYX,ret, depth+1);
9551 Set_Node_Offset(ret, parse_start+1);
9552 Set_Node_Length(ret,
9553 op == '{' ? (RExC_parse - parse_start) : 1);
9555 if (!SIZE_ONLY && RExC_extralen)
9556 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9557 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9559 RExC_whilem_seen++, RExC_extralen += 3;
9560 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9569 ARG1_SET(ret, (U16)min);
9570 ARG2_SET(ret, (U16)max);
9582 #if 0 /* Now runtime fix should be reliable. */
9584 /* if this is reinstated, don't forget to put this back into perldiag:
9586 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9588 (F) The part of the regexp subject to either the * or + quantifier
9589 could match an empty string. The {#} shows in the regular
9590 expression about where the problem was discovered.
9594 if (!(flags&HASWIDTH) && op != '?')
9595 vFAIL("Regexp *+ operand could be empty");
9598 #ifdef RE_TRACK_PATTERN_OFFSETS
9599 parse_start = RExC_parse;
9601 nextchar(pRExC_state);
9603 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9605 if (op == '*' && (flags&SIMPLE)) {
9606 reginsert(pRExC_state, STAR, ret, depth+1);
9610 else if (op == '*') {
9614 else if (op == '+' && (flags&SIMPLE)) {
9615 reginsert(pRExC_state, PLUS, ret, depth+1);
9619 else if (op == '+') {
9623 else if (op == '?') {
9628 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9629 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9630 ckWARN3reg(RExC_parse,
9631 "%.*s matches null string many times",
9632 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9634 (void)ReREFCNT_inc(RExC_rx_sv);
9637 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9638 nextchar(pRExC_state);
9639 reginsert(pRExC_state, MINMOD, ret, depth+1);
9640 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9642 #ifndef REG_ALLOW_MINMOD_SUSPEND
9645 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9647 nextchar(pRExC_state);
9648 ender = reg_node(pRExC_state, SUCCEED);
9649 REGTAIL(pRExC_state, ret, ender);
9650 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9652 ender = reg_node(pRExC_state, TAIL);
9653 REGTAIL(pRExC_state, ret, ender);
9657 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9659 vFAIL("Nested quantifiers");
9666 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9667 const bool strict /* Apply stricter parsing rules? */
9671 /* This is expected to be called by a parser routine that has recognized '\N'
9672 and needs to handle the rest. RExC_parse is expected to point at the first
9673 char following the N at the time of the call. On successful return,
9674 RExC_parse has been updated to point to just after the sequence identified
9675 by this routine, and <*flagp> has been updated.
9677 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9680 \N may begin either a named sequence, or if outside a character class, mean
9681 to match a non-newline. For non single-quoted regexes, the tokenizer has
9682 attempted to decide which, and in the case of a named sequence, converted it
9683 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9684 where c1... are the characters in the sequence. For single-quoted regexes,
9685 the tokenizer passes the \N sequence through unchanged; this code will not
9686 attempt to determine this nor expand those, instead raising a syntax error.
9687 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9688 or there is no '}', it signals that this \N occurrence means to match a
9691 Only the \N{U+...} form should occur in a character class, for the same
9692 reason that '.' inside a character class means to just match a period: it
9693 just doesn't make sense.
9695 The function raises an error (via vFAIL), and doesn't return for various
9696 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9697 success; it returns FALSE otherwise.
9699 If <valuep> is non-null, it means the caller can accept an input sequence
9700 consisting of a just a single code point; <*valuep> is set to that value
9701 if the input is such.
9703 If <node_p> is non-null it signifies that the caller can accept any other
9704 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9706 1) \N means not-a-NL: points to a newly created REG_ANY node;
9707 2) \N{}: points to a new NOTHING node;
9708 3) otherwise: points to a new EXACT node containing the resolved
9710 Note that FALSE is returned for single code point sequences if <valuep> is
9714 char * endbrace; /* '}' following the name */
9716 char *endchar; /* Points to '.' or '}' ending cur char in the input
9718 bool has_multiple_chars; /* true if the input stream contains a sequence of
9719 more than one character */
9721 GET_RE_DEBUG_FLAGS_DECL;
9723 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9727 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9729 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9730 * modifier. The other meaning does not */
9731 p = (RExC_flags & RXf_PMf_EXTENDED)
9732 ? regwhite( pRExC_state, RExC_parse )
9735 /* Disambiguate between \N meaning a named character versus \N meaning
9736 * [^\n]. The former is assumed when it can't be the latter. */
9737 if (*p != '{' || regcurly(p, FALSE)) {
9740 /* no bare \N in a charclass */
9741 if (in_char_class) {
9742 vFAIL("\\N in a character class must be a named character: \\N{...}");
9746 nextchar(pRExC_state);
9747 *node_p = reg_node(pRExC_state, REG_ANY);
9748 *flagp |= HASWIDTH|SIMPLE;
9751 Set_Node_Length(*node_p, 1); /* MJD */
9755 /* Here, we have decided it should be a named character or sequence */
9757 /* The test above made sure that the next real character is a '{', but
9758 * under the /x modifier, it could be separated by space (or a comment and
9759 * \n) and this is not allowed (for consistency with \x{...} and the
9760 * tokenizer handling of \N{NAME}). */
9761 if (*RExC_parse != '{') {
9762 vFAIL("Missing braces on \\N{}");
9765 RExC_parse++; /* Skip past the '{' */
9767 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9768 || ! (endbrace == RExC_parse /* nothing between the {} */
9769 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9770 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9772 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9773 vFAIL("\\N{NAME} must be resolved by the lexer");
9776 if (endbrace == RExC_parse) { /* empty: \N{} */
9779 *node_p = reg_node(pRExC_state,NOTHING);
9781 else if (in_char_class) {
9782 if (SIZE_ONLY && in_char_class) {
9784 RExC_parse++; /* Position after the "}" */
9785 vFAIL("Zero length \\N{}");
9788 ckWARNreg(RExC_parse,
9789 "Ignoring zero length \\N{} in character class");
9797 nextchar(pRExC_state);
9801 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9802 RExC_parse += 2; /* Skip past the 'U+' */
9804 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9806 /* Code points are separated by dots. If none, there is only one code
9807 * point, and is terminated by the brace */
9808 has_multiple_chars = (endchar < endbrace);
9810 if (valuep && (! has_multiple_chars || in_char_class)) {
9811 /* We only pay attention to the first char of
9812 multichar strings being returned in char classes. I kinda wonder
9813 if this makes sense as it does change the behaviour
9814 from earlier versions, OTOH that behaviour was broken
9815 as well. XXX Solution is to recharacterize as
9816 [rest-of-class]|multi1|multi2... */
9818 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9819 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9820 | PERL_SCAN_DISALLOW_PREFIX
9821 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9823 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9825 /* The tokenizer should have guaranteed validity, but it's possible to
9826 * bypass it by using single quoting, so check */
9827 if (length_of_hex == 0
9828 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9830 RExC_parse += length_of_hex; /* Includes all the valid */
9831 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9832 ? UTF8SKIP(RExC_parse)
9834 /* Guard against malformed utf8 */
9835 if (RExC_parse >= endchar) {
9836 RExC_parse = endchar;
9838 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9841 if (in_char_class && has_multiple_chars) {
9843 RExC_parse = endbrace;
9844 vFAIL("\\N{} in character class restricted to one character");
9847 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9851 RExC_parse = endbrace + 1;
9853 else if (! node_p || ! has_multiple_chars) {
9855 /* Here, the input is legal, but not according to the caller's
9856 * options. We fail without advancing the parse, so that the
9857 * caller can try again */
9863 /* What is done here is to convert this to a sub-pattern of the form
9864 * (?:\x{char1}\x{char2}...)
9865 * and then call reg recursively. That way, it retains its atomicness,
9866 * while not having to worry about special handling that some code
9867 * points may have. toke.c has converted the original Unicode values
9868 * to native, so that we can just pass on the hex values unchanged. We
9869 * do have to set a flag to keep recoding from happening in the
9872 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9874 char *orig_end = RExC_end;
9877 while (RExC_parse < endbrace) {
9879 /* Convert to notation the rest of the code understands */
9880 sv_catpv(substitute_parse, "\\x{");
9881 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9882 sv_catpv(substitute_parse, "}");
9884 /* Point to the beginning of the next character in the sequence. */
9885 RExC_parse = endchar + 1;
9886 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9888 sv_catpv(substitute_parse, ")");
9890 RExC_parse = SvPV(substitute_parse, len);
9892 /* Don't allow empty number */
9894 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9896 RExC_end = RExC_parse + len;
9898 /* The values are Unicode, and therefore not subject to recoding */
9899 RExC_override_recoding = 1;
9901 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9902 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9904 RExC_parse = endbrace;
9905 RExC_end = orig_end;
9906 RExC_override_recoding = 0;
9908 nextchar(pRExC_state);
9918 * It returns the code point in utf8 for the value in *encp.
9919 * value: a code value in the source encoding
9920 * encp: a pointer to an Encode object
9922 * If the result from Encode is not a single character,
9923 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9926 S_reg_recode(pTHX_ const char value, SV **encp)
9929 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9930 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9931 const STRLEN newlen = SvCUR(sv);
9932 UV uv = UNICODE_REPLACEMENT;
9934 PERL_ARGS_ASSERT_REG_RECODE;
9938 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9941 if (!newlen || numlen != newlen) {
9942 uv = UNICODE_REPLACEMENT;
9948 PERL_STATIC_INLINE U8
9949 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9953 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9959 op = get_regex_charset(RExC_flags);
9960 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9961 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9962 been, so there is no hole */
9968 PERL_STATIC_INLINE void
9969 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9971 /* This knows the details about sizing an EXACTish node, setting flags for
9972 * it (by setting <*flagp>, and potentially populating it with a single
9975 * If <len> (the length in bytes) is non-zero, this function assumes that
9976 * the node has already been populated, and just does the sizing. In this
9977 * case <code_point> should be the final code point that has already been
9978 * placed into the node. This value will be ignored except that under some
9979 * circumstances <*flagp> is set based on it.
9981 * If <len> is zero, the function assumes that the node is to contain only
9982 * the single character given by <code_point> and calculates what <len>
9983 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9984 * additionally will populate the node's STRING with <code_point>, if <len>
9985 * is 0. In both cases <*flagp> is appropriately set
9987 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9988 * folded (the latter only when the rules indicate it can match 'ss') */
9990 bool len_passed_in = cBOOL(len != 0);
9991 U8 character[UTF8_MAXBYTES_CASE+1];
9993 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9995 if (! len_passed_in) {
9998 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10001 uvchr_to_utf8( character, code_point);
10002 len = UTF8SKIP(character);
10006 || code_point != LATIN_SMALL_LETTER_SHARP_S
10007 || ASCII_FOLD_RESTRICTED
10008 || ! AT_LEAST_UNI_SEMANTICS)
10010 *character = (U8) code_point;
10015 *(character + 1) = 's';
10021 RExC_size += STR_SZ(len);
10024 RExC_emit += STR_SZ(len);
10025 STR_LEN(node) = len;
10026 if (! len_passed_in) {
10027 Copy((char *) character, STRING(node), len, char);
10031 *flagp |= HASWIDTH;
10033 /* A single character node is SIMPLE, except for the special-cased SHARP S
10035 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10036 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10037 || ! FOLD || ! DEPENDS_SEMANTICS))
10044 - regatom - the lowest level
10046 Try to identify anything special at the start of the pattern. If there
10047 is, then handle it as required. This may involve generating a single regop,
10048 such as for an assertion; or it may involve recursing, such as to
10049 handle a () structure.
10051 If the string doesn't start with something special then we gobble up
10052 as much literal text as we can.
10054 Once we have been able to handle whatever type of thing started the
10055 sequence, we return.
10057 Note: we have to be careful with escapes, as they can be both literal
10058 and special, and in the case of \10 and friends, context determines which.
10060 A summary of the code structure is:
10062 switch (first_byte) {
10063 cases for each special:
10064 handle this special;
10067 switch (2nd byte) {
10068 cases for each unambiguous special:
10069 handle this special;
10071 cases for each ambigous special/literal:
10073 if (special) handle here
10075 default: // unambiguously literal:
10078 default: // is a literal char
10081 create EXACTish node for literal;
10082 while (more input and node isn't full) {
10083 switch (input_byte) {
10084 cases for each special;
10085 make sure parse pointer is set so that the next call to
10086 regatom will see this special first
10087 goto loopdone; // EXACTish node terminated by prev. char
10089 append char to EXACTISH node;
10091 get next input byte;
10095 return the generated node;
10097 Specifically there are two separate switches for handling
10098 escape sequences, with the one for handling literal escapes requiring
10099 a dummy entry for all of the special escapes that are actually handled
10104 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10107 regnode *ret = NULL;
10109 char *parse_start = RExC_parse;
10113 GET_RE_DEBUG_FLAGS_DECL;
10115 *flagp = WORST; /* Tentatively. */
10117 DEBUG_PARSE("atom");
10119 PERL_ARGS_ASSERT_REGATOM;
10122 switch ((U8)*RExC_parse) {
10124 RExC_seen_zerolen++;
10125 nextchar(pRExC_state);
10126 if (RExC_flags & RXf_PMf_MULTILINE)
10127 ret = reg_node(pRExC_state, MBOL);
10128 else if (RExC_flags & RXf_PMf_SINGLELINE)
10129 ret = reg_node(pRExC_state, SBOL);
10131 ret = reg_node(pRExC_state, BOL);
10132 Set_Node_Length(ret, 1); /* MJD */
10135 nextchar(pRExC_state);
10137 RExC_seen_zerolen++;
10138 if (RExC_flags & RXf_PMf_MULTILINE)
10139 ret = reg_node(pRExC_state, MEOL);
10140 else if (RExC_flags & RXf_PMf_SINGLELINE)
10141 ret = reg_node(pRExC_state, SEOL);
10143 ret = reg_node(pRExC_state, EOL);
10144 Set_Node_Length(ret, 1); /* MJD */
10147 nextchar(pRExC_state);
10148 if (RExC_flags & RXf_PMf_SINGLELINE)
10149 ret = reg_node(pRExC_state, SANY);
10151 ret = reg_node(pRExC_state, REG_ANY);
10152 *flagp |= HASWIDTH|SIMPLE;
10154 Set_Node_Length(ret, 1); /* MJD */
10158 char * const oregcomp_parse = ++RExC_parse;
10159 ret = regclass(pRExC_state, flagp,depth+1,
10160 FALSE, /* means parse the whole char class */
10161 TRUE, /* allow multi-char folds */
10162 FALSE, /* don't silence non-portable warnings. */
10164 if (*RExC_parse != ']') {
10165 RExC_parse = oregcomp_parse;
10166 vFAIL("Unmatched [");
10168 nextchar(pRExC_state);
10169 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10173 nextchar(pRExC_state);
10174 ret = reg(pRExC_state, 1, &flags,depth+1);
10176 if (flags & TRYAGAIN) {
10177 if (RExC_parse == RExC_end) {
10178 /* Make parent create an empty node if needed. */
10179 *flagp |= TRYAGAIN;
10186 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10190 if (flags & TRYAGAIN) {
10191 *flagp |= TRYAGAIN;
10194 vFAIL("Internal urp");
10195 /* Supposed to be caught earlier. */
10198 if (!regcurly(RExC_parse, FALSE)) {
10207 vFAIL("Quantifier follows nothing");
10212 This switch handles escape sequences that resolve to some kind
10213 of special regop and not to literal text. Escape sequnces that
10214 resolve to literal text are handled below in the switch marked
10217 Every entry in this switch *must* have a corresponding entry
10218 in the literal escape switch. However, the opposite is not
10219 required, as the default for this switch is to jump to the
10220 literal text handling code.
10222 switch ((U8)*++RExC_parse) {
10224 /* Special Escapes */
10226 RExC_seen_zerolen++;
10227 ret = reg_node(pRExC_state, SBOL);
10229 goto finish_meta_pat;
10231 ret = reg_node(pRExC_state, GPOS);
10232 RExC_seen |= REG_SEEN_GPOS;
10234 goto finish_meta_pat;
10236 RExC_seen_zerolen++;
10237 ret = reg_node(pRExC_state, KEEPS);
10239 /* XXX:dmq : disabling in-place substitution seems to
10240 * be necessary here to avoid cases of memory corruption, as
10241 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10243 RExC_seen |= REG_SEEN_LOOKBEHIND;
10244 goto finish_meta_pat;
10246 ret = reg_node(pRExC_state, SEOL);
10248 RExC_seen_zerolen++; /* Do not optimize RE away */
10249 goto finish_meta_pat;
10251 ret = reg_node(pRExC_state, EOS);
10253 RExC_seen_zerolen++; /* Do not optimize RE away */
10254 goto finish_meta_pat;
10256 ret = reg_node(pRExC_state, CANY);
10257 RExC_seen |= REG_SEEN_CANY;
10258 *flagp |= HASWIDTH|SIMPLE;
10259 goto finish_meta_pat;
10261 ret = reg_node(pRExC_state, CLUMP);
10262 *flagp |= HASWIDTH;
10263 goto finish_meta_pat;
10269 arg = ANYOF_WORDCHAR;
10273 RExC_seen_zerolen++;
10274 RExC_seen |= REG_SEEN_LOOKBEHIND;
10275 op = BOUND + get_regex_charset(RExC_flags);
10276 if (op > BOUNDA) { /* /aa is same as /a */
10279 ret = reg_node(pRExC_state, op);
10280 FLAGS(ret) = get_regex_charset(RExC_flags);
10282 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10283 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10285 goto finish_meta_pat;
10287 RExC_seen_zerolen++;
10288 RExC_seen |= REG_SEEN_LOOKBEHIND;
10289 op = NBOUND + get_regex_charset(RExC_flags);
10290 if (op > NBOUNDA) { /* /aa is same as /a */
10293 ret = reg_node(pRExC_state, op);
10294 FLAGS(ret) = get_regex_charset(RExC_flags);
10296 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10297 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10299 goto finish_meta_pat;
10309 ret = reg_node(pRExC_state, LNBREAK);
10310 *flagp |= HASWIDTH|SIMPLE;
10311 goto finish_meta_pat;
10319 goto join_posix_op_known;
10325 arg = ANYOF_VERTWS;
10327 goto join_posix_op_known;
10337 op = POSIXD + get_regex_charset(RExC_flags);
10338 if (op > POSIXA) { /* /aa is same as /a */
10342 join_posix_op_known:
10345 op += NPOSIXD - POSIXD;
10348 ret = reg_node(pRExC_state, op);
10350 FLAGS(ret) = namedclass_to_classnum(arg);
10353 *flagp |= HASWIDTH|SIMPLE;
10357 nextchar(pRExC_state);
10358 Set_Node_Length(ret, 2); /* MJD */
10364 char* parse_start = RExC_parse - 2;
10369 ret = regclass(pRExC_state, flagp,depth+1,
10370 TRUE, /* means just parse this element */
10371 FALSE, /* don't allow multi-char folds */
10372 FALSE, /* don't silence non-portable warnings.
10373 It would be a bug if these returned
10379 Set_Node_Offset(ret, parse_start + 2);
10380 Set_Node_Cur_Length(ret);
10381 nextchar(pRExC_state);
10385 /* Handle \N and \N{NAME} with multiple code points here and not
10386 * below because it can be multicharacter. join_exact() will join
10387 * them up later on. Also this makes sure that things like
10388 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10389 * The options to the grok function call causes it to fail if the
10390 * sequence is just a single code point. We then go treat it as
10391 * just another character in the current EXACT node, and hence it
10392 * gets uniform treatment with all the other characters. The
10393 * special treatment for quantifiers is not needed for such single
10394 * character sequences */
10396 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10397 FALSE /* not strict */ )) {
10402 case 'k': /* Handle \k<NAME> and \k'NAME' */
10405 char ch= RExC_parse[1];
10406 if (ch != '<' && ch != '\'' && ch != '{') {
10408 vFAIL2("Sequence %.2s... not terminated",parse_start);
10410 /* this pretty much dupes the code for (?P=...) in reg(), if
10411 you change this make sure you change that */
10412 char* name_start = (RExC_parse += 2);
10414 SV *sv_dat = reg_scan_name(pRExC_state,
10415 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10416 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10417 if (RExC_parse == name_start || *RExC_parse != ch)
10418 vFAIL2("Sequence %.3s... not terminated",parse_start);
10421 num = add_data( pRExC_state, 1, "S" );
10422 RExC_rxi->data->data[num]=(void*)sv_dat;
10423 SvREFCNT_inc_simple_void(sv_dat);
10427 ret = reganode(pRExC_state,
10430 : (ASCII_FOLD_RESTRICTED)
10432 : (AT_LEAST_UNI_SEMANTICS)
10438 *flagp |= HASWIDTH;
10440 /* override incorrect value set in reganode MJD */
10441 Set_Node_Offset(ret, parse_start+1);
10442 Set_Node_Cur_Length(ret); /* MJD */
10443 nextchar(pRExC_state);
10449 case '1': case '2': case '3': case '4':
10450 case '5': case '6': case '7': case '8': case '9':
10453 bool isg = *RExC_parse == 'g';
10458 if (*RExC_parse == '{') {
10462 if (*RExC_parse == '-') {
10466 if (hasbrace && !isDIGIT(*RExC_parse)) {
10467 if (isrel) RExC_parse--;
10469 goto parse_named_seq;
10471 num = atoi(RExC_parse);
10472 if (isg && num == 0)
10473 vFAIL("Reference to invalid group 0");
10475 num = RExC_npar - num;
10477 vFAIL("Reference to nonexistent or unclosed group");
10479 if (!isg && num > 9 && num >= RExC_npar)
10480 /* Probably a character specified in octal, e.g. \35 */
10483 char * const parse_start = RExC_parse - 1; /* MJD */
10484 while (isDIGIT(*RExC_parse))
10486 if (parse_start == RExC_parse - 1)
10487 vFAIL("Unterminated \\g... pattern");
10489 if (*RExC_parse != '}')
10490 vFAIL("Unterminated \\g{...} pattern");
10494 if (num > (I32)RExC_rx->nparens)
10495 vFAIL("Reference to nonexistent group");
10498 ret = reganode(pRExC_state,
10501 : (ASCII_FOLD_RESTRICTED)
10503 : (AT_LEAST_UNI_SEMANTICS)
10509 *flagp |= HASWIDTH;
10511 /* override incorrect value set in reganode MJD */
10512 Set_Node_Offset(ret, parse_start+1);
10513 Set_Node_Cur_Length(ret); /* MJD */
10515 nextchar(pRExC_state);
10520 if (RExC_parse >= RExC_end)
10521 FAIL("Trailing \\");
10524 /* Do not generate "unrecognized" warnings here, we fall
10525 back into the quick-grab loop below */
10532 if (RExC_flags & RXf_PMf_EXTENDED) {
10533 if ( reg_skipcomment( pRExC_state ) )
10540 parse_start = RExC_parse - 1;
10549 #define MAX_NODE_STRING_SIZE 127
10550 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10552 U8 upper_parse = MAX_NODE_STRING_SIZE;
10555 bool next_is_quantifier;
10556 char * oldp = NULL;
10558 /* If a folding node contains only code points that don't
10559 * participate in folds, it can be changed into an EXACT node,
10560 * which allows the optimizer more things to look for */
10564 node_type = compute_EXACTish(pRExC_state);
10565 ret = reg_node(pRExC_state, node_type);
10567 /* In pass1, folded, we use a temporary buffer instead of the
10568 * actual node, as the node doesn't exist yet */
10569 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10575 /* We do the EXACTFish to EXACT node only if folding, and not if in
10576 * locale, as whether a character folds or not isn't known until
10578 maybe_exact = FOLD && ! LOC;
10580 /* XXX The node can hold up to 255 bytes, yet this only goes to
10581 * 127. I (khw) do not know why. Keeping it somewhat less than
10582 * 255 allows us to not have to worry about overflow due to
10583 * converting to utf8 and fold expansion, but that value is
10584 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10585 * split up by this limit into a single one using the real max of
10586 * 255. Even at 127, this breaks under rare circumstances. If
10587 * folding, we do not want to split a node at a character that is a
10588 * non-final in a multi-char fold, as an input string could just
10589 * happen to want to match across the node boundary. The join
10590 * would solve that problem if the join actually happens. But a
10591 * series of more than two nodes in a row each of 127 would cause
10592 * the first join to succeed to get to 254, but then there wouldn't
10593 * be room for the next one, which could at be one of those split
10594 * multi-char folds. I don't know of any fool-proof solution. One
10595 * could back off to end with only a code point that isn't such a
10596 * non-final, but it is possible for there not to be any in the
10598 for (p = RExC_parse - 1;
10599 len < upper_parse && p < RExC_end;
10604 if (RExC_flags & RXf_PMf_EXTENDED)
10605 p = regwhite( pRExC_state, p );
10616 /* Literal Escapes Switch
10618 This switch is meant to handle escape sequences that
10619 resolve to a literal character.
10621 Every escape sequence that represents something
10622 else, like an assertion or a char class, is handled
10623 in the switch marked 'Special Escapes' above in this
10624 routine, but also has an entry here as anything that
10625 isn't explicitly mentioned here will be treated as
10626 an unescaped equivalent literal.
10629 switch ((U8)*++p) {
10630 /* These are all the special escapes. */
10631 case 'A': /* Start assertion */
10632 case 'b': case 'B': /* Word-boundary assertion*/
10633 case 'C': /* Single char !DANGEROUS! */
10634 case 'd': case 'D': /* digit class */
10635 case 'g': case 'G': /* generic-backref, pos assertion */
10636 case 'h': case 'H': /* HORIZWS */
10637 case 'k': case 'K': /* named backref, keep marker */
10638 case 'p': case 'P': /* Unicode property */
10639 case 'R': /* LNBREAK */
10640 case 's': case 'S': /* space class */
10641 case 'v': case 'V': /* VERTWS */
10642 case 'w': case 'W': /* word class */
10643 case 'X': /* eXtended Unicode "combining character sequence" */
10644 case 'z': case 'Z': /* End of line/string assertion */
10648 /* Anything after here is an escape that resolves to a
10649 literal. (Except digits, which may or may not)
10655 case 'N': /* Handle a single-code point named character. */
10656 /* The options cause it to fail if a multiple code
10657 * point sequence. Handle those in the switch() above
10659 RExC_parse = p + 1;
10660 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10661 flagp, depth, FALSE,
10662 FALSE /* not strict */ ))
10664 RExC_parse = p = oldp;
10668 if (ender > 0xff) {
10685 ender = ASCII_TO_NATIVE('\033');
10689 ender = ASCII_TO_NATIVE('\007');
10695 const char* error_msg;
10697 bool valid = grok_bslash_o(&p,
10700 TRUE, /* out warnings */
10701 FALSE, /* not strict */
10702 TRUE, /* Output warnings
10707 RExC_parse = p; /* going to die anyway; point
10708 to exact spot of failure */
10712 if (PL_encoding && ender < 0x100) {
10713 goto recode_encoding;
10715 if (ender > 0xff) {
10722 UV result = UV_MAX; /* initialize to erroneous
10724 const char* error_msg;
10726 bool valid = grok_bslash_x(&p,
10729 TRUE, /* out warnings */
10730 FALSE, /* not strict */
10731 TRUE, /* Output warnings
10736 RExC_parse = p; /* going to die anyway; point
10737 to exact spot of failure */
10742 if (PL_encoding && ender < 0x100) {
10743 goto recode_encoding;
10745 if (ender > 0xff) {
10752 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10754 case '0': case '1': case '2': case '3':case '4':
10755 case '5': case '6': case '7':
10757 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10759 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10761 ender = grok_oct(p, &numlen, &flags, NULL);
10762 if (ender > 0xff) {
10766 if (SIZE_ONLY /* like \08, \178 */
10769 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10771 reg_warn_non_literal_string(
10773 form_short_octal_warning(p, numlen));
10776 else { /* Not to be treated as an octal constant, go
10781 if (PL_encoding && ender < 0x100)
10782 goto recode_encoding;
10785 if (! RExC_override_recoding) {
10786 SV* enc = PL_encoding;
10787 ender = reg_recode((const char)(U8)ender, &enc);
10788 if (!enc && SIZE_ONLY)
10789 ckWARNreg(p, "Invalid escape in the specified encoding");
10795 FAIL("Trailing \\");
10798 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10799 /* Include any { following the alpha to emphasize
10800 * that it could be part of an escape at some point
10802 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10803 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10805 goto normal_default;
10806 } /* End of switch on '\' */
10808 default: /* A literal character */
10811 && RExC_flags & RXf_PMf_EXTENDED
10812 && ckWARN(WARN_DEPRECATED)
10813 && is_PATWS_non_low(p, UTF))
10815 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
10816 "Escape literal pattern white space under /x");
10820 if (UTF8_IS_START(*p) && UTF) {
10822 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10823 &numlen, UTF8_ALLOW_DEFAULT);
10829 } /* End of switch on the literal */
10831 /* Here, have looked at the literal character and <ender>
10832 * contains its ordinal, <p> points to the character after it
10835 if ( RExC_flags & RXf_PMf_EXTENDED)
10836 p = regwhite( pRExC_state, p );
10838 /* If the next thing is a quantifier, it applies to this
10839 * character only, which means that this character has to be in
10840 * its own node and can't just be appended to the string in an
10841 * existing node, so if there are already other characters in
10842 * the node, close the node with just them, and set up to do
10843 * this character again next time through, when it will be the
10844 * only thing in its new node */
10845 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10853 /* See comments for join_exact() as to why we fold
10854 * this non-UTF at compile time */
10855 || (node_type == EXACTFU
10856 && ender == LATIN_SMALL_LETTER_SHARP_S))
10860 /* Prime the casefolded buffer. Locale rules, which
10861 * apply only to code points < 256, aren't known until
10862 * execution, so for them, just output the original
10863 * character using utf8. If we start to fold non-UTF
10864 * patterns, be sure to update join_exact() */
10865 if (LOC && ender < 256) {
10866 if (UNI_IS_INVARIANT(ender)) {
10870 *s = UTF8_TWO_BYTE_HI(ender);
10871 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10876 UV folded = _to_uni_fold_flags(
10881 | ((LOC) ? FOLD_FLAGS_LOCALE
10882 : (ASCII_FOLD_RESTRICTED)
10883 ? FOLD_FLAGS_NOMIX_ASCII
10887 /* If this node only contains non-folding code
10888 * points so far, see if this new one is also
10891 if (folded != ender) {
10892 maybe_exact = FALSE;
10895 /* Here the fold is the original; we have
10896 * to check further to see if anything
10898 if (! PL_utf8_foldable) {
10899 SV* swash = swash_init("utf8",
10901 &PL_sv_undef, 1, 0);
10903 _get_swash_invlist(swash);
10904 SvREFCNT_dec_NN(swash);
10906 if (_invlist_contains_cp(PL_utf8_foldable,
10909 maybe_exact = FALSE;
10917 /* The loop increments <len> each time, as all but this
10918 * path (and the one just below for UTF) through it add
10919 * a single byte to the EXACTish node. But this one
10920 * has changed len to be the correct final value, so
10921 * subtract one to cancel out the increment that
10923 len += foldlen - 1;
10926 *(s++) = (char) ender;
10927 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10931 const STRLEN unilen = reguni(pRExC_state, ender, s);
10937 /* See comment just above for - 1 */
10941 REGC((char)ender, s++);
10944 if (next_is_quantifier) {
10946 /* Here, the next input is a quantifier, and to get here,
10947 * the current character is the only one in the node.
10948 * Also, here <len> doesn't include the final byte for this
10954 } /* End of loop through literal characters */
10956 /* Here we have either exhausted the input or ran out of room in
10957 * the node. (If we encountered a character that can't be in the
10958 * node, transfer is made directly to <loopdone>, and so we
10959 * wouldn't have fallen off the end of the loop.) In the latter
10960 * case, we artificially have to split the node into two, because
10961 * we just don't have enough space to hold everything. This
10962 * creates a problem if the final character participates in a
10963 * multi-character fold in the non-final position, as a match that
10964 * should have occurred won't, due to the way nodes are matched,
10965 * and our artificial boundary. So back off until we find a non-
10966 * problematic character -- one that isn't at the beginning or
10967 * middle of such a fold. (Either it doesn't participate in any
10968 * folds, or appears only in the final position of all the folds it
10969 * does participate in.) A better solution with far fewer false
10970 * positives, and that would fill the nodes more completely, would
10971 * be to actually have available all the multi-character folds to
10972 * test against, and to back-off only far enough to be sure that
10973 * this node isn't ending with a partial one. <upper_parse> is set
10974 * further below (if we need to reparse the node) to include just
10975 * up through that final non-problematic character that this code
10976 * identifies, so when it is set to less than the full node, we can
10977 * skip the rest of this */
10978 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10980 const STRLEN full_len = len;
10982 assert(len >= MAX_NODE_STRING_SIZE);
10984 /* Here, <s> points to the final byte of the final character.
10985 * Look backwards through the string until find a non-
10986 * problematic character */
10990 /* These two have no multi-char folds to non-UTF characters
10992 if (ASCII_FOLD_RESTRICTED || LOC) {
10996 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11000 if (! PL_NonL1NonFinalFold) {
11001 PL_NonL1NonFinalFold = _new_invlist_C_array(
11002 NonL1_Perl_Non_Final_Folds_invlist);
11005 /* Point to the first byte of the final character */
11006 s = (char *) utf8_hop((U8 *) s, -1);
11008 while (s >= s0) { /* Search backwards until find
11009 non-problematic char */
11010 if (UTF8_IS_INVARIANT(*s)) {
11012 /* There are no ascii characters that participate
11013 * in multi-char folds under /aa. In EBCDIC, the
11014 * non-ascii invariants are all control characters,
11015 * so don't ever participate in any folds. */
11016 if (ASCII_FOLD_RESTRICTED
11017 || ! IS_NON_FINAL_FOLD(*s))
11022 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11024 /* No Latin1 characters participate in multi-char
11025 * folds under /l */
11027 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11033 else if (! _invlist_contains_cp(
11034 PL_NonL1NonFinalFold,
11035 valid_utf8_to_uvchr((U8 *) s, NULL)))
11040 /* Here, the current character is problematic in that
11041 * it does occur in the non-final position of some
11042 * fold, so try the character before it, but have to
11043 * special case the very first byte in the string, so
11044 * we don't read outside the string */
11045 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11046 } /* End of loop backwards through the string */
11048 /* If there were only problematic characters in the string,
11049 * <s> will point to before s0, in which case the length
11050 * should be 0, otherwise include the length of the
11051 * non-problematic character just found */
11052 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11055 /* Here, have found the final character, if any, that is
11056 * non-problematic as far as ending the node without splitting
11057 * it across a potential multi-char fold. <len> contains the
11058 * number of bytes in the node up-to and including that
11059 * character, or is 0 if there is no such character, meaning
11060 * the whole node contains only problematic characters. In
11061 * this case, give up and just take the node as-is. We can't
11067 /* Here, the node does contain some characters that aren't
11068 * problematic. If one such is the final character in the
11069 * node, we are done */
11070 if (len == full_len) {
11073 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11075 /* If the final character is problematic, but the
11076 * penultimate is not, back-off that last character to
11077 * later start a new node with it */
11082 /* Here, the final non-problematic character is earlier
11083 * in the input than the penultimate character. What we do
11084 * is reparse from the beginning, going up only as far as
11085 * this final ok one, thus guaranteeing that the node ends
11086 * in an acceptable character. The reason we reparse is
11087 * that we know how far in the character is, but we don't
11088 * know how to correlate its position with the input parse.
11089 * An alternate implementation would be to build that
11090 * correlation as we go along during the original parse,
11091 * but that would entail extra work for every node, whereas
11092 * this code gets executed only when the string is too
11093 * large for the node, and the final two characters are
11094 * problematic, an infrequent occurrence. Yet another
11095 * possible strategy would be to save the tail of the
11096 * string, and the next time regatom is called, initialize
11097 * with that. The problem with this is that unless you
11098 * back off one more character, you won't be guaranteed
11099 * regatom will get called again, unless regbranch,
11100 * regpiece ... are also changed. If you do back off that
11101 * extra character, so that there is input guaranteed to
11102 * force calling regatom, you can't handle the case where
11103 * just the first character in the node is acceptable. I
11104 * (khw) decided to try this method which doesn't have that
11105 * pitfall; if performance issues are found, we can do a
11106 * combination of the current approach plus that one */
11112 } /* End of verifying node ends with an appropriate char */
11114 loopdone: /* Jumped to when encounters something that shouldn't be in
11117 /* If 'maybe_exact' is still set here, means there are no
11118 * code points in the node that participate in folds */
11119 if (FOLD && maybe_exact) {
11123 /* I (khw) don't know if you can get here with zero length, but the
11124 * old code handled this situation by creating a zero-length EXACT
11125 * node. Might as well be NOTHING instead */
11130 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11133 RExC_parse = p - 1;
11134 Set_Node_Cur_Length(ret); /* MJD */
11135 nextchar(pRExC_state);
11137 /* len is STRLEN which is unsigned, need to copy to signed */
11140 vFAIL("Internal disaster");
11143 } /* End of label 'defchar:' */
11145 } /* End of giant switch on input character */
11151 S_regwhite( RExC_state_t *pRExC_state, char *p )
11153 const char *e = RExC_end;
11155 PERL_ARGS_ASSERT_REGWHITE;
11160 else if (*p == '#') {
11163 if (*p++ == '\n') {
11169 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11178 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11180 /* Returns the next non-pattern-white space, non-comment character (the
11181 * latter only if 'recognize_comment is true) in the string p, which is
11182 * ended by RExC_end. If there is no line break ending a comment,
11183 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11184 const char *e = RExC_end;
11186 PERL_ARGS_ASSERT_REGPATWS;
11190 if ((len = is_PATWS_safe(p, e, UTF))) {
11193 else if (recognize_comment && *p == '#') {
11197 if (is_LNBREAK_safe(p, e, UTF)) {
11203 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11211 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11212 Character classes ([:foo:]) can also be negated ([:^foo:]).
11213 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11214 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11215 but trigger failures because they are currently unimplemented. */
11217 #define POSIXCC_DONE(c) ((c) == ':')
11218 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11219 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11221 PERL_STATIC_INLINE I32
11222 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me,
11226 I32 namedclass = OOB_NAMEDCLASS;
11228 PERL_ARGS_ASSERT_REGPPOSIXCC;
11230 if (value == '[' && RExC_parse + 1 < RExC_end &&
11231 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11232 POSIXCC(UCHARAT(RExC_parse)))
11234 const char c = UCHARAT(RExC_parse);
11235 char* const s = RExC_parse++;
11237 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11239 if (RExC_parse == RExC_end) {
11242 /* Try to give a better location for the error (than the end of
11243 * the string) by looking for the matching ']' */
11245 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11248 vFAIL2("Unmatched '%c' in POSIX class", c);
11250 /* Grandfather lone [:, [=, [. */
11254 const char* const t = RExC_parse++; /* skip over the c */
11257 if (UCHARAT(RExC_parse) == ']') {
11258 const char *posixcc = s + 1;
11259 RExC_parse++; /* skip over the ending ] */
11262 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11263 const I32 skip = t - posixcc;
11265 /* Initially switch on the length of the name. */
11268 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11269 this is the Perl \w
11271 namedclass = ANYOF_WORDCHAR;
11274 /* Names all of length 5. */
11275 /* alnum alpha ascii blank cntrl digit graph lower
11276 print punct space upper */
11277 /* Offset 4 gives the best switch position. */
11278 switch (posixcc[4]) {
11280 if (memEQ(posixcc, "alph", 4)) /* alpha */
11281 namedclass = ANYOF_ALPHA;
11284 if (memEQ(posixcc, "spac", 4)) /* space */
11285 namedclass = ANYOF_PSXSPC;
11288 if (memEQ(posixcc, "grap", 4)) /* graph */
11289 namedclass = ANYOF_GRAPH;
11292 if (memEQ(posixcc, "asci", 4)) /* ascii */
11293 namedclass = ANYOF_ASCII;
11296 if (memEQ(posixcc, "blan", 4)) /* blank */
11297 namedclass = ANYOF_BLANK;
11300 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11301 namedclass = ANYOF_CNTRL;
11304 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11305 namedclass = ANYOF_ALPHANUMERIC;
11308 if (memEQ(posixcc, "lowe", 4)) /* lower */
11309 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11310 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11311 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11314 if (memEQ(posixcc, "digi", 4)) /* digit */
11315 namedclass = ANYOF_DIGIT;
11316 else if (memEQ(posixcc, "prin", 4)) /* print */
11317 namedclass = ANYOF_PRINT;
11318 else if (memEQ(posixcc, "punc", 4)) /* punct */
11319 namedclass = ANYOF_PUNCT;
11324 if (memEQ(posixcc, "xdigit", 6))
11325 namedclass = ANYOF_XDIGIT;
11329 if (namedclass == OOB_NAMEDCLASS)
11330 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11333 /* The #defines are structured so each complement is +1 to
11334 * the normal one */
11338 assert (posixcc[skip] == ':');
11339 assert (posixcc[skip+1] == ']');
11340 } else if (!SIZE_ONLY) {
11341 /* [[=foo=]] and [[.foo.]] are still future. */
11343 /* adjust RExC_parse so the warning shows after
11344 the class closes */
11345 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11347 SvREFCNT_dec(free_me);
11348 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11351 /* Maternal grandfather:
11352 * "[:" ending in ":" but not in ":]" */
11354 vFAIL("Unmatched '[' in POSIX class");
11357 /* Grandfather lone [:, [=, [. */
11367 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11369 /* This applies some heuristics at the current parse position (which should
11370 * be at a '[') to see if what follows might be intended to be a [:posix:]
11371 * class. It returns true if it really is a posix class, of course, but it
11372 * also can return true if it thinks that what was intended was a posix
11373 * class that didn't quite make it.
11375 * It will return true for
11377 * [:alphanumerics] (as long as the ] isn't followed immediately by a
11378 * ')' indicating the end of the (?[
11379 * [:any garbage including %^&$ punctuation:]
11381 * This is designed to be called only from S_handle_regex_sets; it could be
11382 * easily adapted to be called from the spot at the beginning of regclass()
11383 * that checks to see in a normal bracketed class if the surrounding []
11384 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
11385 * change long-standing behavior, so I (khw) didn't do that */
11386 char* p = RExC_parse + 1;
11387 char first_char = *p;
11389 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11391 assert(*(p - 1) == '[');
11393 if (! POSIXCC(first_char)) {
11398 while (p < RExC_end && isWORDCHAR(*p)) p++;
11400 if (p >= RExC_end) {
11404 if (p - RExC_parse > 2 /* Got at least 1 word character */
11405 && (*p == first_char
11406 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11411 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11414 && p - RExC_parse > 2 /* [:] evaluates to colon;
11415 [::] is a bad posix class. */
11416 && first_char == *(p - 1));
11420 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11421 char * const oregcomp_parse)
11423 /* Handle the (?[...]) construct to do set operations */
11426 UV start, end; /* End points of code point ranges */
11428 char *save_end, *save_parse;
11433 const bool save_fold = FOLD;
11435 GET_RE_DEBUG_FLAGS_DECL;
11437 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11440 vFAIL("(?[...]) not valid in locale");
11442 RExC_uni_semantics = 1;
11444 /* This will return only an ANYOF regnode, or (unlikely) something smaller
11445 * (such as EXACT). Thus we can skip most everything if just sizing. We
11446 * call regclass to handle '[]' so as to not have to reinvent its parsing
11447 * rules here (throwing away the size it computes each time). And, we exit
11448 * upon an unescaped ']' that isn't one ending a regclass. To do both
11449 * these things, we need to realize that something preceded by a backslash
11450 * is escaped, so we have to keep track of backslashes */
11453 Perl_ck_warner_d(aTHX_
11454 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11455 "The regex_sets feature is experimental" REPORT_LOCATION,
11456 (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11458 while (RExC_parse < RExC_end) {
11459 SV* current = NULL;
11460 RExC_parse = regpatws(pRExC_state, RExC_parse,
11461 TRUE); /* means recognize comments */
11462 switch (*RExC_parse) {
11466 /* Skip the next byte (which could cause us to end up in
11467 * the middle of a UTF-8 character, but since none of those
11468 * are confusable with anything we currently handle in this
11469 * switch (invariants all), it's safe. We'll just hit the
11470 * default: case next time and keep on incrementing until
11471 * we find one of the invariants we do handle. */
11476 /* If this looks like it is a [:posix:] class, leave the
11477 * parse pointer at the '[' to fool regclass() into
11478 * thinking it is part of a '[[:posix:]]'. That function
11479 * will use strict checking to force a syntax error if it
11480 * doesn't work out to a legitimate class */
11481 bool is_posix_class
11482 = could_it_be_a_POSIX_class(pRExC_state);
11483 if (! is_posix_class) {
11487 (void) regclass(pRExC_state, flagp,depth+1,
11488 is_posix_class, /* parse the whole char
11489 class only if not a
11491 FALSE, /* don't allow multi-char folds */
11492 TRUE, /* silence non-portable warnings. */
11494 /* function call leaves parse pointing to the ']', except
11495 * if we faked it */
11496 if (is_posix_class) {
11500 SvREFCNT_dec(current); /* In case it returned something */
11506 if (RExC_parse < RExC_end
11507 && *RExC_parse == ')')
11509 node = reganode(pRExC_state, ANYOF, 0);
11510 RExC_size += ANYOF_SKIP;
11511 nextchar(pRExC_state);
11512 Set_Node_Length(node,
11513 RExC_parse - oregcomp_parse + 1); /* MJD */
11522 FAIL("Syntax error in (?[...])");
11525 /* Pass 2 only after this. Everything in this construct is a
11526 * metacharacter. Operands begin with either a '\' (for an escape
11527 * sequence), or a '[' for a bracketed character class. Any other
11528 * character should be an operator, or parenthesis for grouping. Both
11529 * types of operands are handled by calling regclass() to parse them. It
11530 * is called with a parameter to indicate to return the computed inversion
11531 * list. The parsing here is implemented via a stack. Each entry on the
11532 * stack is a single character representing one of the operators, or the
11533 * '('; or else a pointer to an operand inversion list. */
11535 #define IS_OPERAND(a) (! SvIOK(a))
11537 /* The stack starts empty. It is a syntax error if the first thing parsed
11538 * is a binary operator; everything else is pushed on the stack. When an
11539 * operand is parsed, the top of the stack is examined. If it is a binary
11540 * operator, the item before it should be an operand, and both are replaced
11541 * by the result of doing that operation on the new operand and the one on
11542 * the stack. Thus a sequence of binary operands is reduced to a single
11543 * one before the next one is parsed.
11545 * A unary operator may immediately follow a binary in the input, for
11548 * When an operand is parsed and the top of the stack is a unary operator,
11549 * the operation is performed, and then the stack is rechecked to see if
11550 * this new operand is part of a binary operation; if so, it is handled as
11553 * A '(' is simply pushed on the stack; it is valid only if the stack is
11554 * empty, or the top element of the stack is an operator or another '('
11555 * (for which the parenthesized expression will become an operand). By the
11556 * time the corresponding ')' is parsed everything in between should have
11557 * been parsed and evaluated to a single operand (or else is a syntax
11558 * error), and is handled as a regular operand */
11562 while (RExC_parse < RExC_end) {
11563 I32 top_index = av_tindex(stack);
11565 SV* current = NULL;
11567 /* Skip white space */
11568 RExC_parse = regpatws(pRExC_state, RExC_parse,
11569 TRUE); /* means recognize comments */
11570 if (RExC_parse >= RExC_end) {
11571 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11573 if ((curchar = UCHARAT(RExC_parse)) == ']') {
11580 if (av_tindex(stack) >= 0 /* This makes sure that we can
11581 safely subtract 1 from
11582 RExC_parse in the next clause.
11583 If we have something on the
11584 stack, we have parsed something
11586 && UCHARAT(RExC_parse - 1) == '('
11587 && RExC_parse < RExC_end)
11589 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11590 * This happens when we have some thing like
11592 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11594 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
11596 * Here we would be handling the interpolated
11597 * '$thai_or_lao'. We handle this by a recursive call to
11598 * ourselves which returns the inversion list the
11599 * interpolated expression evaluates to. We use the flags
11600 * from the interpolated pattern. */
11601 U32 save_flags = RExC_flags;
11602 const char * const save_parse = ++RExC_parse;
11604 parse_lparen_question_flags(pRExC_state);
11606 if (RExC_parse == save_parse /* Makes sure there was at
11607 least one flag (or this
11608 embedding wasn't compiled)
11610 || RExC_parse >= RExC_end - 4
11611 || UCHARAT(RExC_parse) != ':'
11612 || UCHARAT(++RExC_parse) != '('
11613 || UCHARAT(++RExC_parse) != '?'
11614 || UCHARAT(++RExC_parse) != '[')
11617 /* In combination with the above, this moves the
11618 * pointer to the point just after the first erroneous
11619 * character (or if there are no flags, to where they
11620 * should have been) */
11621 if (RExC_parse >= RExC_end - 4) {
11622 RExC_parse = RExC_end;
11624 else if (RExC_parse != save_parse) {
11625 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11627 vFAIL("Expecting '(?flags:(?[...'");
11630 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
11631 depth+1, oregcomp_parse);
11633 /* Here, 'current' contains the embedded expression's
11634 * inversion list, and RExC_parse points to the trailing
11635 * ']'; the next character should be the ')' which will be
11636 * paired with the '(' that has been put on the stack, so
11637 * the whole embedded expression reduces to '(operand)' */
11640 RExC_flags = save_flags;
11641 goto handle_operand;
11646 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11647 vFAIL("Unexpected character");
11650 (void) regclass(pRExC_state, flagp,depth+1,
11651 TRUE, /* means parse just the next thing */
11652 FALSE, /* don't allow multi-char folds */
11653 FALSE, /* don't silence non-portable warnings.
11656 /* regclass() will return with parsing just the \ sequence,
11657 * leaving the parse pointer at the next thing to parse */
11659 goto handle_operand;
11661 case '[': /* Is a bracketed character class */
11663 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11665 if (! is_posix_class) {
11669 (void) regclass(pRExC_state, flagp,depth+1,
11670 is_posix_class, /* parse the whole char class
11671 only if not a posix class */
11672 FALSE, /* don't allow multi-char folds */
11673 FALSE, /* don't silence non-portable warnings.
11676 /* function call leaves parse pointing to the ']', except if we
11678 if (is_posix_class) {
11682 goto handle_operand;
11691 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11692 || ! IS_OPERAND(*top_ptr))
11695 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11697 av_push(stack, newSVuv(curchar));
11701 av_push(stack, newSVuv(curchar));
11705 if (top_index >= 0) {
11706 top_ptr = av_fetch(stack, top_index, FALSE);
11708 if (IS_OPERAND(*top_ptr)) {
11710 vFAIL("Unexpected '(' with no preceding operator");
11713 av_push(stack, newSVuv(curchar));
11720 || ! (current = av_pop(stack))
11721 || ! IS_OPERAND(current)
11722 || ! (lparen = av_pop(stack))
11723 || IS_OPERAND(lparen)
11724 || SvUV(lparen) != '(')
11727 vFAIL("Unexpected ')'");
11730 SvREFCNT_dec_NN(lparen);
11737 /* Here, we have an operand to process, in 'current' */
11739 if (top_index < 0) { /* Just push if stack is empty */
11740 av_push(stack, current);
11743 SV* top = av_pop(stack);
11744 char current_operator;
11746 if (IS_OPERAND(top)) {
11747 vFAIL("Operand with no preceding operator");
11749 current_operator = (char) SvUV(top);
11750 switch (current_operator) {
11751 case '(': /* Push the '(' back on followed by the new
11753 av_push(stack, top);
11754 av_push(stack, current);
11755 SvREFCNT_inc(top); /* Counters the '_dec' done
11756 just after the 'break', so
11757 it doesn't get wrongly freed
11762 _invlist_invert(current);
11764 /* Unlike binary operators, the top of the stack,
11765 * now that this unary one has been popped off, may
11766 * legally be an operator, and we now have operand
11769 SvREFCNT_dec_NN(top);
11770 goto handle_operand;
11773 _invlist_intersection(av_pop(stack),
11776 av_push(stack, current);
11781 _invlist_union(av_pop(stack), current, ¤t);
11782 av_push(stack, current);
11786 _invlist_subtract(av_pop(stack), current, ¤t);
11787 av_push(stack, current);
11790 case '^': /* The union minus the intersection */
11796 element = av_pop(stack);
11797 _invlist_union(element, current, &u);
11798 _invlist_intersection(element, current, &i);
11799 _invlist_subtract(u, i, ¤t);
11800 av_push(stack, current);
11801 SvREFCNT_dec_NN(i);
11802 SvREFCNT_dec_NN(u);
11803 SvREFCNT_dec_NN(element);
11808 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11810 SvREFCNT_dec_NN(top);
11814 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11817 if (av_tindex(stack) < 0 /* Was empty */
11818 || ((final = av_pop(stack)) == NULL)
11819 || ! IS_OPERAND(final)
11820 || av_tindex(stack) >= 0) /* More left on stack */
11822 vFAIL("Incomplete expression within '(?[ ])'");
11825 /* Here, 'final' is the resultant inversion list from evaluating the
11826 * expression. Return it if so requested */
11827 if (return_invlist) {
11828 *return_invlist = final;
11832 /* Otherwise generate a resultant node, based on 'final'. regclass() is
11833 * expecting a string of ranges and individual code points */
11834 invlist_iterinit(final);
11835 result_string = newSVpvs("");
11836 while (invlist_iternext(final, &start, &end)) {
11837 if (start == end) {
11838 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
11841 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
11846 save_parse = RExC_parse;
11847 RExC_parse = SvPV(result_string, len);
11848 save_end = RExC_end;
11849 RExC_end = RExC_parse + len;
11851 /* We turn off folding around the call, as the class we have constructed
11852 * already has all folding taken into consideration, and we don't want
11853 * regclass() to add to that */
11854 RExC_flags &= ~RXf_PMf_FOLD;
11855 node = regclass(pRExC_state, flagp,depth+1,
11856 FALSE, /* means parse the whole char class */
11857 FALSE, /* don't allow multi-char folds */
11858 TRUE, /* silence non-portable warnings. The above may very
11859 well have generated non-portable code points, but
11860 they're valid on this machine */
11863 RExC_flags |= RXf_PMf_FOLD;
11865 RExC_parse = save_parse + 1;
11866 RExC_end = save_end;
11867 SvREFCNT_dec_NN(final);
11868 SvREFCNT_dec_NN(result_string);
11869 SvREFCNT_dec_NN(stack);
11871 nextchar(pRExC_state);
11872 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
11877 /* The names of properties whose definitions are not known at compile time are
11878 * stored in this SV, after a constant heading. So if the length has been
11879 * changed since initialization, then there is a run-time definition. */
11880 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11883 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11884 const bool stop_at_1, /* Just parse the next thing, don't
11885 look for a full character class */
11886 bool allow_multi_folds,
11887 const bool silence_non_portable, /* Don't output warnings
11890 SV** ret_invlist) /* Return an inversion list, not a node */
11892 /* parse a bracketed class specification. Most of these will produce an
11893 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
11894 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
11895 * under /i with multi-character folds: it will be rewritten following the
11896 * paradigm of this example, where the <multi-fold>s are characters which
11897 * fold to multiple character sequences:
11898 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11899 * gets effectively rewritten as:
11900 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11901 * reg() gets called (recursively) on the rewritten version, and this
11902 * function will return what it constructs. (Actually the <multi-fold>s
11903 * aren't physically removed from the [abcdefghi], it's just that they are
11904 * ignored in the recursion by means of a flag:
11905 * <RExC_in_multi_char_class>.)
11907 * ANYOF nodes contain a bit map for the first 256 characters, with the
11908 * corresponding bit set if that character is in the list. For characters
11909 * above 255, a range list or swash is used. There are extra bits for \w,
11910 * etc. in locale ANYOFs, as what these match is not determinable at
11914 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11916 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11919 IV namedclass = OOB_NAMEDCLASS;
11920 char *rangebegin = NULL;
11921 bool need_class = 0;
11923 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11924 than just initialized. */
11925 SV* properties = NULL; /* Code points that match \p{} \P{} */
11926 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11927 extended beyond the Latin1 range */
11928 UV element_count = 0; /* Number of distinct elements in the class.
11929 Optimizations may be possible if this is tiny */
11930 AV * multi_char_matches = NULL; /* Code points that fold to more than one
11931 character; used under /i */
11933 char * stop_ptr = RExC_end; /* where to stop parsing */
11934 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
11936 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
11938 /* Unicode properties are stored in a swash; this holds the current one
11939 * being parsed. If this swash is the only above-latin1 component of the
11940 * character class, an optimization is to pass it directly on to the
11941 * execution engine. Otherwise, it is set to NULL to indicate that there
11942 * are other things in the class that have to be dealt with at execution
11944 SV* swash = NULL; /* Code points that match \p{} \P{} */
11946 /* Set if a component of this character class is user-defined; just passed
11947 * on to the engine */
11948 bool has_user_defined_property = FALSE;
11950 /* inversion list of code points this node matches only when the target
11951 * string is in UTF-8. (Because is under /d) */
11952 SV* depends_list = NULL;
11954 /* inversion list of code points this node matches. For much of the
11955 * function, it includes only those that match regardless of the utf8ness
11956 * of the target string */
11957 SV* cp_list = NULL;
11960 /* In a range, counts how many 0-2 of the ends of it came from literals,
11961 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11962 UV literal_endpoint = 0;
11964 bool invert = FALSE; /* Is this class to be complemented */
11966 /* Is there any thing like \W or [:^digit:] that matches above the legal
11967 * Unicode range? */
11968 bool runtime_posix_matches_above_Unicode = FALSE;
11970 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11971 case we need to change the emitted regop to an EXACT. */
11972 const char * orig_parse = RExC_parse;
11973 const I32 orig_size = RExC_size;
11974 GET_RE_DEBUG_FLAGS_DECL;
11976 PERL_ARGS_ASSERT_REGCLASS;
11978 PERL_UNUSED_ARG(depth);
11981 DEBUG_PARSE("clas");
11983 /* Assume we are going to generate an ANYOF node. */
11984 ret = reganode(pRExC_state, ANYOF, 0);
11987 RExC_size += ANYOF_SKIP;
11988 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11991 ANYOF_FLAGS(ret) = 0;
11993 RExC_emit += ANYOF_SKIP;
11995 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11997 listsv = newSVpvs("# comment\n");
11998 initial_listsv_len = SvCUR(listsv);
12002 RExC_parse = regpatws(pRExC_state, RExC_parse,
12003 FALSE /* means don't recognize comments */);
12006 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12009 allow_multi_folds = FALSE;
12012 RExC_parse = regpatws(pRExC_state, RExC_parse,
12013 FALSE /* means don't recognize comments */);
12017 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12018 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12019 const char *s = RExC_parse;
12020 const char c = *s++;
12022 while (isWORDCHAR(*s))
12024 if (*s && c == *s && s[1] == ']') {
12025 SAVEFREESV(RExC_rx_sv);
12026 SAVEFREESV(listsv);
12028 "POSIX syntax [%c %c] belongs inside character classes",
12030 (void)ReREFCNT_inc(RExC_rx_sv);
12031 SvREFCNT_inc_simple_void_NN(listsv);
12035 /* If the caller wants us to just parse a single element, accomplish this
12036 * by faking the loop ending condition */
12037 if (stop_at_1 && RExC_end > RExC_parse) {
12038 stop_ptr = RExC_parse + 1;
12041 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12042 if (UCHARAT(RExC_parse) == ']')
12043 goto charclassloop;
12047 if (RExC_parse >= stop_ptr) {
12052 RExC_parse = regpatws(pRExC_state, RExC_parse,
12053 FALSE /* means don't recognize comments */);
12056 if (UCHARAT(RExC_parse) == ']') {
12062 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12063 save_value = value;
12064 save_prevvalue = prevvalue;
12067 rangebegin = RExC_parse;
12071 value = utf8n_to_uvchr((U8*)RExC_parse,
12072 RExC_end - RExC_parse,
12073 &numlen, UTF8_ALLOW_DEFAULT);
12074 RExC_parse += numlen;
12077 value = UCHARAT(RExC_parse++);
12080 && RExC_parse < RExC_end
12081 && POSIXCC(UCHARAT(RExC_parse)))
12083 namedclass = regpposixcc(pRExC_state, value, listsv, strict);
12085 else if (value == '\\') {
12087 value = utf8n_to_uvchr((U8*)RExC_parse,
12088 RExC_end - RExC_parse,
12089 &numlen, UTF8_ALLOW_DEFAULT);
12090 RExC_parse += numlen;
12093 value = UCHARAT(RExC_parse++);
12095 /* Some compilers cannot handle switching on 64-bit integer
12096 * values, therefore value cannot be an UV. Yes, this will
12097 * be a problem later if we want switch on Unicode.
12098 * A similar issue a little bit later when switching on
12099 * namedclass. --jhi */
12101 /* If the \ is escaping white space when white space is being
12102 * skipped, it means that that white space is wanted literally, and
12103 * is already in 'value'. Otherwise, need to translate the escape
12104 * into what it signifies. */
12105 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12107 case 'w': namedclass = ANYOF_WORDCHAR; break;
12108 case 'W': namedclass = ANYOF_NWORDCHAR; break;
12109 case 's': namedclass = ANYOF_SPACE; break;
12110 case 'S': namedclass = ANYOF_NSPACE; break;
12111 case 'd': namedclass = ANYOF_DIGIT; break;
12112 case 'D': namedclass = ANYOF_NDIGIT; break;
12113 case 'v': namedclass = ANYOF_VERTWS; break;
12114 case 'V': namedclass = ANYOF_NVERTWS; break;
12115 case 'h': namedclass = ANYOF_HORIZWS; break;
12116 case 'H': namedclass = ANYOF_NHORIZWS; break;
12117 case 'N': /* Handle \N{NAME} in class */
12119 /* We only pay attention to the first char of
12120 multichar strings being returned. I kinda wonder
12121 if this makes sense as it does change the behaviour
12122 from earlier versions, OTOH that behaviour was broken
12124 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12125 TRUE, /* => charclass */
12137 /* We will handle any undefined properties ourselves */
12138 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12140 if (RExC_parse >= RExC_end)
12141 vFAIL2("Empty \\%c{}", (U8)value);
12142 if (*RExC_parse == '{') {
12143 const U8 c = (U8)value;
12144 e = strchr(RExC_parse++, '}');
12146 vFAIL2("Missing right brace on \\%c{}", c);
12147 while (isSPACE(UCHARAT(RExC_parse)))
12149 if (e == RExC_parse)
12150 vFAIL2("Empty \\%c{}", c);
12151 n = e - RExC_parse;
12152 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12163 if (UCHARAT(RExC_parse) == '^') {
12166 /* toggle. (The rhs xor gets the single bit that
12167 * differs between P and p; the other xor inverts just
12169 value ^= 'P' ^ 'p';
12171 while (isSPACE(UCHARAT(RExC_parse))) {
12176 /* Try to get the definition of the property into
12177 * <invlist>. If /i is in effect, the effective property
12178 * will have its name be <__NAME_i>. The design is
12179 * discussed in commit
12180 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12181 Newx(name, n + sizeof("_i__\n"), char);
12183 sprintf(name, "%s%.*s%s\n",
12184 (FOLD) ? "__" : "",
12190 /* Look up the property name, and get its swash and
12191 * inversion list, if the property is found */
12193 SvREFCNT_dec_NN(swash);
12195 swash = _core_swash_init("utf8", name, &PL_sv_undef,
12198 NULL, /* No inversion list */
12201 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12203 SvREFCNT_dec_NN(swash);
12207 /* Here didn't find it. It could be a user-defined
12208 * property that will be available at run-time. If we
12209 * accept only compile-time properties, is an error;
12210 * otherwise add it to the list for run-time look up */
12212 RExC_parse = e + 1;
12213 vFAIL3("Property '%.*s' is unknown", (int) n, name);
12215 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12216 (value == 'p' ? '+' : '!'),
12218 has_user_defined_property = TRUE;
12220 /* We don't know yet, so have to assume that the
12221 * property could match something in the Latin1 range,
12222 * hence something that isn't utf8. Note that this
12223 * would cause things in <depends_list> to match
12224 * inappropriately, except that any \p{}, including
12225 * this one forces Unicode semantics, which means there
12226 * is <no depends_list> */
12227 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12231 /* Here, did get the swash and its inversion list. If
12232 * the swash is from a user-defined property, then this
12233 * whole character class should be regarded as such */
12234 has_user_defined_property =
12236 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12238 /* Invert if asking for the complement */
12239 if (value == 'P') {
12240 _invlist_union_complement_2nd(properties,
12244 /* The swash can't be used as-is, because we've
12245 * inverted things; delay removing it to here after
12246 * have copied its invlist above */
12247 SvREFCNT_dec_NN(swash);
12251 _invlist_union(properties, invlist, &properties);
12256 RExC_parse = e + 1;
12257 namedclass = ANYOF_UNIPROP; /* no official name, but it's
12260 /* \p means they want Unicode semantics */
12261 RExC_uni_semantics = 1;
12264 case 'n': value = '\n'; break;
12265 case 'r': value = '\r'; break;
12266 case 't': value = '\t'; break;
12267 case 'f': value = '\f'; break;
12268 case 'b': value = '\b'; break;
12269 case 'e': value = ASCII_TO_NATIVE('\033');break;
12270 case 'a': value = ASCII_TO_NATIVE('\007');break;
12272 RExC_parse--; /* function expects to be pointed at the 'o' */
12274 const char* error_msg;
12275 bool valid = grok_bslash_o(&RExC_parse,
12278 SIZE_ONLY, /* warnings in pass
12281 silence_non_portable,
12287 if (PL_encoding && value < 0x100) {
12288 goto recode_encoding;
12292 RExC_parse--; /* function expects to be pointed at the 'x' */
12294 const char* error_msg;
12295 bool valid = grok_bslash_x(&RExC_parse,
12298 TRUE, /* Output warnings */
12300 silence_non_portable,
12306 if (PL_encoding && value < 0x100)
12307 goto recode_encoding;
12310 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12312 case '0': case '1': case '2': case '3': case '4':
12313 case '5': case '6': case '7':
12315 /* Take 1-3 octal digits */
12316 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12317 numlen = (strict) ? 4 : 3;
12318 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12319 RExC_parse += numlen;
12321 SAVEFREESV(listsv); /* In case warnings are fatalized */
12323 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12324 vFAIL("Need exactly 3 octal digits");
12326 else if (! SIZE_ONLY /* like \08, \178 */
12328 && RExC_parse < RExC_end
12329 && isDIGIT(*RExC_parse)
12330 && ckWARN(WARN_REGEXP))
12332 SAVEFREESV(RExC_rx_sv);
12333 reg_warn_non_literal_string(
12335 form_short_octal_warning(RExC_parse, numlen));
12336 (void)ReREFCNT_inc(RExC_rx_sv);
12338 SvREFCNT_inc_simple_void_NN(listsv);
12340 if (PL_encoding && value < 0x100)
12341 goto recode_encoding;
12345 if (! RExC_override_recoding) {
12346 SV* enc = PL_encoding;
12347 value = reg_recode((const char)(U8)value, &enc);
12350 vFAIL("Invalid escape in the specified encoding");
12352 else if (SIZE_ONLY) {
12353 ckWARNreg(RExC_parse,
12354 "Invalid escape in the specified encoding");
12360 /* Allow \_ to not give an error */
12361 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12362 SAVEFREESV(listsv);
12364 vFAIL2("Unrecognized escape \\%c in character class",
12368 SAVEFREESV(RExC_rx_sv);
12369 ckWARN2reg(RExC_parse,
12370 "Unrecognized escape \\%c in character class passed through",
12372 (void)ReREFCNT_inc(RExC_rx_sv);
12374 SvREFCNT_inc_simple_void_NN(listsv);
12377 } /* End of switch on char following backslash */
12378 } /* end of handling backslash escape sequences */
12381 literal_endpoint++;
12384 /* Here, we have the current token in 'value' */
12386 /* What matches in a locale is not known until runtime. This includes
12387 * what the Posix classes (like \w, [:space:]) match. Room must be
12388 * reserved (one time per class) to store such classes, either if Perl
12389 * is compiled so that locale nodes always should have this space, or
12390 * if there is such class info to be stored. The space will contain a
12391 * bit for each named class that is to be matched against. This isn't
12392 * needed for \p{} and pseudo-classes, as they are not affected by
12393 * locale, and hence are dealt with separately */
12396 && (ANYOF_LOCALE == ANYOF_CLASS
12397 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12401 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12404 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12405 ANYOF_CLASS_ZERO(ret);
12407 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12410 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12412 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
12413 * literal, as is the character that began the false range, i.e.
12414 * the 'a' in the examples */
12417 const int w = (RExC_parse >= rangebegin)
12418 ? RExC_parse - rangebegin
12420 SAVEFREESV(listsv); /* in case of fatal warnings */
12422 vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12425 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12426 ckWARN4reg(RExC_parse,
12427 "False [] range \"%*.*s\"",
12429 (void)ReREFCNT_inc(RExC_rx_sv);
12430 cp_list = add_cp_to_invlist(cp_list, '-');
12431 cp_list = add_cp_to_invlist(cp_list, prevvalue);
12433 SvREFCNT_inc_simple_void_NN(listsv);
12436 range = 0; /* this was not a true range */
12437 element_count += 2; /* So counts for three values */
12441 U8 classnum = namedclass_to_classnum(namedclass);
12442 if (namedclass >= ANYOF_MAX) { /* If a special class */
12443 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12445 /* Here, should be \h, \H, \v, or \V. Neither /d nor
12446 * /l make a difference in what these match. There
12447 * would be problems if these characters had folds
12448 * other than themselves, as cp_list is subject to
12450 if (classnum != _CC_VERTSPACE) {
12451 assert( namedclass == ANYOF_HORIZWS
12452 || namedclass == ANYOF_NHORIZWS);
12454 /* It turns out that \h is just a synonym for
12456 classnum = _CC_BLANK;
12459 _invlist_union_maybe_complement_2nd(
12461 PL_XPosix_ptrs[classnum],
12462 cBOOL(namedclass % 2), /* Complement if odd
12463 (NHORIZWS, NVERTWS)
12468 else if (classnum == _CC_ASCII) {
12471 ANYOF_CLASS_SET(ret, namedclass);
12474 #endif /* Not isascii(); just use the hard-coded definition for it */
12475 _invlist_union_maybe_complement_2nd(
12478 cBOOL(namedclass % 2), /* Complement if odd
12482 else { /* Garden variety class */
12484 /* The ascii range inversion list */
12485 SV* ascii_source = PL_Posix_ptrs[classnum];
12487 /* The full Latin1 range inversion list */
12488 SV* l1_source = PL_L1Posix_ptrs[classnum];
12490 /* This code is structured into two major clauses. The
12491 * first is for classes whose complete definitions may not
12492 * already be known. It not, the Latin1 definition
12493 * (guaranteed to already known) is used plus code is
12494 * generated to load the rest at run-time (only if needed).
12495 * If the complete definition is known, it drops down to
12496 * the second clause, where the complete definition is
12499 if (classnum < _FIRST_NON_SWASH_CC) {
12501 /* Here, the class has a swash, which may or not
12502 * already be loaded */
12504 /* The name of the property to use to match the full
12505 * eXtended Unicode range swash for this character
12507 const char *Xname = swash_property_names[classnum];
12509 /* If returning the inversion list, we can't defer
12510 * getting this until runtime */
12511 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
12512 PL_utf8_swash_ptrs[classnum] =
12513 _core_swash_init("utf8", Xname, &PL_sv_undef,
12516 NULL, /* No inversion list */
12517 NULL /* No flags */
12519 assert(PL_utf8_swash_ptrs[classnum]);
12521 if ( ! PL_utf8_swash_ptrs[classnum]) {
12522 if (namedclass % 2 == 0) { /* A non-complemented
12524 /* If not /a matching, there are code points we
12525 * don't know at compile time. Arrange for the
12526 * unknown matches to be loaded at run-time, if
12528 if (! AT_LEAST_ASCII_RESTRICTED) {
12529 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12532 if (LOC) { /* Under locale, set run-time
12534 ANYOF_CLASS_SET(ret, namedclass);
12537 /* Add the current class's code points to
12538 * the running total */
12539 _invlist_union(posixes,
12540 (AT_LEAST_ASCII_RESTRICTED)
12546 else { /* A complemented class */
12547 if (AT_LEAST_ASCII_RESTRICTED) {
12548 /* Under /a should match everything above
12549 * ASCII, plus the complement of the set's
12551 _invlist_union_complement_2nd(posixes,
12556 /* Arrange for the unknown matches to be
12557 * loaded at run-time, if needed */
12558 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12560 runtime_posix_matches_above_Unicode = TRUE;
12562 ANYOF_CLASS_SET(ret, namedclass);
12566 /* We want to match everything in
12567 * Latin1, except those things that
12568 * l1_source matches */
12569 SV* scratch_list = NULL;
12570 _invlist_subtract(PL_Latin1, l1_source,
12573 /* Add the list from this class to the
12576 posixes = scratch_list;
12579 _invlist_union(posixes,
12582 SvREFCNT_dec_NN(scratch_list);
12584 if (DEPENDS_SEMANTICS) {
12586 |= ANYOF_NON_UTF8_LATIN1_ALL;
12591 goto namedclass_done;
12594 /* Here, there is a swash loaded for the class. If no
12595 * inversion list for it yet, get it */
12596 if (! PL_XPosix_ptrs[classnum]) {
12597 PL_XPosix_ptrs[classnum]
12598 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12602 /* Here there is an inversion list already loaded for the
12605 if (namedclass % 2 == 0) { /* A non-complemented class,
12606 like ANYOF_PUNCT */
12608 /* For non-locale, just add it to any existing list
12610 _invlist_union(posixes,
12611 (AT_LEAST_ASCII_RESTRICTED)
12613 : PL_XPosix_ptrs[classnum],
12616 else { /* Locale */
12617 SV* scratch_list = NULL;
12619 /* For above Latin1 code points, we use the full
12621 _invlist_intersection(PL_AboveLatin1,
12622 PL_XPosix_ptrs[classnum],
12624 /* And set the output to it, adding instead if
12625 * there already is an output. Checking if
12626 * 'posixes' is NULL first saves an extra clone.
12627 * Its reference count will be decremented at the
12628 * next union, etc, or if this is the only
12629 * instance, at the end of the routine */
12631 posixes = scratch_list;
12634 _invlist_union(posixes, scratch_list, &posixes);
12635 SvREFCNT_dec_NN(scratch_list);
12638 #ifndef HAS_ISBLANK
12639 if (namedclass != ANYOF_BLANK) {
12641 /* Set this class in the node for runtime
12643 ANYOF_CLASS_SET(ret, namedclass);
12644 #ifndef HAS_ISBLANK
12647 /* No isblank(), use the hard-coded ASCII-range
12648 * blanks, adding them to the running total. */
12650 _invlist_union(posixes, ascii_source, &posixes);
12655 else { /* A complemented class, like ANYOF_NPUNCT */
12657 _invlist_union_complement_2nd(
12659 (AT_LEAST_ASCII_RESTRICTED)
12661 : PL_XPosix_ptrs[classnum],
12663 /* Under /d, everything in the upper half of the
12664 * Latin1 range matches this complement */
12665 if (DEPENDS_SEMANTICS) {
12666 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12669 else { /* Locale */
12670 SV* scratch_list = NULL;
12671 _invlist_subtract(PL_AboveLatin1,
12672 PL_XPosix_ptrs[classnum],
12675 posixes = scratch_list;
12678 _invlist_union(posixes, scratch_list, &posixes);
12679 SvREFCNT_dec_NN(scratch_list);
12681 #ifndef HAS_ISBLANK
12682 if (namedclass != ANYOF_NBLANK) {
12684 ANYOF_CLASS_SET(ret, namedclass);
12685 #ifndef HAS_ISBLANK
12688 /* Get the list of all code points in Latin1
12689 * that are not ASCII blanks, and add them to
12690 * the running total */
12691 _invlist_subtract(PL_Latin1, ascii_source,
12693 _invlist_union(posixes, scratch_list, &posixes);
12694 SvREFCNT_dec_NN(scratch_list);
12701 continue; /* Go get next character */
12703 } /* end of namedclass \blah */
12705 /* Here, we have a single value. If 'range' is set, it is the ending
12706 * of a range--check its validity. Later, we will handle each
12707 * individual code point in the range. If 'range' isn't set, this
12708 * could be the beginning of a range, so check for that by looking
12709 * ahead to see if the next real character to be processed is the range
12710 * indicator--the minus sign */
12713 RExC_parse = regpatws(pRExC_state, RExC_parse,
12714 FALSE /* means don't recognize comments */);
12718 if (prevvalue > value) /* b-a */ {
12719 const int w = RExC_parse - rangebegin;
12720 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12721 range = 0; /* not a valid range */
12725 prevvalue = value; /* save the beginning of the potential range */
12726 if (! stop_at_1 /* Can't be a range if parsing just one thing */
12727 && *RExC_parse == '-')
12729 char* next_char_ptr = RExC_parse + 1;
12730 if (skip_white) { /* Get the next real char after the '-' */
12731 next_char_ptr = regpatws(pRExC_state,
12733 FALSE); /* means don't recognize
12737 /* If the '-' is at the end of the class (just before the ']',
12738 * it is a literal minus; otherwise it is a range */
12739 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12740 RExC_parse = next_char_ptr;
12742 /* a bad range like \w-, [:word:]- ? */
12743 if (namedclass > OOB_NAMEDCLASS) {
12744 if (strict || ckWARN(WARN_REGEXP)) {
12746 RExC_parse >= rangebegin ?
12747 RExC_parse - rangebegin : 0;
12749 vFAIL4("False [] range \"%*.*s\"",
12754 "False [] range \"%*.*s\"",
12759 cp_list = add_cp_to_invlist(cp_list, '-');
12763 range = 1; /* yeah, it's a range! */
12764 continue; /* but do it the next time */
12769 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12772 /* non-Latin1 code point implies unicode semantics. Must be set in
12773 * pass1 so is there for the whole of pass 2 */
12775 RExC_uni_semantics = 1;
12778 /* Ready to process either the single value, or the completed range.
12779 * For single-valued non-inverted ranges, we consider the possibility
12780 * of multi-char folds. (We made a conscious decision to not do this
12781 * for the other cases because it can often lead to non-intuitive
12782 * results. For example, you have the peculiar case that:
12783 * "s s" =~ /^[^\xDF]+$/i => Y
12784 * "ss" =~ /^[^\xDF]+$/i => N
12786 * See [perl #89750] */
12787 if (FOLD && allow_multi_folds && value == prevvalue) {
12788 if (value == LATIN_SMALL_LETTER_SHARP_S
12789 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12792 /* Here <value> is indeed a multi-char fold. Get what it is */
12794 U8 foldbuf[UTF8_MAXBYTES_CASE];
12797 UV folded = _to_uni_fold_flags(
12802 | ((LOC) ? FOLD_FLAGS_LOCALE
12803 : (ASCII_FOLD_RESTRICTED)
12804 ? FOLD_FLAGS_NOMIX_ASCII
12808 /* Here, <folded> should be the first character of the
12809 * multi-char fold of <value>, with <foldbuf> containing the
12810 * whole thing. But, if this fold is not allowed (because of
12811 * the flags), <fold> will be the same as <value>, and should
12812 * be processed like any other character, so skip the special
12814 if (folded != value) {
12816 /* Skip if we are recursed, currently parsing the class
12817 * again. Otherwise add this character to the list of
12818 * multi-char folds. */
12819 if (! RExC_in_multi_char_class) {
12820 AV** this_array_ptr;
12822 STRLEN cp_count = utf8_length(foldbuf,
12823 foldbuf + foldlen);
12824 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12826 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12829 if (! multi_char_matches) {
12830 multi_char_matches = newAV();
12833 /* <multi_char_matches> is actually an array of arrays.
12834 * There will be one or two top-level elements: [2],
12835 * and/or [3]. The [2] element is an array, each
12836 * element thereof is a character which folds to two
12837 * characters; likewise for [3]. (Unicode guarantees a
12838 * maximum of 3 characters in any fold.) When we
12839 * rewrite the character class below, we will do so
12840 * such that the longest folds are written first, so
12841 * that it prefers the longest matching strings first.
12842 * This is done even if it turns out that any
12843 * quantifier is non-greedy, out of programmer
12844 * laziness. Tom Christiansen has agreed that this is
12845 * ok. This makes the test for the ligature 'ffi' come
12846 * before the test for 'ff' */
12847 if (av_exists(multi_char_matches, cp_count)) {
12848 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12850 this_array = *this_array_ptr;
12853 this_array = newAV();
12854 av_store(multi_char_matches, cp_count,
12857 av_push(this_array, multi_fold);
12860 /* This element should not be processed further in this
12863 value = save_value;
12864 prevvalue = save_prevvalue;
12870 /* Deal with this element of the class */
12873 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12875 UV* this_range = _new_invlist(1);
12876 _append_range_to_invlist(this_range, prevvalue, value);
12878 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12879 * If this range was specified using something like 'i-j', we want
12880 * to include only the 'i' and the 'j', and not anything in
12881 * between, so exclude non-ASCII, non-alphabetics from it.
12882 * However, if the range was specified with something like
12883 * [\x89-\x91] or [\x89-j], all code points within it should be
12884 * included. literal_endpoint==2 means both ends of the range used
12885 * a literal character, not \x{foo} */
12886 if (literal_endpoint == 2
12887 && (prevvalue >= 'a' && value <= 'z')
12888 || (prevvalue >= 'A' && value <= 'Z'))
12890 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12891 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12893 _invlist_union(cp_list, this_range, &cp_list);
12894 literal_endpoint = 0;
12898 range = 0; /* this range (if it was one) is done now */
12899 } /* End of loop through all the text within the brackets */
12901 /* If anything in the class expands to more than one character, we have to
12902 * deal with them by building up a substitute parse string, and recursively
12903 * calling reg() on it, instead of proceeding */
12904 if (multi_char_matches) {
12905 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12908 char *save_end = RExC_end;
12909 char *save_parse = RExC_parse;
12910 bool first_time = TRUE; /* First multi-char occurrence doesn't get
12915 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
12916 because too confusing */
12918 sv_catpv(substitute_parse, "(?:");
12922 /* Look at the longest folds first */
12923 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12925 if (av_exists(multi_char_matches, cp_count)) {
12926 AV** this_array_ptr;
12929 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12931 while ((this_sequence = av_pop(*this_array_ptr)) !=
12934 if (! first_time) {
12935 sv_catpv(substitute_parse, "|");
12937 first_time = FALSE;
12939 sv_catpv(substitute_parse, SvPVX(this_sequence));
12944 /* If the character class contains anything else besides these
12945 * multi-character folds, have to include it in recursive parsing */
12946 if (element_count) {
12947 sv_catpv(substitute_parse, "|[");
12948 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12949 sv_catpv(substitute_parse, "]");
12952 sv_catpv(substitute_parse, ")");
12955 /* This is a way to get the parse to skip forward a whole named
12956 * sequence instead of matching the 2nd character when it fails the
12958 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12962 RExC_parse = SvPV(substitute_parse, len);
12963 RExC_end = RExC_parse + len;
12964 RExC_in_multi_char_class = 1;
12965 RExC_emit = (regnode *)orig_emit;
12967 ret = reg(pRExC_state, 1, ®_flags, depth+1);
12969 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12971 RExC_parse = save_parse;
12972 RExC_end = save_end;
12973 RExC_in_multi_char_class = 0;
12974 SvREFCNT_dec_NN(multi_char_matches);
12975 SvREFCNT_dec_NN(listsv);
12979 /* If the character class contains only a single element, it may be
12980 * optimizable into another node type which is smaller and runs faster.
12981 * Check if this is the case for this class */
12982 if (element_count == 1 && ! ret_invlist) {
12986 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12987 [:digit:] or \p{foo} */
12989 /* All named classes are mapped into POSIXish nodes, with its FLAG
12990 * argument giving which class it is */
12991 switch ((I32)namedclass) {
12992 case ANYOF_UNIPROP:
12995 /* These don't depend on the charset modifiers. They always
12996 * match under /u rules */
12997 case ANYOF_NHORIZWS:
12998 case ANYOF_HORIZWS:
12999 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13002 case ANYOF_NVERTWS:
13007 /* The actual POSIXish node for all the rest depends on the
13008 * charset modifier. The ones in the first set depend only on
13009 * ASCII or, if available on this platform, locale */
13013 op = (LOC) ? POSIXL : POSIXA;
13024 /* under /a could be alpha */
13026 if (ASCII_RESTRICTED) {
13027 namedclass = ANYOF_ALPHA + (namedclass % 2);
13035 /* The rest have more possibilities depending on the charset.
13036 * We take advantage of the enum ordering of the charset
13037 * modifiers to get the exact node type, */
13039 op = POSIXD + get_regex_charset(RExC_flags);
13040 if (op > POSIXA) { /* /aa is same as /a */
13043 #ifndef HAS_ISBLANK
13045 && (namedclass == ANYOF_BLANK
13046 || namedclass == ANYOF_NBLANK))
13053 /* The odd numbered ones are the complements of the
13054 * next-lower even number one */
13055 if (namedclass % 2 == 1) {
13059 arg = namedclass_to_classnum(namedclass);
13063 else if (value == prevvalue) {
13065 /* Here, the class consists of just a single code point */
13068 if (! LOC && value == '\n') {
13069 op = REG_ANY; /* Optimize [^\n] */
13070 *flagp |= HASWIDTH|SIMPLE;
13074 else if (value < 256 || UTF) {
13076 /* Optimize a single value into an EXACTish node, but not if it
13077 * would require converting the pattern to UTF-8. */
13078 op = compute_EXACTish(pRExC_state);
13080 } /* Otherwise is a range */
13081 else if (! LOC) { /* locale could vary these */
13082 if (prevvalue == '0') {
13083 if (value == '9') {
13090 /* Here, we have changed <op> away from its initial value iff we found
13091 * an optimization */
13094 /* Throw away this ANYOF regnode, and emit the calculated one,
13095 * which should correspond to the beginning, not current, state of
13097 const char * cur_parse = RExC_parse;
13098 RExC_parse = (char *)orig_parse;
13102 /* To get locale nodes to not use the full ANYOF size would
13103 * require moving the code above that writes the portions
13104 * of it that aren't in other nodes to after this point.
13105 * e.g. ANYOF_CLASS_SET */
13106 RExC_size = orig_size;
13110 RExC_emit = (regnode *)orig_emit;
13111 if (PL_regkind[op] == POSIXD) {
13113 op += NPOSIXD - POSIXD;
13118 ret = reg_node(pRExC_state, op);
13120 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13124 *flagp |= HASWIDTH|SIMPLE;
13126 else if (PL_regkind[op] == EXACT) {
13127 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13130 RExC_parse = (char *) cur_parse;
13132 SvREFCNT_dec(posixes);
13133 SvREFCNT_dec_NN(listsv);
13134 SvREFCNT_dec(cp_list);
13141 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13143 /* If folding, we calculate all characters that could fold to or from the
13144 * ones already on the list */
13145 if (FOLD && cp_list) {
13146 UV start, end; /* End points of code point ranges */
13148 SV* fold_intersection = NULL;
13150 /* If the highest code point is within Latin1, we can use the
13151 * compiled-in Alphas list, and not have to go out to disk. This
13152 * yields two false positives, the masculine and feminine ordinal
13153 * indicators, which are weeded out below using the
13154 * IS_IN_SOME_FOLD_L1() macro */
13155 if (invlist_highest(cp_list) < 256) {
13156 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13157 &fold_intersection);
13161 /* Here, there are non-Latin1 code points, so we will have to go
13162 * fetch the list of all the characters that participate in folds
13164 if (! PL_utf8_foldable) {
13165 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13166 &PL_sv_undef, 1, 0);
13167 PL_utf8_foldable = _get_swash_invlist(swash);
13168 SvREFCNT_dec_NN(swash);
13171 /* This is a hash that for a particular fold gives all characters
13172 * that are involved in it */
13173 if (! PL_utf8_foldclosures) {
13175 /* If we were unable to find any folds, then we likely won't be
13176 * able to find the closures. So just create an empty list.
13177 * Folding will effectively be restricted to the non-Unicode
13178 * rules hard-coded into Perl. (This case happens legitimately
13179 * during compilation of Perl itself before the Unicode tables
13180 * are generated) */
13181 if (_invlist_len(PL_utf8_foldable) == 0) {
13182 PL_utf8_foldclosures = newHV();
13185 /* If the folds haven't been read in, call a fold function
13187 if (! PL_utf8_tofold) {
13188 U8 dummy[UTF8_MAXBYTES+1];
13190 /* This string is just a short named one above \xff */
13191 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13192 assert(PL_utf8_tofold); /* Verify that worked */
13194 PL_utf8_foldclosures =
13195 _swash_inversion_hash(PL_utf8_tofold);
13199 /* Only the characters in this class that participate in folds need
13200 * be checked. Get the intersection of this class and all the
13201 * possible characters that are foldable. This can quickly narrow
13202 * down a large class */
13203 _invlist_intersection(PL_utf8_foldable, cp_list,
13204 &fold_intersection);
13207 /* Now look at the foldable characters in this class individually */
13208 invlist_iterinit(fold_intersection);
13209 while (invlist_iternext(fold_intersection, &start, &end)) {
13212 /* Locale folding for Latin1 characters is deferred until runtime */
13213 if (LOC && start < 256) {
13217 /* Look at every character in the range */
13218 for (j = start; j <= end; j++) {
13220 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13226 /* We have the latin1 folding rules hard-coded here so that
13227 * an innocent-looking character class, like /[ks]/i won't
13228 * have to go out to disk to find the possible matches.
13229 * XXX It would be better to generate these via regen, in
13230 * case a new version of the Unicode standard adds new
13231 * mappings, though that is not really likely, and may be
13232 * caught by the default: case of the switch below. */
13234 if (IS_IN_SOME_FOLD_L1(j)) {
13236 /* ASCII is always matched; non-ASCII is matched only
13237 * under Unicode rules */
13238 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13240 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13244 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13248 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13249 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13251 /* Certain Latin1 characters have matches outside
13252 * Latin1. To get here, <j> is one of those
13253 * characters. None of these matches is valid for
13254 * ASCII characters under /aa, which is why the 'if'
13255 * just above excludes those. These matches only
13256 * happen when the target string is utf8. The code
13257 * below adds the single fold closures for <j> to the
13258 * inversion list. */
13263 add_cp_to_invlist(cp_list, KELVIN_SIGN);
13267 cp_list = add_cp_to_invlist(cp_list,
13268 LATIN_SMALL_LETTER_LONG_S);
13271 cp_list = add_cp_to_invlist(cp_list,
13272 GREEK_CAPITAL_LETTER_MU);
13273 cp_list = add_cp_to_invlist(cp_list,
13274 GREEK_SMALL_LETTER_MU);
13276 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13277 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13279 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13281 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13282 cp_list = add_cp_to_invlist(cp_list,
13283 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13285 case LATIN_SMALL_LETTER_SHARP_S:
13286 cp_list = add_cp_to_invlist(cp_list,
13287 LATIN_CAPITAL_LETTER_SHARP_S);
13289 case 'F': case 'f':
13290 case 'I': case 'i':
13291 case 'L': case 'l':
13292 case 'T': case 't':
13293 case 'A': case 'a':
13294 case 'H': case 'h':
13295 case 'J': case 'j':
13296 case 'N': case 'n':
13297 case 'W': case 'w':
13298 case 'Y': case 'y':
13299 /* These all are targets of multi-character
13300 * folds from code points that require UTF8 to
13301 * express, so they can't match unless the
13302 * target string is in UTF-8, so no action here
13303 * is necessary, as regexec.c properly handles
13304 * the general case for UTF-8 matching and
13305 * multi-char folds */
13308 /* Use deprecated warning to increase the
13309 * chances of this being output */
13310 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13317 /* Here is an above Latin1 character. We don't have the rules
13318 * hard-coded for it. First, get its fold. This is the simple
13319 * fold, as the multi-character folds have been handled earlier
13320 * and separated out */
13321 _to_uni_fold_flags(j, foldbuf, &foldlen,
13323 ? FOLD_FLAGS_LOCALE
13324 : (ASCII_FOLD_RESTRICTED)
13325 ? FOLD_FLAGS_NOMIX_ASCII
13328 /* Single character fold of above Latin1. Add everything in
13329 * its fold closure to the list that this node should match.
13330 * The fold closures data structure is a hash with the keys
13331 * being the UTF-8 of every character that is folded to, like
13332 * 'k', and the values each an array of all code points that
13333 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
13334 * Multi-character folds are not included */
13335 if ((listp = hv_fetch(PL_utf8_foldclosures,
13336 (char *) foldbuf, foldlen, FALSE)))
13338 AV* list = (AV*) *listp;
13340 for (k = 0; k <= av_len(list); k++) {
13341 SV** c_p = av_fetch(list, k, FALSE);
13344 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13348 /* /aa doesn't allow folds between ASCII and non-; /l
13349 * doesn't allow them between above and below 256 */
13350 if ((ASCII_FOLD_RESTRICTED
13351 && (isASCII(c) != isASCII(j)))
13352 || (LOC && ((c < 256) != (j < 256))))
13357 /* Folds involving non-ascii Latin1 characters
13358 * under /d are added to a separate list */
13359 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13361 cp_list = add_cp_to_invlist(cp_list, c);
13364 depends_list = add_cp_to_invlist(depends_list, c);
13370 SvREFCNT_dec_NN(fold_intersection);
13373 /* And combine the result (if any) with any inversion list from posix
13374 * classes. The lists are kept separate up to now because we don't want to
13375 * fold the classes (folding of those is automatically handled by the swash
13376 * fetching code) */
13378 if (! DEPENDS_SEMANTICS) {
13380 _invlist_union(cp_list, posixes, &cp_list);
13381 SvREFCNT_dec_NN(posixes);
13388 /* Under /d, we put into a separate list the Latin1 things that
13389 * match only when the target string is utf8 */
13390 SV* nonascii_but_latin1_properties = NULL;
13391 _invlist_intersection(posixes, PL_Latin1,
13392 &nonascii_but_latin1_properties);
13393 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13394 &nonascii_but_latin1_properties);
13395 _invlist_subtract(posixes, nonascii_but_latin1_properties,
13398 _invlist_union(cp_list, posixes, &cp_list);
13399 SvREFCNT_dec_NN(posixes);
13405 if (depends_list) {
13406 _invlist_union(depends_list, nonascii_but_latin1_properties,
13408 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13411 depends_list = nonascii_but_latin1_properties;
13416 /* And combine the result (if any) with any inversion list from properties.
13417 * The lists are kept separate up to now so that we can distinguish the two
13418 * in regards to matching above-Unicode. A run-time warning is generated
13419 * if a Unicode property is matched against a non-Unicode code point. But,
13420 * we allow user-defined properties to match anything, without any warning,
13421 * and we also suppress the warning if there is a portion of the character
13422 * class that isn't a Unicode property, and which matches above Unicode, \W
13423 * or [\x{110000}] for example.
13424 * (Note that in this case, unlike the Posix one above, there is no
13425 * <depends_list>, because having a Unicode property forces Unicode
13428 bool warn_super = ! has_user_defined_property;
13431 /* If it matters to the final outcome, see if a non-property
13432 * component of the class matches above Unicode. If so, the
13433 * warning gets suppressed. This is true even if just a single
13434 * such code point is specified, as though not strictly correct if
13435 * another such code point is matched against, the fact that they
13436 * are using above-Unicode code points indicates they should know
13437 * the issues involved */
13439 bool non_prop_matches_above_Unicode =
13440 runtime_posix_matches_above_Unicode
13441 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13443 non_prop_matches_above_Unicode =
13444 ! non_prop_matches_above_Unicode;
13446 warn_super = ! non_prop_matches_above_Unicode;
13449 _invlist_union(properties, cp_list, &cp_list);
13450 SvREFCNT_dec_NN(properties);
13453 cp_list = properties;
13457 OP(ret) = ANYOF_WARN_SUPER;
13461 /* Here, we have calculated what code points should be in the character
13464 * Now we can see about various optimizations. Fold calculation (which we
13465 * did above) needs to take place before inversion. Otherwise /[^k]/i
13466 * would invert to include K, which under /i would match k, which it
13467 * shouldn't. Therefore we can't invert folded locale now, as it won't be
13468 * folded until runtime */
13470 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13471 * at compile time. Besides not inverting folded locale now, we can't
13472 * invert if there are things such as \w, which aren't known until runtime
13475 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13477 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13479 _invlist_invert(cp_list);
13481 /* Any swash can't be used as-is, because we've inverted things */
13483 SvREFCNT_dec_NN(swash);
13487 /* Clear the invert flag since have just done it here */
13492 *ret_invlist = cp_list;
13494 /* Discard the generated node */
13496 RExC_size = orig_size;
13499 RExC_emit = orig_emit;
13504 /* If we didn't do folding, it's because some information isn't available
13505 * until runtime; set the run-time fold flag for these. (We don't have to
13506 * worry about properties folding, as that is taken care of by the swash
13510 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13513 /* Some character classes are equivalent to other nodes. Such nodes take
13514 * up less room and generally fewer operations to execute than ANYOF nodes.
13515 * Above, we checked for and optimized into some such equivalents for
13516 * certain common classes that are easy to test. Getting to this point in
13517 * the code means that the class didn't get optimized there. Since this
13518 * code is only executed in Pass 2, it is too late to save space--it has
13519 * been allocated in Pass 1, and currently isn't given back. But turning
13520 * things into an EXACTish node can allow the optimizer to join it to any
13521 * adjacent such nodes. And if the class is equivalent to things like /./,
13522 * expensive run-time swashes can be avoided. Now that we have more
13523 * complete information, we can find things necessarily missed by the
13524 * earlier code. I (khw) am not sure how much to look for here. It would
13525 * be easy, but perhaps too slow, to check any candidates against all the
13526 * node types they could possibly match using _invlistEQ(). */
13531 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13532 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13535 U8 op = END; /* The optimzation node-type */
13536 const char * cur_parse= RExC_parse;
13538 invlist_iterinit(cp_list);
13539 if (! invlist_iternext(cp_list, &start, &end)) {
13541 /* Here, the list is empty. This happens, for example, when a
13542 * Unicode property is the only thing in the character class, and
13543 * it doesn't match anything. (perluniprops.pod notes such
13546 *flagp |= HASWIDTH|SIMPLE;
13548 else if (start == end) { /* The range is a single code point */
13549 if (! invlist_iternext(cp_list, &start, &end)
13551 /* Don't do this optimization if it would require changing
13552 * the pattern to UTF-8 */
13553 && (start < 256 || UTF))
13555 /* Here, the list contains a single code point. Can optimize
13556 * into an EXACT node */
13565 /* A locale node under folding with one code point can be
13566 * an EXACTFL, as its fold won't be calculated until
13572 /* Here, we are generally folding, but there is only one
13573 * code point to match. If we have to, we use an EXACT
13574 * node, but it would be better for joining with adjacent
13575 * nodes in the optimization pass if we used the same
13576 * EXACTFish node that any such are likely to be. We can
13577 * do this iff the code point doesn't participate in any
13578 * folds. For example, an EXACTF of a colon is the same as
13579 * an EXACT one, since nothing folds to or from a colon. */
13581 if (IS_IN_SOME_FOLD_L1(value)) {
13586 if (! PL_utf8_foldable) {
13587 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13588 &PL_sv_undef, 1, 0);
13589 PL_utf8_foldable = _get_swash_invlist(swash);
13590 SvREFCNT_dec_NN(swash);
13592 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13597 /* If we haven't found the node type, above, it means we
13598 * can use the prevailing one */
13600 op = compute_EXACTish(pRExC_state);
13605 else if (start == 0) {
13606 if (end == UV_MAX) {
13608 *flagp |= HASWIDTH|SIMPLE;
13611 else if (end == '\n' - 1
13612 && invlist_iternext(cp_list, &start, &end)
13613 && start == '\n' + 1 && end == UV_MAX)
13616 *flagp |= HASWIDTH|SIMPLE;
13620 invlist_iterfinish(cp_list);
13623 RExC_parse = (char *)orig_parse;
13624 RExC_emit = (regnode *)orig_emit;
13626 ret = reg_node(pRExC_state, op);
13628 RExC_parse = (char *)cur_parse;
13630 if (PL_regkind[op] == EXACT) {
13631 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13634 SvREFCNT_dec_NN(cp_list);
13635 SvREFCNT_dec_NN(listsv);
13640 /* Here, <cp_list> contains all the code points we can determine at
13641 * compile time that match under all conditions. Go through it, and
13642 * for things that belong in the bitmap, put them there, and delete from
13643 * <cp_list>. While we are at it, see if everything above 255 is in the
13644 * list, and if so, set a flag to speed up execution */
13645 ANYOF_BITMAP_ZERO(ret);
13648 /* This gets set if we actually need to modify things */
13649 bool change_invlist = FALSE;
13653 /* Start looking through <cp_list> */
13654 invlist_iterinit(cp_list);
13655 while (invlist_iternext(cp_list, &start, &end)) {
13659 if (end == UV_MAX && start <= 256) {
13660 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13663 /* Quit if are above what we should change */
13668 change_invlist = TRUE;
13670 /* Set all the bits in the range, up to the max that we are doing */
13671 high = (end < 255) ? end : 255;
13672 for (i = start; i <= (int) high; i++) {
13673 if (! ANYOF_BITMAP_TEST(ret, i)) {
13674 ANYOF_BITMAP_SET(ret, i);
13680 invlist_iterfinish(cp_list);
13682 /* Done with loop; remove any code points that are in the bitmap from
13684 if (change_invlist) {
13685 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13688 /* If have completely emptied it, remove it completely */
13689 if (_invlist_len(cp_list) == 0) {
13690 SvREFCNT_dec_NN(cp_list);
13696 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13699 /* Here, the bitmap has been populated with all the Latin1 code points that
13700 * always match. Can now add to the overall list those that match only
13701 * when the target string is UTF-8 (<depends_list>). */
13702 if (depends_list) {
13704 _invlist_union(cp_list, depends_list, &cp_list);
13705 SvREFCNT_dec_NN(depends_list);
13708 cp_list = depends_list;
13712 /* If there is a swash and more than one element, we can't use the swash in
13713 * the optimization below. */
13714 if (swash && element_count > 1) {
13715 SvREFCNT_dec_NN(swash);
13720 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13722 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13723 SvREFCNT_dec_NN(listsv);
13726 /* av[0] stores the character class description in its textual form:
13727 * used later (regexec.c:Perl_regclass_swash()) to initialize the
13728 * appropriate swash, and is also useful for dumping the regnode.
13729 * av[1] if NULL, is a placeholder to later contain the swash computed
13730 * from av[0]. But if no further computation need be done, the
13731 * swash is stored there now.
13732 * av[2] stores the cp_list inversion list for use in addition or
13733 * instead of av[0]; used only if av[1] is NULL
13734 * av[3] is set if any component of the class is from a user-defined
13735 * property; used only if av[1] is NULL */
13736 AV * const av = newAV();
13739 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13741 : (SvREFCNT_dec_NN(listsv), &PL_sv_undef));
13743 av_store(av, 1, swash);
13744 SvREFCNT_dec_NN(cp_list);
13747 av_store(av, 1, NULL);
13749 av_store(av, 2, cp_list);
13750 av_store(av, 3, newSVuv(has_user_defined_property));
13754 rv = newRV_noinc(MUTABLE_SV(av));
13755 n = add_data(pRExC_state, 1, "s");
13756 RExC_rxi->data->data[n] = (void*)rv;
13760 *flagp |= HASWIDTH|SIMPLE;
13763 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13766 /* reg_skipcomment()
13768 Absorbs an /x style # comments from the input stream.
13769 Returns true if there is more text remaining in the stream.
13770 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13771 terminates the pattern without including a newline.
13773 Note its the callers responsibility to ensure that we are
13774 actually in /x mode
13779 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13783 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13785 while (RExC_parse < RExC_end)
13786 if (*RExC_parse++ == '\n') {
13791 /* we ran off the end of the pattern without ending
13792 the comment, so we have to add an \n when wrapping */
13793 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13801 Advances the parse position, and optionally absorbs
13802 "whitespace" from the inputstream.
13804 Without /x "whitespace" means (?#...) style comments only,
13805 with /x this means (?#...) and # comments and whitespace proper.
13807 Returns the RExC_parse point from BEFORE the scan occurs.
13809 This is the /x friendly way of saying RExC_parse++.
13813 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13815 char* const retval = RExC_parse++;
13817 PERL_ARGS_ASSERT_NEXTCHAR;
13820 if (RExC_end - RExC_parse >= 3
13821 && *RExC_parse == '('
13822 && RExC_parse[1] == '?'
13823 && RExC_parse[2] == '#')
13825 while (*RExC_parse != ')') {
13826 if (RExC_parse == RExC_end)
13827 FAIL("Sequence (?#... not terminated");
13833 if (RExC_flags & RXf_PMf_EXTENDED) {
13834 if (isSPACE(*RExC_parse)) {
13838 else if (*RExC_parse == '#') {
13839 if ( reg_skipcomment( pRExC_state ) )
13848 - reg_node - emit a node
13850 STATIC regnode * /* Location. */
13851 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13855 regnode * const ret = RExC_emit;
13856 GET_RE_DEBUG_FLAGS_DECL;
13858 PERL_ARGS_ASSERT_REG_NODE;
13861 SIZE_ALIGN(RExC_size);
13865 if (RExC_emit >= RExC_emit_bound)
13866 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13867 op, RExC_emit, RExC_emit_bound);
13869 NODE_ALIGN_FILL(ret);
13871 FILL_ADVANCE_NODE(ptr, op);
13872 #ifdef RE_TRACK_PATTERN_OFFSETS
13873 if (RExC_offsets) { /* MJD */
13874 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13875 "reg_node", __LINE__,
13877 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13878 ? "Overwriting end of array!\n" : "OK",
13879 (UV)(RExC_emit - RExC_emit_start),
13880 (UV)(RExC_parse - RExC_start),
13881 (UV)RExC_offsets[0]));
13882 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13890 - reganode - emit a node with an argument
13892 STATIC regnode * /* Location. */
13893 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13897 regnode * const ret = RExC_emit;
13898 GET_RE_DEBUG_FLAGS_DECL;
13900 PERL_ARGS_ASSERT_REGANODE;
13903 SIZE_ALIGN(RExC_size);
13908 assert(2==regarglen[op]+1);
13910 Anything larger than this has to allocate the extra amount.
13911 If we changed this to be:
13913 RExC_size += (1 + regarglen[op]);
13915 then it wouldn't matter. Its not clear what side effect
13916 might come from that so its not done so far.
13921 if (RExC_emit >= RExC_emit_bound)
13922 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13923 op, RExC_emit, RExC_emit_bound);
13925 NODE_ALIGN_FILL(ret);
13927 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13928 #ifdef RE_TRACK_PATTERN_OFFSETS
13929 if (RExC_offsets) { /* MJD */
13930 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13934 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13935 "Overwriting end of array!\n" : "OK",
13936 (UV)(RExC_emit - RExC_emit_start),
13937 (UV)(RExC_parse - RExC_start),
13938 (UV)RExC_offsets[0]));
13939 Set_Cur_Node_Offset;
13947 - reguni - emit (if appropriate) a Unicode character
13950 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13954 PERL_ARGS_ASSERT_REGUNI;
13956 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13960 - reginsert - insert an operator in front of already-emitted operand
13962 * Means relocating the operand.
13965 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13971 const int offset = regarglen[(U8)op];
13972 const int size = NODE_STEP_REGNODE + offset;
13973 GET_RE_DEBUG_FLAGS_DECL;
13975 PERL_ARGS_ASSERT_REGINSERT;
13976 PERL_UNUSED_ARG(depth);
13977 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13978 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13987 if (RExC_open_parens) {
13989 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13990 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13991 if ( RExC_open_parens[paren] >= opnd ) {
13992 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13993 RExC_open_parens[paren] += size;
13995 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13997 if ( RExC_close_parens[paren] >= opnd ) {
13998 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13999 RExC_close_parens[paren] += size;
14001 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14006 while (src > opnd) {
14007 StructCopy(--src, --dst, regnode);
14008 #ifdef RE_TRACK_PATTERN_OFFSETS
14009 if (RExC_offsets) { /* MJD 20010112 */
14010 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14014 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14015 ? "Overwriting end of array!\n" : "OK",
14016 (UV)(src - RExC_emit_start),
14017 (UV)(dst - RExC_emit_start),
14018 (UV)RExC_offsets[0]));
14019 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14020 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14026 place = opnd; /* Op node, where operand used to be. */
14027 #ifdef RE_TRACK_PATTERN_OFFSETS
14028 if (RExC_offsets) { /* MJD */
14029 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14033 (UV)(place - RExC_emit_start) > RExC_offsets[0]
14034 ? "Overwriting end of array!\n" : "OK",
14035 (UV)(place - RExC_emit_start),
14036 (UV)(RExC_parse - RExC_start),
14037 (UV)RExC_offsets[0]));
14038 Set_Node_Offset(place, RExC_parse);
14039 Set_Node_Length(place, 1);
14042 src = NEXTOPER(place);
14043 FILL_ADVANCE_NODE(place, op);
14044 Zero(src, offset, regnode);
14048 - regtail - set the next-pointer at the end of a node chain of p to val.
14049 - SEE ALSO: regtail_study
14051 /* TODO: All three parms should be const */
14053 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14057 GET_RE_DEBUG_FLAGS_DECL;
14059 PERL_ARGS_ASSERT_REGTAIL;
14061 PERL_UNUSED_ARG(depth);
14067 /* Find last node. */
14070 regnode * const temp = regnext(scan);
14072 SV * const mysv=sv_newmortal();
14073 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14074 regprop(RExC_rx, mysv, scan);
14075 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14076 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14077 (temp == NULL ? "->" : ""),
14078 (temp == NULL ? PL_reg_name[OP(val)] : "")
14086 if (reg_off_by_arg[OP(scan)]) {
14087 ARG_SET(scan, val - scan);
14090 NEXT_OFF(scan) = val - scan;
14096 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14097 - Look for optimizable sequences at the same time.
14098 - currently only looks for EXACT chains.
14100 This is experimental code. The idea is to use this routine to perform
14101 in place optimizations on branches and groups as they are constructed,
14102 with the long term intention of removing optimization from study_chunk so
14103 that it is purely analytical.
14105 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14106 to control which is which.
14109 /* TODO: All four parms should be const */
14112 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14117 #ifdef EXPERIMENTAL_INPLACESCAN
14120 GET_RE_DEBUG_FLAGS_DECL;
14122 PERL_ARGS_ASSERT_REGTAIL_STUDY;
14128 /* Find last node. */
14132 regnode * const temp = regnext(scan);
14133 #ifdef EXPERIMENTAL_INPLACESCAN
14134 if (PL_regkind[OP(scan)] == EXACT) {
14135 bool has_exactf_sharp_s; /* Unexamined in this routine */
14136 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14141 switch (OP(scan)) {
14147 case EXACTFU_TRICKYFOLD:
14149 if( exact == PSEUDO )
14151 else if ( exact != OP(scan) )
14160 SV * const mysv=sv_newmortal();
14161 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14162 regprop(RExC_rx, mysv, scan);
14163 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14164 SvPV_nolen_const(mysv),
14165 REG_NODE_NUM(scan),
14166 PL_reg_name[exact]);
14173 SV * const mysv_val=sv_newmortal();
14174 DEBUG_PARSE_MSG("");
14175 regprop(RExC_rx, mysv_val, val);
14176 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14177 SvPV_nolen_const(mysv_val),
14178 (IV)REG_NODE_NUM(val),
14182 if (reg_off_by_arg[OP(scan)]) {
14183 ARG_SET(scan, val - scan);
14186 NEXT_OFF(scan) = val - scan;
14194 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14198 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14204 for (bit=0; bit<32; bit++) {
14205 if (flags & (1<<bit)) {
14206 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14209 if (!set++ && lead)
14210 PerlIO_printf(Perl_debug_log, "%s",lead);
14211 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14214 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14215 if (!set++ && lead) {
14216 PerlIO_printf(Perl_debug_log, "%s",lead);
14219 case REGEX_UNICODE_CHARSET:
14220 PerlIO_printf(Perl_debug_log, "UNICODE");
14222 case REGEX_LOCALE_CHARSET:
14223 PerlIO_printf(Perl_debug_log, "LOCALE");
14225 case REGEX_ASCII_RESTRICTED_CHARSET:
14226 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14228 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14229 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14232 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14238 PerlIO_printf(Perl_debug_log, "\n");
14240 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14246 Perl_regdump(pTHX_ const regexp *r)
14250 SV * const sv = sv_newmortal();
14251 SV *dsv= sv_newmortal();
14252 RXi_GET_DECL(r,ri);
14253 GET_RE_DEBUG_FLAGS_DECL;
14255 PERL_ARGS_ASSERT_REGDUMP;
14257 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14259 /* Header fields of interest. */
14260 if (r->anchored_substr) {
14261 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14262 RE_SV_DUMPLEN(r->anchored_substr), 30);
14263 PerlIO_printf(Perl_debug_log,
14264 "anchored %s%s at %"IVdf" ",
14265 s, RE_SV_TAIL(r->anchored_substr),
14266 (IV)r->anchored_offset);
14267 } else if (r->anchored_utf8) {
14268 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14269 RE_SV_DUMPLEN(r->anchored_utf8), 30);
14270 PerlIO_printf(Perl_debug_log,
14271 "anchored utf8 %s%s at %"IVdf" ",
14272 s, RE_SV_TAIL(r->anchored_utf8),
14273 (IV)r->anchored_offset);
14275 if (r->float_substr) {
14276 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14277 RE_SV_DUMPLEN(r->float_substr), 30);
14278 PerlIO_printf(Perl_debug_log,
14279 "floating %s%s at %"IVdf"..%"UVuf" ",
14280 s, RE_SV_TAIL(r->float_substr),
14281 (IV)r->float_min_offset, (UV)r->float_max_offset);
14282 } else if (r->float_utf8) {
14283 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14284 RE_SV_DUMPLEN(r->float_utf8), 30);
14285 PerlIO_printf(Perl_debug_log,
14286 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14287 s, RE_SV_TAIL(r->float_utf8),
14288 (IV)r->float_min_offset, (UV)r->float_max_offset);
14290 if (r->check_substr || r->check_utf8)
14291 PerlIO_printf(Perl_debug_log,
14293 (r->check_substr == r->float_substr
14294 && r->check_utf8 == r->float_utf8
14295 ? "(checking floating" : "(checking anchored"));
14296 if (r->extflags & RXf_NOSCAN)
14297 PerlIO_printf(Perl_debug_log, " noscan");
14298 if (r->extflags & RXf_CHECK_ALL)
14299 PerlIO_printf(Perl_debug_log, " isall");
14300 if (r->check_substr || r->check_utf8)
14301 PerlIO_printf(Perl_debug_log, ") ");
14303 if (ri->regstclass) {
14304 regprop(r, sv, ri->regstclass);
14305 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14307 if (r->extflags & RXf_ANCH) {
14308 PerlIO_printf(Perl_debug_log, "anchored");
14309 if (r->extflags & RXf_ANCH_BOL)
14310 PerlIO_printf(Perl_debug_log, "(BOL)");
14311 if (r->extflags & RXf_ANCH_MBOL)
14312 PerlIO_printf(Perl_debug_log, "(MBOL)");
14313 if (r->extflags & RXf_ANCH_SBOL)
14314 PerlIO_printf(Perl_debug_log, "(SBOL)");
14315 if (r->extflags & RXf_ANCH_GPOS)
14316 PerlIO_printf(Perl_debug_log, "(GPOS)");
14317 PerlIO_putc(Perl_debug_log, ' ');
14319 if (r->extflags & RXf_GPOS_SEEN)
14320 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14321 if (r->intflags & PREGf_SKIP)
14322 PerlIO_printf(Perl_debug_log, "plus ");
14323 if (r->intflags & PREGf_IMPLICIT)
14324 PerlIO_printf(Perl_debug_log, "implicit ");
14325 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14326 if (r->extflags & RXf_EVAL_SEEN)
14327 PerlIO_printf(Perl_debug_log, "with eval ");
14328 PerlIO_printf(Perl_debug_log, "\n");
14329 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14331 PERL_ARGS_ASSERT_REGDUMP;
14332 PERL_UNUSED_CONTEXT;
14333 PERL_UNUSED_ARG(r);
14334 #endif /* DEBUGGING */
14338 - regprop - printable representation of opcode
14340 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14343 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14344 if (flags & ANYOF_INVERT) \
14345 /*make sure the invert info is in each */ \
14346 sv_catpvs(sv, "^"); \
14352 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14358 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14359 static const char * const anyofs[] = {
14360 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14361 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
14362 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
14363 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
14364 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
14365 || _CC_VERTSPACE != 16
14366 #error Need to adjust order of anyofs[]
14403 RXi_GET_DECL(prog,progi);
14404 GET_RE_DEBUG_FLAGS_DECL;
14406 PERL_ARGS_ASSERT_REGPROP;
14410 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
14411 /* It would be nice to FAIL() here, but this may be called from
14412 regexec.c, and it would be hard to supply pRExC_state. */
14413 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14414 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14416 k = PL_regkind[OP(o)];
14419 sv_catpvs(sv, " ");
14420 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14421 * is a crude hack but it may be the best for now since
14422 * we have no flag "this EXACTish node was UTF-8"
14424 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14425 PERL_PV_ESCAPE_UNI_DETECT |
14426 PERL_PV_ESCAPE_NONASCII |
14427 PERL_PV_PRETTY_ELLIPSES |
14428 PERL_PV_PRETTY_LTGT |
14429 PERL_PV_PRETTY_NOCLEAR
14431 } else if (k == TRIE) {
14432 /* print the details of the trie in dumpuntil instead, as
14433 * progi->data isn't available here */
14434 const char op = OP(o);
14435 const U32 n = ARG(o);
14436 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14437 (reg_ac_data *)progi->data->data[n] :
14439 const reg_trie_data * const trie
14440 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14442 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14443 DEBUG_TRIE_COMPILE_r(
14444 Perl_sv_catpvf(aTHX_ sv,
14445 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14446 (UV)trie->startstate,
14447 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14448 (UV)trie->wordcount,
14451 (UV)TRIE_CHARCOUNT(trie),
14452 (UV)trie->uniquecharcount
14455 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14457 int rangestart = -1;
14458 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14459 sv_catpvs(sv, "[");
14460 for (i = 0; i <= 256; i++) {
14461 if (i < 256 && BITMAP_TEST(bitmap,i)) {
14462 if (rangestart == -1)
14464 } else if (rangestart != -1) {
14465 if (i <= rangestart + 3)
14466 for (; rangestart < i; rangestart++)
14467 put_byte(sv, rangestart);
14469 put_byte(sv, rangestart);
14470 sv_catpvs(sv, "-");
14471 put_byte(sv, i - 1);
14476 sv_catpvs(sv, "]");
14479 } else if (k == CURLY) {
14480 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14481 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14482 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14484 else if (k == WHILEM && o->flags) /* Ordinal/of */
14485 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14486 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14487 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14488 if ( RXp_PAREN_NAMES(prog) ) {
14489 if ( k != REF || (OP(o) < NREF)) {
14490 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14491 SV **name= av_fetch(list, ARG(o), 0 );
14493 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14496 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14497 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14498 I32 *nums=(I32*)SvPVX(sv_dat);
14499 SV **name= av_fetch(list, nums[0], 0 );
14502 for ( n=0; n<SvIVX(sv_dat); n++ ) {
14503 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14504 (n ? "," : ""), (IV)nums[n]);
14506 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14510 } else if (k == GOSUB)
14511 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14512 else if (k == VERB) {
14514 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14515 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14516 } else if (k == LOGICAL)
14517 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14518 else if (k == ANYOF) {
14519 int i, rangestart = -1;
14520 const U8 flags = ANYOF_FLAGS(o);
14524 if (flags & ANYOF_LOCALE)
14525 sv_catpvs(sv, "{loc}");
14526 if (flags & ANYOF_LOC_FOLD)
14527 sv_catpvs(sv, "{i}");
14528 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14529 if (flags & ANYOF_INVERT)
14530 sv_catpvs(sv, "^");
14532 /* output what the standard cp 0-255 bitmap matches */
14533 for (i = 0; i <= 256; i++) {
14534 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14535 if (rangestart == -1)
14537 } else if (rangestart != -1) {
14538 if (i <= rangestart + 3)
14539 for (; rangestart < i; rangestart++)
14540 put_byte(sv, rangestart);
14542 put_byte(sv, rangestart);
14543 sv_catpvs(sv, "-");
14544 put_byte(sv, i - 1);
14551 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14552 /* output any special charclass tests (used entirely under use locale) */
14553 if (ANYOF_CLASS_TEST_ANY_SET(o))
14554 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14555 if (ANYOF_CLASS_TEST(o,i)) {
14556 sv_catpv(sv, anyofs[i]);
14560 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14562 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14563 sv_catpvs(sv, "{non-utf8-latin1-all}");
14566 /* output information about the unicode matching */
14567 if (flags & ANYOF_UNICODE_ALL)
14568 sv_catpvs(sv, "{unicode_all}");
14569 else if (ANYOF_NONBITMAP(o))
14570 sv_catpvs(sv, "{unicode}");
14571 if (flags & ANYOF_NONBITMAP_NON_UTF8)
14572 sv_catpvs(sv, "{outside bitmap}");
14574 if (ANYOF_NONBITMAP(o)) {
14575 SV *lv; /* Set if there is something outside the bit map */
14576 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14577 bool byte_output = FALSE; /* If something in the bitmap has been
14580 if (lv && lv != &PL_sv_undef) {
14582 U8 s[UTF8_MAXBYTES_CASE+1];
14584 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14585 uvchr_to_utf8(s, i);
14588 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
14592 && swash_fetch(sw, s, TRUE))
14594 if (rangestart == -1)
14596 } else if (rangestart != -1) {
14597 byte_output = TRUE;
14598 if (i <= rangestart + 3)
14599 for (; rangestart < i; rangestart++) {
14600 put_byte(sv, rangestart);
14603 put_byte(sv, rangestart);
14604 sv_catpvs(sv, "-");
14613 char *s = savesvpv(lv);
14614 char * const origs = s;
14616 while (*s && *s != '\n')
14620 const char * const t = ++s;
14623 sv_catpvs(sv, " ");
14629 /* Truncate very long output */
14630 if (s - origs > 256) {
14631 Perl_sv_catpvf(aTHX_ sv,
14633 (int) (s - origs - 1),
14639 else if (*s == '\t') {
14654 SvREFCNT_dec_NN(lv);
14658 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14660 else if (k == POSIXD || k == NPOSIXD) {
14661 U8 index = FLAGS(o) * 2;
14662 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14663 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14666 sv_catpv(sv, anyofs[index]);
14669 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14670 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14672 PERL_UNUSED_CONTEXT;
14673 PERL_UNUSED_ARG(sv);
14674 PERL_UNUSED_ARG(o);
14675 PERL_UNUSED_ARG(prog);
14676 #endif /* DEBUGGING */
14680 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14681 { /* Assume that RE_INTUIT is set */
14683 struct regexp *const prog = ReANY(r);
14684 GET_RE_DEBUG_FLAGS_DECL;
14686 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14687 PERL_UNUSED_CONTEXT;
14691 const char * const s = SvPV_nolen_const(prog->check_substr
14692 ? prog->check_substr : prog->check_utf8);
14694 if (!PL_colorset) reginitcolors();
14695 PerlIO_printf(Perl_debug_log,
14696 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14698 prog->check_substr ? "" : "utf8 ",
14699 PL_colors[5],PL_colors[0],
14702 (strlen(s) > 60 ? "..." : ""));
14705 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14711 handles refcounting and freeing the perl core regexp structure. When
14712 it is necessary to actually free the structure the first thing it
14713 does is call the 'free' method of the regexp_engine associated to
14714 the regexp, allowing the handling of the void *pprivate; member
14715 first. (This routine is not overridable by extensions, which is why
14716 the extensions free is called first.)
14718 See regdupe and regdupe_internal if you change anything here.
14720 #ifndef PERL_IN_XSUB_RE
14722 Perl_pregfree(pTHX_ REGEXP *r)
14728 Perl_pregfree2(pTHX_ REGEXP *rx)
14731 struct regexp *const r = ReANY(rx);
14732 GET_RE_DEBUG_FLAGS_DECL;
14734 PERL_ARGS_ASSERT_PREGFREE2;
14736 if (r->mother_re) {
14737 ReREFCNT_dec(r->mother_re);
14739 CALLREGFREE_PVT(rx); /* free the private data */
14740 SvREFCNT_dec(RXp_PAREN_NAMES(r));
14741 Safefree(r->xpv_len_u.xpvlenu_pv);
14744 SvREFCNT_dec(r->anchored_substr);
14745 SvREFCNT_dec(r->anchored_utf8);
14746 SvREFCNT_dec(r->float_substr);
14747 SvREFCNT_dec(r->float_utf8);
14748 Safefree(r->substrs);
14750 RX_MATCH_COPY_FREE(rx);
14751 #ifdef PERL_ANY_COW
14752 SvREFCNT_dec(r->saved_copy);
14755 SvREFCNT_dec(r->qr_anoncv);
14756 rx->sv_u.svu_rx = 0;
14761 This is a hacky workaround to the structural issue of match results
14762 being stored in the regexp structure which is in turn stored in
14763 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14764 could be PL_curpm in multiple contexts, and could require multiple
14765 result sets being associated with the pattern simultaneously, such
14766 as when doing a recursive match with (??{$qr})
14768 The solution is to make a lightweight copy of the regexp structure
14769 when a qr// is returned from the code executed by (??{$qr}) this
14770 lightweight copy doesn't actually own any of its data except for
14771 the starp/end and the actual regexp structure itself.
14777 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14779 struct regexp *ret;
14780 struct regexp *const r = ReANY(rx);
14781 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14783 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14786 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14788 SvOK_off((SV *)ret_x);
14790 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14791 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
14792 made both spots point to the same regexp body.) */
14793 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14794 assert(!SvPVX(ret_x));
14795 ret_x->sv_u.svu_rx = temp->sv_any;
14796 temp->sv_any = NULL;
14797 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14798 SvREFCNT_dec_NN(temp);
14799 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14800 ing below will not set it. */
14801 SvCUR_set(ret_x, SvCUR(rx));
14804 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14805 sv_force_normal(sv) is called. */
14807 ret = ReANY(ret_x);
14809 SvFLAGS(ret_x) |= SvUTF8(rx);
14810 /* We share the same string buffer as the original regexp, on which we
14811 hold a reference count, incremented when mother_re is set below.
14812 The string pointer is copied here, being part of the regexp struct.
14814 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14815 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14817 const I32 npar = r->nparens+1;
14818 Newx(ret->offs, npar, regexp_paren_pair);
14819 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14822 Newx(ret->substrs, 1, struct reg_substr_data);
14823 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14825 SvREFCNT_inc_void(ret->anchored_substr);
14826 SvREFCNT_inc_void(ret->anchored_utf8);
14827 SvREFCNT_inc_void(ret->float_substr);
14828 SvREFCNT_inc_void(ret->float_utf8);
14830 /* check_substr and check_utf8, if non-NULL, point to either their
14831 anchored or float namesakes, and don't hold a second reference. */
14833 RX_MATCH_COPIED_off(ret_x);
14834 #ifdef PERL_ANY_COW
14835 ret->saved_copy = NULL;
14837 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14838 SvREFCNT_inc_void(ret->qr_anoncv);
14844 /* regfree_internal()
14846 Free the private data in a regexp. This is overloadable by
14847 extensions. Perl takes care of the regexp structure in pregfree(),
14848 this covers the *pprivate pointer which technically perl doesn't
14849 know about, however of course we have to handle the
14850 regexp_internal structure when no extension is in use.
14852 Note this is called before freeing anything in the regexp
14857 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14860 struct regexp *const r = ReANY(rx);
14861 RXi_GET_DECL(r,ri);
14862 GET_RE_DEBUG_FLAGS_DECL;
14864 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14870 SV *dsv= sv_newmortal();
14871 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14872 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14873 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14874 PL_colors[4],PL_colors[5],s);
14877 #ifdef RE_TRACK_PATTERN_OFFSETS
14879 Safefree(ri->u.offsets); /* 20010421 MJD */
14881 if (ri->code_blocks) {
14883 for (n = 0; n < ri->num_code_blocks; n++)
14884 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14885 Safefree(ri->code_blocks);
14889 int n = ri->data->count;
14892 /* If you add a ->what type here, update the comment in regcomp.h */
14893 switch (ri->data->what[n]) {
14899 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14902 Safefree(ri->data->data[n]);
14908 { /* Aho Corasick add-on structure for a trie node.
14909 Used in stclass optimization only */
14911 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14913 refcount = --aho->refcount;
14916 PerlMemShared_free(aho->states);
14917 PerlMemShared_free(aho->fail);
14918 /* do this last!!!! */
14919 PerlMemShared_free(ri->data->data[n]);
14920 PerlMemShared_free(ri->regstclass);
14926 /* trie structure. */
14928 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14930 refcount = --trie->refcount;
14933 PerlMemShared_free(trie->charmap);
14934 PerlMemShared_free(trie->states);
14935 PerlMemShared_free(trie->trans);
14937 PerlMemShared_free(trie->bitmap);
14939 PerlMemShared_free(trie->jump);
14940 PerlMemShared_free(trie->wordinfo);
14941 /* do this last!!!! */
14942 PerlMemShared_free(ri->data->data[n]);
14947 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14950 Safefree(ri->data->what);
14951 Safefree(ri->data);
14957 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14958 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14959 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14962 re_dup - duplicate a regexp.
14964 This routine is expected to clone a given regexp structure. It is only
14965 compiled under USE_ITHREADS.
14967 After all of the core data stored in struct regexp is duplicated
14968 the regexp_engine.dupe method is used to copy any private data
14969 stored in the *pprivate pointer. This allows extensions to handle
14970 any duplication it needs to do.
14972 See pregfree() and regfree_internal() if you change anything here.
14974 #if defined(USE_ITHREADS)
14975 #ifndef PERL_IN_XSUB_RE
14977 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14981 const struct regexp *r = ReANY(sstr);
14982 struct regexp *ret = ReANY(dstr);
14984 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14986 npar = r->nparens+1;
14987 Newx(ret->offs, npar, regexp_paren_pair);
14988 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14990 /* no need to copy these */
14991 Newx(ret->swap, npar, regexp_paren_pair);
14994 if (ret->substrs) {
14995 /* Do it this way to avoid reading from *r after the StructCopy().
14996 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14997 cache, it doesn't matter. */
14998 const bool anchored = r->check_substr
14999 ? r->check_substr == r->anchored_substr
15000 : r->check_utf8 == r->anchored_utf8;
15001 Newx(ret->substrs, 1, struct reg_substr_data);
15002 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15004 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15005 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15006 ret->float_substr = sv_dup_inc(ret->float_substr, param);
15007 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15009 /* check_substr and check_utf8, if non-NULL, point to either their
15010 anchored or float namesakes, and don't hold a second reference. */
15012 if (ret->check_substr) {
15014 assert(r->check_utf8 == r->anchored_utf8);
15015 ret->check_substr = ret->anchored_substr;
15016 ret->check_utf8 = ret->anchored_utf8;
15018 assert(r->check_substr == r->float_substr);
15019 assert(r->check_utf8 == r->float_utf8);
15020 ret->check_substr = ret->float_substr;
15021 ret->check_utf8 = ret->float_utf8;
15023 } else if (ret->check_utf8) {
15025 ret->check_utf8 = ret->anchored_utf8;
15027 ret->check_utf8 = ret->float_utf8;
15032 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15033 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15036 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15038 if (RX_MATCH_COPIED(dstr))
15039 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15041 ret->subbeg = NULL;
15042 #ifdef PERL_ANY_COW
15043 ret->saved_copy = NULL;
15046 /* Whether mother_re be set or no, we need to copy the string. We
15047 cannot refrain from copying it when the storage points directly to
15048 our mother regexp, because that's
15049 1: a buffer in a different thread
15050 2: something we no longer hold a reference on
15051 so we need to copy it locally. */
15052 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15053 ret->mother_re = NULL;
15056 #endif /* PERL_IN_XSUB_RE */
15061 This is the internal complement to regdupe() which is used to copy
15062 the structure pointed to by the *pprivate pointer in the regexp.
15063 This is the core version of the extension overridable cloning hook.
15064 The regexp structure being duplicated will be copied by perl prior
15065 to this and will be provided as the regexp *r argument, however
15066 with the /old/ structures pprivate pointer value. Thus this routine
15067 may override any copying normally done by perl.
15069 It returns a pointer to the new regexp_internal structure.
15073 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15076 struct regexp *const r = ReANY(rx);
15077 regexp_internal *reti;
15079 RXi_GET_DECL(r,ri);
15081 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15085 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15086 Copy(ri->program, reti->program, len+1, regnode);
15088 reti->num_code_blocks = ri->num_code_blocks;
15089 if (ri->code_blocks) {
15091 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15092 struct reg_code_block);
15093 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15094 struct reg_code_block);
15095 for (n = 0; n < ri->num_code_blocks; n++)
15096 reti->code_blocks[n].src_regex = (REGEXP*)
15097 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15100 reti->code_blocks = NULL;
15102 reti->regstclass = NULL;
15105 struct reg_data *d;
15106 const int count = ri->data->count;
15109 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15110 char, struct reg_data);
15111 Newx(d->what, count, U8);
15114 for (i = 0; i < count; i++) {
15115 d->what[i] = ri->data->what[i];
15116 switch (d->what[i]) {
15117 /* see also regcomp.h and regfree_internal() */
15118 case 'a': /* actually an AV, but the dup function is identical. */
15122 case 'u': /* actually an HV, but the dup function is identical. */
15123 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15126 /* This is cheating. */
15127 Newx(d->data[i], 1, struct regnode_charclass_class);
15128 StructCopy(ri->data->data[i], d->data[i],
15129 struct regnode_charclass_class);
15130 reti->regstclass = (regnode*)d->data[i];
15133 /* Trie stclasses are readonly and can thus be shared
15134 * without duplication. We free the stclass in pregfree
15135 * when the corresponding reg_ac_data struct is freed.
15137 reti->regstclass= ri->regstclass;
15141 ((reg_trie_data*)ri->data->data[i])->refcount++;
15146 d->data[i] = ri->data->data[i];
15149 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15158 reti->name_list_idx = ri->name_list_idx;
15160 #ifdef RE_TRACK_PATTERN_OFFSETS
15161 if (ri->u.offsets) {
15162 Newx(reti->u.offsets, 2*len+1, U32);
15163 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15166 SetProgLen(reti,len);
15169 return (void*)reti;
15172 #endif /* USE_ITHREADS */
15174 #ifndef PERL_IN_XSUB_RE
15177 - regnext - dig the "next" pointer out of a node
15180 Perl_regnext(pTHX_ regnode *p)
15188 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
15189 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15192 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15201 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15204 STRLEN l1 = strlen(pat1);
15205 STRLEN l2 = strlen(pat2);
15208 const char *message;
15210 PERL_ARGS_ASSERT_RE_CROAK2;
15216 Copy(pat1, buf, l1 , char);
15217 Copy(pat2, buf + l1, l2 , char);
15218 buf[l1 + l2] = '\n';
15219 buf[l1 + l2 + 1] = '\0';
15221 /* ANSI variant takes additional second argument */
15222 va_start(args, pat2);
15226 msv = vmess(buf, &args);
15228 message = SvPV_const(msv,l1);
15231 Copy(message, buf, l1 , char);
15232 buf[l1-1] = '\0'; /* Overwrite \n */
15233 Perl_croak(aTHX_ "%s", buf);
15236 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
15238 #ifndef PERL_IN_XSUB_RE
15240 Perl_save_re_context(pTHX)
15244 struct re_save_state *state;
15246 SAVEVPTR(PL_curcop);
15247 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15249 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15250 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15251 SSPUSHUV(SAVEt_RE_STATE);
15253 Copy(&PL_reg_state, state, 1, struct re_save_state);
15255 PL_reg_oldsaved = NULL;
15256 PL_reg_oldsavedlen = 0;
15257 PL_reg_oldsavedoffset = 0;
15258 PL_reg_oldsavedcoffset = 0;
15259 PL_reg_maxiter = 0;
15260 PL_reg_leftiter = 0;
15261 PL_reg_poscache = NULL;
15262 PL_reg_poscache_size = 0;
15263 #ifdef PERL_ANY_COW
15267 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15269 const REGEXP * const rx = PM_GETRE(PL_curpm);
15272 for (i = 1; i <= RX_NPARENS(rx); i++) {
15273 char digits[TYPE_CHARS(long)];
15274 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15275 GV *const *const gvp
15276 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15279 GV * const gv = *gvp;
15280 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15292 S_put_byte(pTHX_ SV *sv, int c)
15294 PERL_ARGS_ASSERT_PUT_BYTE;
15296 /* Our definition of isPRINT() ignores locales, so only bytes that are
15297 not part of UTF-8 are considered printable. I assume that the same
15298 holds for UTF-EBCDIC.
15299 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15300 which Wikipedia says:
15302 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15303 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15304 identical, to the ASCII delete (DEL) or rubout control character.
15305 ) So the old condition can be simplified to !isPRINT(c) */
15308 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15311 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15315 const char string = c;
15316 if (c == '-' || c == ']' || c == '\\' || c == '^')
15317 sv_catpvs(sv, "\\");
15318 sv_catpvn(sv, &string, 1);
15323 #define CLEAR_OPTSTART \
15324 if (optstart) STMT_START { \
15325 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15329 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15331 STATIC const regnode *
15332 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15333 const regnode *last, const regnode *plast,
15334 SV* sv, I32 indent, U32 depth)
15337 U8 op = PSEUDO; /* Arbitrary non-END op. */
15338 const regnode *next;
15339 const regnode *optstart= NULL;
15341 RXi_GET_DECL(r,ri);
15342 GET_RE_DEBUG_FLAGS_DECL;
15344 PERL_ARGS_ASSERT_DUMPUNTIL;
15346 #ifdef DEBUG_DUMPUNTIL
15347 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15348 last ? last-start : 0,plast ? plast-start : 0);
15351 if (plast && plast < last)
15354 while (PL_regkind[op] != END && (!last || node < last)) {
15355 /* While that wasn't END last time... */
15358 if (op == CLOSE || op == WHILEM)
15360 next = regnext((regnode *)node);
15363 if (OP(node) == OPTIMIZED) {
15364 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15371 regprop(r, sv, node);
15372 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15373 (int)(2*indent + 1), "", SvPVX_const(sv));
15375 if (OP(node) != OPTIMIZED) {
15376 if (next == NULL) /* Next ptr. */
15377 PerlIO_printf(Perl_debug_log, " (0)");
15378 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15379 PerlIO_printf(Perl_debug_log, " (FAIL)");
15381 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15382 (void)PerlIO_putc(Perl_debug_log, '\n');
15386 if (PL_regkind[(U8)op] == BRANCHJ) {
15389 const regnode *nnode = (OP(next) == LONGJMP
15390 ? regnext((regnode *)next)
15392 if (last && nnode > last)
15394 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15397 else if (PL_regkind[(U8)op] == BRANCH) {
15399 DUMPUNTIL(NEXTOPER(node), next);
15401 else if ( PL_regkind[(U8)op] == TRIE ) {
15402 const regnode *this_trie = node;
15403 const char op = OP(node);
15404 const U32 n = ARG(node);
15405 const reg_ac_data * const ac = op>=AHOCORASICK ?
15406 (reg_ac_data *)ri->data->data[n] :
15408 const reg_trie_data * const trie =
15409 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15411 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15413 const regnode *nextbranch= NULL;
15416 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15417 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15419 PerlIO_printf(Perl_debug_log, "%*s%s ",
15420 (int)(2*(indent+3)), "",
15421 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15422 PL_colors[0], PL_colors[1],
15423 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15424 PERL_PV_PRETTY_ELLIPSES |
15425 PERL_PV_PRETTY_LTGT
15430 U16 dist= trie->jump[word_idx+1];
15431 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15432 (UV)((dist ? this_trie + dist : next) - start));
15435 nextbranch= this_trie + trie->jump[0];
15436 DUMPUNTIL(this_trie + dist, nextbranch);
15438 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15439 nextbranch= regnext((regnode *)nextbranch);
15441 PerlIO_printf(Perl_debug_log, "\n");
15444 if (last && next > last)
15449 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
15450 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15451 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15453 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15455 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15457 else if ( op == PLUS || op == STAR) {
15458 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15460 else if (PL_regkind[(U8)op] == ANYOF) {
15461 /* arglen 1 + class block */
15462 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15463 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15464 node = NEXTOPER(node);
15466 else if (PL_regkind[(U8)op] == EXACT) {
15467 /* Literal string, where present. */
15468 node += NODE_SZ_STR(node) - 1;
15469 node = NEXTOPER(node);
15472 node = NEXTOPER(node);
15473 node += regarglen[(U8)op];
15475 if (op == CURLYX || op == OPEN)
15479 #ifdef DEBUG_DUMPUNTIL
15480 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15485 #endif /* DEBUGGING */
15489 * c-indentation-style: bsd
15490 * c-basic-offset: 4
15491 * indent-tabs-mode: nil
15494 * ex: set ts=8 sts=4 sw=4 et: