regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC  static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117     U32         flags;                  /* RXf_* are we folding, multilining? */
 118     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 119     char        *precomp;               /* uncompiled string. */
 120     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 121     regexp      *rx;                    /* perl core regexp structure */
 122     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 123     char        *start;                 /* Start of input for compile */
 124     char        *end;                   /* End of input for compile */
 125     char        *parse;                 /* Input-scan pointer. */
 126     I32         whilem_seen;            /* number of WHILEM in this expr */
 127     regnode     *emit_start;            /* Start of emitted-code area */
 128     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 129     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 130     I32         naughty;                /* How bad is this pattern? */
 131     I32         sawback;                /* Did we see \1, ...? */
 132     U32         seen;
 133     I32         size;                   /* Code size. */
 134     I32         npar;                   /* Capture buffer count, (OPEN). */
 135     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 136     I32         nestroot;               /* root parens we are in - used by accept */
 137     I32         extralen;
 138     I32         seen_zerolen;
 139     regnode     **open_parens;          /* pointers to open parens */
 140     regnode     **close_parens;         /* pointers to close parens */
 141     regnode     *opend;                 /* END node in program */
 142     I32         utf8;           /* whether the pattern is utf8 or not */
 143     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 144                                 /* XXX use this for future optimisation of case
 145                                  * where pattern must be upgraded to utf8. */
 146     I32         uni_semantics;  /* If a d charset modifier should use unicode
 147                                    rules, even if the pattern is not in
 148                                    utf8 */
 149     HV          *paren_names;           /* Paren names */
 150
 151     regnode     **recurse;              /* Recurse regops */
 152     I32         recurse_count;          /* Number of recurse regops */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         override_recoding;
 156     I32         in_multi_char_class;
 157     struct reg_code_block *code_blocks; /* positions of literal (?{})
 158                                             within pattern */
 159     int         num_code_blocks;        /* size of code_blocks[] */
 160     int         code_index;             /* next code_blocks[] slot */
 161 #if ADD_TO_REGEXEC
 162     char        *starttry;              /* -Dr: where regtry was called. */
 163 #define RExC_starttry   (pRExC_state->starttry)
 164 #endif
 165     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 166 #ifdef DEBUGGING
 167     const char  *lastparse;
 168     I32         lastnum;
 169     AV          *paren_name_list;       /* idx -> name */
 170 #define RExC_lastparse  (pRExC_state->lastparse)
 171 #define RExC_lastnum    (pRExC_state->lastnum)
 172 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 173 #endif
 174 } RExC_state_t;
 175
 176 #define RExC_flags      (pRExC_state->flags)
 177 #define RExC_pm_flags   (pRExC_state->pm_flags)
 178 #define RExC_precomp    (pRExC_state->precomp)
 179 #define RExC_rx_sv      (pRExC_state->rx_sv)
 180 #define RExC_rx         (pRExC_state->rx)
 181 #define RExC_rxi        (pRExC_state->rxi)
 182 #define RExC_start      (pRExC_state->start)
 183 #define RExC_end        (pRExC_state->end)
 184 #define RExC_parse      (pRExC_state->parse)
 185 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 186 #ifdef RE_TRACK_PATTERN_OFFSETS
 187 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 188 #endif
 189 #define RExC_emit       (pRExC_state->emit)
 190 #define RExC_emit_start (pRExC_state->emit_start)
 191 #define RExC_emit_bound (pRExC_state->emit_bound)
 192 #define RExC_naughty    (pRExC_state->naughty)
 193 #define RExC_sawback    (pRExC_state->sawback)
 194 #define RExC_seen       (pRExC_state->seen)
 195 #define RExC_size       (pRExC_state->size)
 196 #define RExC_npar       (pRExC_state->npar)
 197 #define RExC_nestroot   (pRExC_state->nestroot)
 198 #define RExC_extralen   (pRExC_state->extralen)
 199 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 200 #define RExC_utf8       (pRExC_state->utf8)
 201 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 202 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 203 #define RExC_open_parens        (pRExC_state->open_parens)
 204 #define RExC_close_parens       (pRExC_state->close_parens)
 205 #define RExC_opend      (pRExC_state->opend)
 206 #define RExC_paren_names        (pRExC_state->paren_names)
 207 #define RExC_recurse    (pRExC_state->recurse)
 208 #define RExC_recurse_count      (pRExC_state->recurse_count)
 209 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 210 #define RExC_contains_locale    (pRExC_state->contains_locale)
 211 #define RExC_override_recoding (pRExC_state->override_recoding)
 212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 213
 214
 215 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 216 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 217         ((*s) == '{' && regcurly(s, FALSE)))
 218
 219 #ifdef SPSTART
 220 #undef SPSTART          /* dratted cpp namespace... */
 221 #endif
 222 /*
 223  * Flags to be passed up and down.
 224  */
 225 #define WORST           0       /* Worst case. */
 226 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 227
 228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 229  * character.  (There needs to be a case: in the switch statement in regexec.c
 230  * for any node marked SIMPLE.)  Note that this is not the same thing as
 231  * REGNODE_SIMPLE */
 232 #define SIMPLE          0x02
 233 #define SPSTART         0x04    /* Starts with * or + */
 234 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 235 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 236
 237 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 238
 239 /* whether trie related optimizations are enabled */
 240 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 241 #define TRIE_STUDY_OPT
 242 #define FULL_TRIE_STUDY
 243 #define TRIE_STCLASS
 244 #endif
 245
 246
 247
 248 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 249 #define PBITVAL(paren) (1 << ((paren) & 7))
 250 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 251 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 252 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 253
 254 /* If not already in utf8, do a longjmp back to the beginning */
 255 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 256 #define REQUIRE_UTF8    STMT_START {                                       \
 257                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 258                         } STMT_END
 259
 260 /* This converts the named class defined in regcomp.h to its equivalent class
 261  * number defined in handy.h. */
 262 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 263 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 264
 265 /* About scan_data_t.
 266
 267   During optimisation we recurse through the regexp program performing
 268   various inplace (keyhole style) optimisations. In addition study_chunk
 269   and scan_commit populate this data structure with information about
 270   what strings MUST appear in the pattern. We look for the longest
 271   string that must appear at a fixed location, and we look for the
 272   longest string that may appear at a floating location. So for instance
 273   in the pattern:
 274
 275     /FOO[xX]A.*B[xX]BAR/
 276
 277   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 278   strings (because they follow a .* construct). study_chunk will identify
 279   both FOO and BAR as being the longest fixed and floating strings respectively.
 280
 281   The strings can be composites, for instance
 282
 283      /(f)(o)(o)/
 284
 285   will result in a composite fixed substring 'foo'.
 286
 287   For each string some basic information is maintained:
 288
 289   - offset or min_offset
 290     This is the position the string must appear at, or not before.
 291     It also implicitly (when combined with minlenp) tells us how many
 292     characters must match before the string we are searching for.
 293     Likewise when combined with minlenp and the length of the string it
 294     tells us how many characters must appear after the string we have
 295     found.
 296
 297   - max_offset
 298     Only used for floating strings. This is the rightmost point that
 299     the string can appear at. If set to I32 max it indicates that the
 300     string can occur infinitely far to the right.
 301
 302   - minlenp
 303     A pointer to the minimum number of characters of the pattern that the
 304     string was found inside. This is important as in the case of positive
 305     lookahead or positive lookbehind we can have multiple patterns
 306     involved. Consider
 307
 308     /(?=FOO).*F/
 309
 310     The minimum length of the pattern overall is 3, the minimum length
 311     of the lookahead part is 3, but the minimum length of the part that
 312     will actually match is 1. So 'FOO's minimum length is 3, but the
 313     minimum length for the F is 1. This is important as the minimum length
 314     is used to determine offsets in front of and behind the string being
 315     looked for.  Since strings can be composites this is the length of the
 316     pattern at the time it was committed with a scan_commit. Note that
 317     the length is calculated by study_chunk, so that the minimum lengths
 318     are not known until the full pattern has been compiled, thus the
 319     pointer to the value.
 320
 321   - lookbehind
 322
 323     In the case of lookbehind the string being searched for can be
 324     offset past the start point of the final matching string.
 325     If this value was just blithely removed from the min_offset it would
 326     invalidate some of the calculations for how many chars must match
 327     before or after (as they are derived from min_offset and minlen and
 328     the length of the string being searched for).
 329     When the final pattern is compiled and the data is moved from the
 330     scan_data_t structure into the regexp structure the information
 331     about lookbehind is factored in, with the information that would
 332     have been lost precalculated in the end_shift field for the
 333     associated string.
 334
 335   The fields pos_min and pos_delta are used to store the minimum offset
 336   and the delta to the maximum offset at the current point in the pattern.
 337
 338 */
 339
 340 typedef struct scan_data_t {
 341     /*I32 len_min;      unused */
 342     /*I32 len_delta;    unused */
 343     I32 pos_min;
 344     I32 pos_delta;
 345     SV *last_found;
 346     I32 last_end;           /* min value, <0 unless valid. */
 347     I32 last_start_min;
 348     I32 last_start_max;
 349     SV **longest;           /* Either &l_fixed, or &l_float. */
 350     SV *longest_fixed;      /* longest fixed string found in pattern */
 351     I32 offset_fixed;       /* offset where it starts */
 352     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 353     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 354     SV *longest_float;      /* longest floating string found in pattern */
 355     I32 offset_float_min;   /* earliest point in string it can appear */
 356     I32 offset_float_max;   /* latest point in string it can appear */
 357     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 358     I32 lookbehind_float;   /* is the position of the string modified by LB */
 359     I32 flags;
 360     I32 whilem_c;
 361     I32 *last_closep;
 362     struct regnode_charclass_class *start_class;
 363 } scan_data_t;
 364
 365 /*
 366  * Forward declarations for pregcomp()'s friends.
 367  */
 368
 369 static const scan_data_t zero_scan_data =
 370   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 371
 372 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 373 #define SF_BEFORE_SEOL          0x0001
 374 #define SF_BEFORE_MEOL          0x0002
 375 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 376 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 377
 378 #ifdef NO_UNARY_PLUS
 379 #  define SF_FIX_SHIFT_EOL      (0+2)
 380 #  define SF_FL_SHIFT_EOL               (0+4)
 381 #else
 382 #  define SF_FIX_SHIFT_EOL      (+2)
 383 #  define SF_FL_SHIFT_EOL               (+4)
 384 #endif
 385
 386 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 387 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 388
 389 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 390 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 391 #define SF_IS_INF               0x0040
 392 #define SF_HAS_PAR              0x0080
 393 #define SF_IN_PAR               0x0100
 394 #define SF_HAS_EVAL             0x0200
 395 #define SCF_DO_SUBSTR           0x0400
 396 #define SCF_DO_STCLASS_AND      0x0800
 397 #define SCF_DO_STCLASS_OR       0x1000
 398 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 399 #define SCF_WHILEM_VISITED_POS  0x2000
 400
 401 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 402 #define SCF_SEEN_ACCEPT         0x8000
 403
 404 #define UTF cBOOL(RExC_utf8)
 405
 406 /* The enums for all these are ordered so things work out correctly */
 407 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 408 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 409 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 410 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 411 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 412 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 413 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 414
 415 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 416
 417 #define OOB_NAMEDCLASS          -1
 418
 419 /* There is no code point that is out-of-bounds, so this is problematic.  But
 420  * its only current use is to initialize a variable that is always set before
 421  * looked at. */
 422 #define OOB_UNICODE             0xDEADBEEF
 423
 424 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 425 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 426
 427
 428 /* length of regex to show in messages that don't mark a position within */
 429 #define RegexLengthToShowInErrorMessages 127
 430
 431 /*
 432  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 433  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 434  * op/pragma/warn/regcomp.
 435  */
 436 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 437 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 438
 439 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 440
 441 /*
 442  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 443  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 444  * "...".
 445  */
 446 #define _FAIL(code) STMT_START {                                        \
 447     const char *ellipses = "";                                          \
 448     IV len = RExC_end - RExC_precomp;                                   \
 449                                                                         \
 450     if (!SIZE_ONLY)                                                     \
 451         SAVEFREESV(RExC_rx_sv);                                         \
 452     if (len > RegexLengthToShowInErrorMessages) {                       \
 453         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 454         len = RegexLengthToShowInErrorMessages - 10;                    \
 455         ellipses = "...";                                               \
 456     }                                                                   \
 457     code;                                                               \
 458 } STMT_END
 459
 460 #define FAIL(msg) _FAIL(                            \
 461     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 462             msg, (int)len, RExC_precomp, ellipses))
 463
 464 #define FAIL2(msg,arg) _FAIL(                       \
 465     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 466             arg, (int)len, RExC_precomp, ellipses))
 467
 468 /*
 469  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 470  */
 471 #define Simple_vFAIL(m) STMT_START {                                    \
 472     const IV offset = RExC_parse - RExC_precomp;                        \
 473     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 474             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 475 } STMT_END
 476
 477 /*
 478  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 479  */
 480 #define vFAIL(m) STMT_START {                           \
 481     if (!SIZE_ONLY)                                     \
 482         SAVEFREESV(RExC_rx_sv);                         \
 483     Simple_vFAIL(m);                                    \
 484 } STMT_END
 485
 486 /*
 487  * Like Simple_vFAIL(), but accepts two arguments.
 488  */
 489 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 490     const IV offset = RExC_parse - RExC_precomp;                        \
 491     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 492             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 493 } STMT_END
 494
 495 /*
 496  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 497  */
 498 #define vFAIL2(m,a1) STMT_START {                       \
 499     if (!SIZE_ONLY)                                     \
 500         SAVEFREESV(RExC_rx_sv);                         \
 501     Simple_vFAIL2(m, a1);                               \
 502 } STMT_END
 503
 504
 505 /*
 506  * Like Simple_vFAIL(), but accepts three arguments.
 507  */
 508 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 509     const IV offset = RExC_parse - RExC_precomp;                \
 510     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 511             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 512 } STMT_END
 513
 514 /*
 515  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 516  */
 517 #define vFAIL3(m,a1,a2) STMT_START {                    \
 518     if (!SIZE_ONLY)                                     \
 519         SAVEFREESV(RExC_rx_sv);                         \
 520     Simple_vFAIL3(m, a1, a2);                           \
 521 } STMT_END
 522
 523 /*
 524  * Like Simple_vFAIL(), but accepts four arguments.
 525  */
 526 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 527     const IV offset = RExC_parse - RExC_precomp;                \
 528     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 529             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 530 } STMT_END
 531
 532 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 533     if (!SIZE_ONLY)                                     \
 534         SAVEFREESV(RExC_rx_sv);                         \
 535     Simple_vFAIL4(m, a1, a2, a3);                       \
 536 } STMT_END
 537
 538 /* m is not necessarily a "literal string", in this macro */
 539 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 540     const IV offset = loc - RExC_precomp;                               \
 541     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 542             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 543 } STMT_END
 544
 545 #define ckWARNreg(loc,m) STMT_START {                                   \
 546     const IV offset = loc - RExC_precomp;                               \
 547     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 548             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 549 } STMT_END
 550
 551 #define vWARN_dep(loc, m) STMT_START {                                  \
 552     const IV offset = loc - RExC_precomp;                               \
 553     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 554             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 555 } STMT_END
 556
 557 #define ckWARNdep(loc,m) STMT_START {                                   \
 558     const IV offset = loc - RExC_precomp;                               \
 559     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 560             m REPORT_LOCATION,                                          \
 561             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 562 } STMT_END
 563
 564 #define ckWARNregdep(loc,m) STMT_START {                                \
 565     const IV offset = loc - RExC_precomp;                               \
 566     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 567             m REPORT_LOCATION,                                          \
 568             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 569 } STMT_END
 570
 571 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 572     const IV offset = loc - RExC_precomp;                               \
 573     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 574             m REPORT_LOCATION,                                          \
 575             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 576 } STMT_END
 577
 578 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 579     const IV offset = loc - RExC_precomp;                               \
 580     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 581             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 582 } STMT_END
 583
 584 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 585     const IV offset = loc - RExC_precomp;                               \
 586     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 587             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 588 } STMT_END
 589
 590 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 591     const IV offset = loc - RExC_precomp;                               \
 592     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 593             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 594 } STMT_END
 595
 596 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 597     const IV offset = loc - RExC_precomp;                               \
 598     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 599             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 600 } STMT_END
 601
 602 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 603     const IV offset = loc - RExC_precomp;                               \
 604     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 605             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 606 } STMT_END
 607
 608 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 609     const IV offset = loc - RExC_precomp;                               \
 610     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 611             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 612 } STMT_END
 613
 614
 615 /* Allow for side effects in s */
 616 #define REGC(c,s) STMT_START {                  \
 617     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 618 } STMT_END
 619
 620 /* Macros for recording node offsets.   20001227 mjd@plover.com
 621  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 622  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 623  * Element 0 holds the number n.
 624  * Position is 1 indexed.
 625  */
 626 #ifndef RE_TRACK_PATTERN_OFFSETS
 627 #define Set_Node_Offset_To_R(node,byte)
 628 #define Set_Node_Offset(node,byte)
 629 #define Set_Cur_Node_Offset
 630 #define Set_Node_Length_To_R(node,len)
 631 #define Set_Node_Length(node,len)
 632 #define Set_Node_Cur_Length(node)
 633 #define Node_Offset(n)
 634 #define Node_Length(n)
 635 #define Set_Node_Offset_Length(node,offset,len)
 636 #define ProgLen(ri) ri->u.proglen
 637 #define SetProgLen(ri,x) ri->u.proglen = x
 638 #else
 639 #define ProgLen(ri) ri->u.offsets[0]
 640 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 641 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 642     if (! SIZE_ONLY) {                                                  \
 643         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 644                     __LINE__, (int)(node), (int)(byte)));               \
 645         if((node) < 0) {                                                \
 646             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 647         } else {                                                        \
 648             RExC_offsets[2*(node)-1] = (byte);                          \
 649         }                                                               \
 650     }                                                                   \
 651 } STMT_END
 652
 653 #define Set_Node_Offset(node,byte) \
 654     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 655 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 656
 657 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 658     if (! SIZE_ONLY) {                                                  \
 659         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 660                 __LINE__, (int)(node), (int)(len)));                    \
 661         if((node) < 0) {                                                \
 662             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 663         } else {                                                        \
 664             RExC_offsets[2*(node)] = (len);                             \
 665         }                                                               \
 666     }                                                                   \
 667 } STMT_END
 668
 669 #define Set_Node_Length(node,len) \
 670     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 671 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 672 #define Set_Node_Cur_Length(node) \
 673     Set_Node_Length(node, RExC_parse - parse_start)
 674
 675 /* Get offsets and lengths */
 676 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 677 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 678
 679 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 680     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 681     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 682 } STMT_END
 683 #endif
 684
 685 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 686 #define EXPERIMENTAL_INPLACESCAN
 687 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 688
 689 #define DEBUG_STUDYDATA(str,data,depth)                              \
 690 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 691     PerlIO_printf(Perl_debug_log,                                    \
 692         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 693         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 694         (int)(depth)*2, "",                                          \
 695         (IV)((data)->pos_min),                                       \
 696         (IV)((data)->pos_delta),                                     \
 697         (UV)((data)->flags),                                         \
 698         (IV)((data)->whilem_c),                                      \
 699         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 700         is_inf ? "INF " : ""                                         \
 701     );                                                               \
 702     if ((data)->last_found)                                          \
 703         PerlIO_printf(Perl_debug_log,                                \
 704             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 705             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 706             SvPVX_const((data)->last_found),                         \
 707             (IV)((data)->last_end),                                  \
 708             (IV)((data)->last_start_min),                            \
 709             (IV)((data)->last_start_max),                            \
 710             ((data)->longest &&                                      \
 711              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 712             SvPVX_const((data)->longest_fixed),                      \
 713             (IV)((data)->offset_fixed),                              \
 714             ((data)->longest &&                                      \
 715              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 716             SvPVX_const((data)->longest_float),                      \
 717             (IV)((data)->offset_float_min),                          \
 718             (IV)((data)->offset_float_max)                           \
 719         );                                                           \
 720     PerlIO_printf(Perl_debug_log,"\n");                              \
 721 });
 722
 723 /* Mark that we cannot extend a found fixed substring at this point.
 724    Update the longest found anchored substring and the longest found
 725    floating substrings if needed. */
 726
 727 STATIC void
 728 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 729 {
 730     const STRLEN l = CHR_SVLEN(data->last_found);
 731     const STRLEN old_l = CHR_SVLEN(*data->longest);
 732     GET_RE_DEBUG_FLAGS_DECL;
 733
 734     PERL_ARGS_ASSERT_SCAN_COMMIT;
 735
 736     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 737         SvSetMagicSV(*data->longest, data->last_found);
 738         if (*data->longest == data->longest_fixed) {
 739             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 740             if (data->flags & SF_BEFORE_EOL)
 741                 data->flags
 742                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 743             else
 744                 data->flags &= ~SF_FIX_BEFORE_EOL;
 745             data->minlen_fixed=minlenp;
 746             data->lookbehind_fixed=0;
 747         }
 748         else { /* *data->longest == data->longest_float */
 749             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 750             data->offset_float_max = (l
 751                                       ? data->last_start_max
 752                                       : data->pos_min + data->pos_delta);
 753             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 754                 data->offset_float_max = I32_MAX;
 755             if (data->flags & SF_BEFORE_EOL)
 756                 data->flags
 757                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 758             else
 759                 data->flags &= ~SF_FL_BEFORE_EOL;
 760             data->minlen_float=minlenp;
 761             data->lookbehind_float=0;
 762         }
 763     }
 764     SvCUR_set(data->last_found, 0);
 765     {
 766         SV * const sv = data->last_found;
 767         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 768             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 769             if (mg)
 770                 mg->mg_len = 0;
 771         }
 772     }
 773     data->last_end = -1;
 774     data->flags &= ~SF_BEFORE_EOL;
 775     DEBUG_STUDYDATA("commit: ",data,0);
 776 }
 777
 778 /* These macros set, clear and test whether the synthetic start class ('ssc',
 779  * given by the parameter) matches an empty string (EOS).  This uses the
 780  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 781  * stands alone, so there is never a next_off, so this field is otherwise
 782  * unused.  The EOS information is used only for compilation, but theoretically
 783  * it could be passed on to the execution code.  This could be used to store
 784  * more than one bit of information, but only this one is currently used. */
 785 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 786 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 787 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 788
 789 /* Can match anything (initialization) */
 790 STATIC void
 791 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 792 {
 793     PERL_ARGS_ASSERT_CL_ANYTHING;
 794
 795     ANYOF_BITMAP_SETALL(cl);
 796     cl->flags = ANYOF_UNICODE_ALL;
 797     SET_SSC_EOS(cl);
 798
 799     /* If any portion of the regex is to operate under locale rules,
 800      * initialization includes it.  The reason this isn't done for all regexes
 801      * is that the optimizer was written under the assumption that locale was
 802      * all-or-nothing.  Given the complexity and lack of documentation in the
 803      * optimizer, and that there are inadequate test cases for locale, so many
 804      * parts of it may not work properly, it is safest to avoid locale unless
 805      * necessary. */
 806     if (RExC_contains_locale) {
 807         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 808         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 809     }
 810     else {
 811         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 812     }
 813 }
 814
 815 /* Can match anything (initialization) */
 816 STATIC int
 817 S_cl_is_anything(const struct regnode_charclass_class *cl)
 818 {
 819     int value;
 820
 821     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 822
 823     for (value = 0; value < ANYOF_MAX; value += 2)
 824         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 825             return 1;
 826     if (!(cl->flags & ANYOF_UNICODE_ALL))
 827         return 0;
 828     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 829         return 0;
 830     return 1;
 831 }
 832
 833 /* Can match anything (initialization) */
 834 STATIC void
 835 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 836 {
 837     PERL_ARGS_ASSERT_CL_INIT;
 838
 839     Zero(cl, 1, struct regnode_charclass_class);
 840     cl->type = ANYOF;
 841     cl_anything(pRExC_state, cl);
 842     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 843 }
 844
 845 /* These two functions currently do the exact same thing */
 846 #define cl_init_zero            S_cl_init
 847
 848 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 849  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 850  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 851 STATIC void
 852 S_cl_and(struct regnode_charclass_class *cl,
 853         const struct regnode_charclass_class *and_with)
 854 {
 855     PERL_ARGS_ASSERT_CL_AND;
 856
 857     assert(PL_regkind[and_with->type] == ANYOF);
 858
 859     /* I (khw) am not sure all these restrictions are necessary XXX */
 860     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 861         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 862         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 863         && !(and_with->flags & ANYOF_LOC_FOLD)
 864         && !(cl->flags & ANYOF_LOC_FOLD)) {
 865         int i;
 866
 867         if (and_with->flags & ANYOF_INVERT)
 868             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 869                 cl->bitmap[i] &= ~and_with->bitmap[i];
 870         else
 871             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 872                 cl->bitmap[i] &= and_with->bitmap[i];
 873     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 874
 875     if (and_with->flags & ANYOF_INVERT) {
 876
 877         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 878          * aren't affected by the inversion.  Those that are affected are
 879          * handled individually below */
 880         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 881         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 882         cl->flags |= affected_flags;
 883
 884         /* We currently don't know how to deal with things that aren't in the
 885          * bitmap, but we know that the intersection is no greater than what
 886          * is already in cl, so let there be false positives that get sorted
 887          * out after the synthetic start class succeeds, and the node is
 888          * matched for real. */
 889
 890         /* The inversion of these two flags indicate that the resulting
 891          * intersection doesn't have them */
 892         if (and_with->flags & ANYOF_UNICODE_ALL) {
 893             cl->flags &= ~ANYOF_UNICODE_ALL;
 894         }
 895         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 896             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 897         }
 898     }
 899     else {   /* and'd node is not inverted */
 900         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 901
 902         if (! ANYOF_NONBITMAP(and_with)) {
 903
 904             /* Here 'and_with' doesn't match anything outside the bitmap
 905              * (except possibly ANYOF_UNICODE_ALL), which means the
 906              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 907              * which case we don't know what the intersection is, but it's no
 908              * greater than what cl already has, so can just leave it alone,
 909              * with possible false positives */
 910             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 911                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 912                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 913             }
 914         }
 915         else if (! ANYOF_NONBITMAP(cl)) {
 916
 917             /* Here, 'and_with' does match something outside the bitmap, and cl
 918              * doesn't have a list of things to match outside the bitmap.  If
 919              * cl can match all code points above 255, the intersection will
 920              * be those above-255 code points that 'and_with' matches.  If cl
 921              * can't match all Unicode code points, it means that it can't
 922              * match anything outside the bitmap (since the 'if' that got us
 923              * into this block tested for that), so we leave the bitmap empty.
 924              */
 925             if (cl->flags & ANYOF_UNICODE_ALL) {
 926                 ARG_SET(cl, ARG(and_with));
 927
 928                 /* and_with's ARG may match things that don't require UTF8.
 929                  * And now cl's will too, in spite of this being an 'and'.  See
 930                  * the comments below about the kludge */
 931                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 932             }
 933         }
 934         else {
 935             /* Here, both 'and_with' and cl match something outside the
 936              * bitmap.  Currently we do not do the intersection, so just match
 937              * whatever cl had at the beginning.  */
 938         }
 939
 940
 941         /* Take the intersection of the two sets of flags.  However, the
 942          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 943          * kludge around the fact that this flag is not treated like the others
 944          * which are initialized in cl_anything().  The way the optimizer works
 945          * is that the synthetic start class (SSC) is initialized to match
 946          * anything, and then the first time a real node is encountered, its
 947          * values are AND'd with the SSC's with the result being the values of
 948          * the real node.  However, there are paths through the optimizer where
 949          * the AND never gets called, so those initialized bits are set
 950          * inappropriately, which is not usually a big deal, as they just cause
 951          * false positives in the SSC, which will just mean a probably
 952          * imperceptible slow down in execution.  However this bit has a
 953          * higher false positive consequence in that it can cause utf8.pm,
 954          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 955          * bigger slowdown and also causes significant extra memory to be used.
 956          * In order to prevent this, the code now takes a different tack.  The
 957          * bit isn't set unless some part of the regular expression needs it,
 958          * but once set it won't get cleared.  This means that these extra
 959          * modules won't get loaded unless there was some path through the
 960          * pattern that would have required them anyway, and  so any false
 961          * positives that occur by not ANDing them out when they could be
 962          * aren't as severe as they would be if we treated this bit like all
 963          * the others */
 964         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 965                                       & ANYOF_NONBITMAP_NON_UTF8;
 966         cl->flags &= and_with->flags;
 967         cl->flags |= outside_bitmap_but_not_utf8;
 968     }
 969 }
 970
 971 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 972  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 973  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 974 STATIC void
 975 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 976 {
 977     PERL_ARGS_ASSERT_CL_OR;
 978
 979     if (or_with->flags & ANYOF_INVERT) {
 980
 981         /* Here, the or'd node is to be inverted.  This means we take the
 982          * complement of everything not in the bitmap, but currently we don't
 983          * know what that is, so give up and match anything */
 984         if (ANYOF_NONBITMAP(or_with)) {
 985             cl_anything(pRExC_state, cl);
 986         }
 987         /* We do not use
 988          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 989          *   <= (B1 | !B2) | (CL1 | !CL2)
 990          * which is wasteful if CL2 is small, but we ignore CL2:
 991          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 992          * XXXX Can we handle case-fold?  Unclear:
 993          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 994          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 995          */
 996         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 997              && !(or_with->flags & ANYOF_LOC_FOLD)
 998              && !(cl->flags & ANYOF_LOC_FOLD) ) {
 999             int i;
1000
1001             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1002                 cl->bitmap[i] |= ~or_with->bitmap[i];
1003         } /* XXXX: logic is complicated otherwise */
1004         else {
1005             cl_anything(pRExC_state, cl);
1006         }
1007
1008         /* And, we can just take the union of the flags that aren't affected
1009          * by the inversion */
1010         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1011
1012         /* For the remaining flags:
1013             ANYOF_UNICODE_ALL and inverted means to not match anything above
1014                     255, which means that the union with cl should just be
1015                     what cl has in it, so can ignore this flag
1016             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1017                     is 127-255 to match them, but then invert that, so the
1018                     union with cl should just be what cl has in it, so can
1019                     ignore this flag
1020          */
1021     } else {    /* 'or_with' is not inverted */
1022         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1023         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1024              && (!(or_with->flags & ANYOF_LOC_FOLD)
1025                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
1026             int i;
1027
1028             /* OR char bitmap and class bitmap separately */
1029             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1030                 cl->bitmap[i] |= or_with->bitmap[i];
1031             ANYOF_CLASS_OR(or_with, cl);
1032         }
1033         else { /* XXXX: logic is complicated, leave it along for a moment. */
1034             cl_anything(pRExC_state, cl);
1035         }
1036
1037         if (ANYOF_NONBITMAP(or_with)) {
1038
1039             /* Use the added node's outside-the-bit-map match if there isn't a
1040              * conflict.  If there is a conflict (both nodes match something
1041              * outside the bitmap, but what they match outside is not the same
1042              * pointer, and hence not easily compared until XXX we extend
1043              * inversion lists this far), give up and allow the start class to
1044              * match everything outside the bitmap.  If that stuff is all above
1045              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1046             if (! ANYOF_NONBITMAP(cl)) {
1047                 ARG_SET(cl, ARG(or_with));
1048             }
1049             else if (ARG(cl) != ARG(or_with)) {
1050
1051                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1052                     cl_anything(pRExC_state, cl);
1053                 }
1054                 else {
1055                     cl->flags |= ANYOF_UNICODE_ALL;
1056                 }
1057             }
1058         }
1059
1060         /* Take the union */
1061         cl->flags |= or_with->flags;
1062     }
1063 }
1064
1065 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1066 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1067 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1068 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1069
1070
1071 #ifdef DEBUGGING
1072 /*
1073    dump_trie(trie,widecharmap,revcharmap)
1074    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1075    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1076
1077    These routines dump out a trie in a somewhat readable format.
1078    The _interim_ variants are used for debugging the interim
1079    tables that are used to generate the final compressed
1080    representation which is what dump_trie expects.
1081
1082    Part of the reason for their existence is to provide a form
1083    of documentation as to how the different representations function.
1084
1085 */
1086
1087 /*
1088   Dumps the final compressed table form of the trie to Perl_debug_log.
1089   Used for debugging make_trie().
1090 */
1091
1092 STATIC void
1093 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1094             AV *revcharmap, U32 depth)
1095 {
1096     U32 state;
1097     SV *sv=sv_newmortal();
1098     int colwidth= widecharmap ? 6 : 4;
1099     U16 word;
1100     GET_RE_DEBUG_FLAGS_DECL;
1101
1102     PERL_ARGS_ASSERT_DUMP_TRIE;
1103
1104     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1105         (int)depth * 2 + 2,"",
1106         "Match","Base","Ofs" );
1107
1108     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1109         SV ** const tmp = av_fetch( revcharmap, state, 0);
1110         if ( tmp ) {
1111             PerlIO_printf( Perl_debug_log, "%*s",
1112                 colwidth,
1113                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1114                             PL_colors[0], PL_colors[1],
1115                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1116                             PERL_PV_ESCAPE_FIRSTCHAR
1117                 )
1118             );
1119         }
1120     }
1121     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1122         (int)depth * 2 + 2,"");
1123
1124     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1125         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1126     PerlIO_printf( Perl_debug_log, "\n");
1127
1128     for( state = 1 ; state < trie->statecount ; state++ ) {
1129         const U32 base = trie->states[ state ].trans.base;
1130
1131         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1132
1133         if ( trie->states[ state ].wordnum ) {
1134             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1135         } else {
1136             PerlIO_printf( Perl_debug_log, "%6s", "" );
1137         }
1138
1139         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1140
1141         if ( base ) {
1142             U32 ofs = 0;
1143
1144             while( ( base + ofs  < trie->uniquecharcount ) ||
1145                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1146                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1147                     ofs++;
1148
1149             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1150
1151             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1152                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1153                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1154                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1155                 {
1156                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1157                     colwidth,
1158                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1159                 } else {
1160                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1161                 }
1162             }
1163
1164             PerlIO_printf( Perl_debug_log, "]");
1165
1166         }
1167         PerlIO_printf( Perl_debug_log, "\n" );
1168     }
1169     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1170     for (word=1; word <= trie->wordcount; word++) {
1171         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1172             (int)word, (int)(trie->wordinfo[word].prev),
1173             (int)(trie->wordinfo[word].len));
1174     }
1175     PerlIO_printf(Perl_debug_log, "\n" );
1176 }
1177 /*
1178   Dumps a fully constructed but uncompressed trie in list form.
1179   List tries normally only are used for construction when the number of
1180   possible chars (trie->uniquecharcount) is very high.
1181   Used for debugging make_trie().
1182 */
1183 STATIC void
1184 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1185                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1186                          U32 depth)
1187 {
1188     U32 state;
1189     SV *sv=sv_newmortal();
1190     int colwidth= widecharmap ? 6 : 4;
1191     GET_RE_DEBUG_FLAGS_DECL;
1192
1193     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1194
1195     /* print out the table precompression.  */
1196     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1197         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1198         "------:-----+-----------------\n" );
1199
1200     for( state=1 ; state < next_alloc ; state ++ ) {
1201         U16 charid;
1202
1203         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1204             (int)depth * 2 + 2,"", (UV)state  );
1205         if ( ! trie->states[ state ].wordnum ) {
1206             PerlIO_printf( Perl_debug_log, "%5s| ","");
1207         } else {
1208             PerlIO_printf( Perl_debug_log, "W%4x| ",
1209                 trie->states[ state ].wordnum
1210             );
1211         }
1212         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1213             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1214             if ( tmp ) {
1215                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1216                     colwidth,
1217                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1218                             PL_colors[0], PL_colors[1],
1219                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1220                             PERL_PV_ESCAPE_FIRSTCHAR
1221                     ) ,
1222                     TRIE_LIST_ITEM(state,charid).forid,
1223                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1224                 );
1225                 if (!(charid % 10))
1226                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1227                         (int)((depth * 2) + 14), "");
1228             }
1229         }
1230         PerlIO_printf( Perl_debug_log, "\n");
1231     }
1232 }
1233
1234 /*
1235   Dumps a fully constructed but uncompressed trie in table form.
1236   This is the normal DFA style state transition table, with a few
1237   twists to facilitate compression later.
1238   Used for debugging make_trie().
1239 */
1240 STATIC void
1241 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1242                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1243                           U32 depth)
1244 {
1245     U32 state;
1246     U16 charid;
1247     SV *sv=sv_newmortal();
1248     int colwidth= widecharmap ? 6 : 4;
1249     GET_RE_DEBUG_FLAGS_DECL;
1250
1251     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1252
1253     /*
1254        print out the table precompression so that we can do a visual check
1255        that they are identical.
1256      */
1257
1258     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1259
1260     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1261         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1262         if ( tmp ) {
1263             PerlIO_printf( Perl_debug_log, "%*s",
1264                 colwidth,
1265                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1266                             PL_colors[0], PL_colors[1],
1267                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1268                             PERL_PV_ESCAPE_FIRSTCHAR
1269                 )
1270             );
1271         }
1272     }
1273
1274     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1275
1276     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1277         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1278     }
1279
1280     PerlIO_printf( Perl_debug_log, "\n" );
1281
1282     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1283
1284         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1285             (int)depth * 2 + 2,"",
1286             (UV)TRIE_NODENUM( state ) );
1287
1288         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1289             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1290             if (v)
1291                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1292             else
1293                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1294         }
1295         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1296             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1297         } else {
1298             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1299             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1300         }
1301     }
1302 }
1303
1304 #endif
1305
1306
1307 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1308   startbranch: the first branch in the whole branch sequence
1309   first      : start branch of sequence of branch-exact nodes.
1310                May be the same as startbranch
1311   last       : Thing following the last branch.
1312                May be the same as tail.
1313   tail       : item following the branch sequence
1314   count      : words in the sequence
1315   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1316   depth      : indent depth
1317
1318 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1319
1320 A trie is an N'ary tree where the branches are determined by digital
1321 decomposition of the key. IE, at the root node you look up the 1st character and
1322 follow that branch repeat until you find the end of the branches. Nodes can be
1323 marked as "accepting" meaning they represent a complete word. Eg:
1324
1325   /he|she|his|hers/
1326
1327 would convert into the following structure. Numbers represent states, letters
1328 following numbers represent valid transitions on the letter from that state, if
1329 the number is in square brackets it represents an accepting state, otherwise it
1330 will be in parenthesis.
1331
1332       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1333       |    |
1334       |   (2)
1335       |    |
1336      (1)   +-i->(6)-+-s->[7]
1337       |
1338       +-s->(3)-+-h->(4)-+-e->[5]
1339
1340       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1341
1342 This shows that when matching against the string 'hers' we will begin at state 1
1343 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1344 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1345 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1346 single traverse. We store a mapping from accepting to state to which word was
1347 matched, and then when we have multiple possibilities we try to complete the
1348 rest of the regex in the order in which they occured in the alternation.
1349
1350 The only prior NFA like behaviour that would be changed by the TRIE support is
1351 the silent ignoring of duplicate alternations which are of the form:
1352
1353  / (DUPE|DUPE) X? (?{ ... }) Y /x
1354
1355 Thus EVAL blocks following a trie may be called a different number of times with
1356 and without the optimisation. With the optimisations dupes will be silently
1357 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1358 the following demonstrates:
1359
1360  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1361
1362 which prints out 'word' three times, but
1363
1364  'words'=~/(word|word|word)(?{ print $1 })S/
1365
1366 which doesnt print it out at all. This is due to other optimisations kicking in.
1367
1368 Example of what happens on a structural level:
1369
1370 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1371
1372    1: CURLYM[1] {1,32767}(18)
1373    5:   BRANCH(8)
1374    6:     EXACT <ac>(16)
1375    8:   BRANCH(11)
1376    9:     EXACT <ad>(16)
1377   11:   BRANCH(14)
1378   12:     EXACT <ab>(16)
1379   16:   SUCCEED(0)
1380   17:   NOTHING(18)
1381   18: END(0)
1382
1383 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1384 and should turn into:
1385
1386    1: CURLYM[1] {1,32767}(18)
1387    5:   TRIE(16)
1388         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1389           <ac>
1390           <ad>
1391           <ab>
1392   16:   SUCCEED(0)
1393   17:   NOTHING(18)
1394   18: END(0)
1395
1396 Cases where tail != last would be like /(?foo|bar)baz/:
1397
1398    1: BRANCH(4)
1399    2:   EXACT <foo>(8)
1400    4: BRANCH(7)
1401    5:   EXACT <bar>(8)
1402    7: TAIL(8)
1403    8: EXACT <baz>(10)
1404   10: END(0)
1405
1406 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1407 and would end up looking like:
1408
1409     1: TRIE(8)
1410       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1411         <foo>
1412         <bar>
1413    7: TAIL(8)
1414    8: EXACT <baz>(10)
1415   10: END(0)
1416
1417     d = uvuni_to_utf8_flags(d, uv, 0);
1418
1419 is the recommended Unicode-aware way of saying
1420
1421     *(d++) = uv;
1422 */
1423
1424 #define TRIE_STORE_REVCHAR(val)                                            \
1425     STMT_START {                                                           \
1426         if (UTF) {                                                         \
1427             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1428             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1429             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1430             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1431             SvPOK_on(zlopp);                                               \
1432             SvUTF8_on(zlopp);                                              \
1433             av_push(revcharmap, zlopp);                                    \
1434         } else {                                                           \
1435             char ooooff = (char)val;                                           \
1436             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1437         }                                                                  \
1438         } STMT_END
1439
1440 #define TRIE_READ_CHAR STMT_START {                                                     \
1441     wordlen++;                                                                          \
1442     if ( UTF ) {                                                                        \
1443         /* if it is UTF then it is either already folded, or does not need folding */   \
1444         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1445     }                                                                                   \
1446     else if (folder == PL_fold_latin1) {                                                \
1447         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1448         if ( foldlen > 0 ) {                                                            \
1449            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1450            foldlen -= len;                                                              \
1451            scan += len;                                                                 \
1452            len = 0;                                                                     \
1453         } else {                                                                        \
1454             len = 1;                                                                    \
1455             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1456             skiplen = UNISKIP(uvc);                                                     \
1457             foldlen -= skiplen;                                                         \
1458             scan = foldbuf + skiplen;                                                   \
1459         }                                                                               \
1460     } else {                                                                            \
1461         /* raw data, will be folded later if needed */                                  \
1462         uvc = (U32)*uc;                                                                 \
1463         len = 1;                                                                        \
1464     }                                                                                   \
1465 } STMT_END
1466
1467
1468
1469 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1470     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1471         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1472         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1473     }                                                           \
1474     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1475     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1476     TRIE_LIST_CUR( state )++;                                   \
1477 } STMT_END
1478
1479 #define TRIE_LIST_NEW(state) STMT_START {                       \
1480     Newxz( trie->states[ state ].trans.list,               \
1481         4, reg_trie_trans_le );                                 \
1482      TRIE_LIST_CUR( state ) = 1;                                \
1483      TRIE_LIST_LEN( state ) = 4;                                \
1484 } STMT_END
1485
1486 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1487     U16 dupe= trie->states[ state ].wordnum;                    \
1488     regnode * const noper_next = regnext( noper );              \
1489                                                                 \
1490     DEBUG_r({                                                   \
1491         /* store the word for dumping */                        \
1492         SV* tmp;                                                \
1493         if (OP(noper) != NOTHING)                               \
1494             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1495         else                                                    \
1496             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1497         av_push( trie_words, tmp );                             \
1498     });                                                         \
1499                                                                 \
1500     curword++;                                                  \
1501     trie->wordinfo[curword].prev   = 0;                         \
1502     trie->wordinfo[curword].len    = wordlen;                   \
1503     trie->wordinfo[curword].accept = state;                     \
1504                                                                 \
1505     if ( noper_next < tail ) {                                  \
1506         if (!trie->jump)                                        \
1507             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1508         trie->jump[curword] = (U16)(noper_next - convert);      \
1509         if (!jumper)                                            \
1510             jumper = noper_next;                                \
1511         if (!nextbranch)                                        \
1512             nextbranch= regnext(cur);                           \
1513     }                                                           \
1514                                                                 \
1515     if ( dupe ) {                                               \
1516         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1517         /* chain, so that when the bits of chain are later    */\
1518         /* linked together, the dups appear in the chain      */\
1519         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1520         trie->wordinfo[dupe].prev = curword;                    \
1521     } else {                                                    \
1522         /* we haven't inserted this word yet.                */ \
1523         trie->states[ state ].wordnum = curword;                \
1524     }                                                           \
1525 } STMT_END
1526
1527
1528 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1529      ( ( base + charid >=  ucharcount                                   \
1530          && base + charid < ubound                                      \
1531          && state == trie->trans[ base - ucharcount + charid ].check    \
1532          && trie->trans[ base - ucharcount + charid ].next )            \
1533            ? trie->trans[ base - ucharcount + charid ].next             \
1534            : ( state==1 ? special : 0 )                                 \
1535       )
1536
1537 #define MADE_TRIE       1
1538 #define MADE_JUMP_TRIE  2
1539 #define MADE_EXACT_TRIE 4
1540
1541 STATIC I32
1542 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1543 {
1544     dVAR;
1545     /* first pass, loop through and scan words */
1546     reg_trie_data *trie;
1547     HV *widecharmap = NULL;
1548     AV *revcharmap = newAV();
1549     regnode *cur;
1550     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1551     STRLEN len = 0;
1552     UV uvc = 0;
1553     U16 curword = 0;
1554     U32 next_alloc = 0;
1555     regnode *jumper = NULL;
1556     regnode *nextbranch = NULL;
1557     regnode *convert = NULL;
1558     U32 *prev_states; /* temp array mapping each state to previous one */
1559     /* we just use folder as a flag in utf8 */
1560     const U8 * folder = NULL;
1561
1562 #ifdef DEBUGGING
1563     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1564     AV *trie_words = NULL;
1565     /* along with revcharmap, this only used during construction but both are
1566      * useful during debugging so we store them in the struct when debugging.
1567      */
1568 #else
1569     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1570     STRLEN trie_charcount=0;
1571 #endif
1572     SV *re_trie_maxbuff;
1573     GET_RE_DEBUG_FLAGS_DECL;
1574
1575     PERL_ARGS_ASSERT_MAKE_TRIE;
1576 #ifndef DEBUGGING
1577     PERL_UNUSED_ARG(depth);
1578 #endif
1579
1580     switch (flags) {
1581         case EXACT: break;
1582         case EXACTFA:
1583         case EXACTFU_SS:
1584         case EXACTFU_TRICKYFOLD:
1585         case EXACTFU: folder = PL_fold_latin1; break;
1586         case EXACTF:  folder = PL_fold; break;
1587         case EXACTFL: folder = PL_fold_locale; break;
1588         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1589     }
1590
1591     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1592     trie->refcount = 1;
1593     trie->startstate = 1;
1594     trie->wordcount = word_count;
1595     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1596     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1597     if (flags == EXACT)
1598         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1599     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1600                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1601
1602     DEBUG_r({
1603         trie_words = newAV();
1604     });
1605
1606     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1607     if (!SvIOK(re_trie_maxbuff)) {
1608         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1609     }
1610     DEBUG_TRIE_COMPILE_r({
1611                 PerlIO_printf( Perl_debug_log,
1612                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1613                   (int)depth * 2 + 2, "",
1614                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1615                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1616                   (int)depth);
1617     });
1618
1619    /* Find the node we are going to overwrite */
1620     if ( first == startbranch && OP( last ) != BRANCH ) {
1621         /* whole branch chain */
1622         convert = first;
1623     } else {
1624         /* branch sub-chain */
1625         convert = NEXTOPER( first );
1626     }
1627
1628     /*  -- First loop and Setup --
1629
1630        We first traverse the branches and scan each word to determine if it
1631        contains widechars, and how many unique chars there are, this is
1632        important as we have to build a table with at least as many columns as we
1633        have unique chars.
1634
1635        We use an array of integers to represent the character codes 0..255
1636        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1637        native representation of the character value as the key and IV's for the
1638        coded index.
1639
1640        *TODO* If we keep track of how many times each character is used we can
1641        remap the columns so that the table compression later on is more
1642        efficient in terms of memory by ensuring the most common value is in the
1643        middle and the least common are on the outside.  IMO this would be better
1644        than a most to least common mapping as theres a decent chance the most
1645        common letter will share a node with the least common, meaning the node
1646        will not be compressible. With a middle is most common approach the worst
1647        case is when we have the least common nodes twice.
1648
1649      */
1650
1651     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1652         regnode *noper = NEXTOPER( cur );
1653         const U8 *uc = (U8*)STRING( noper );
1654         const U8 *e  = uc + STR_LEN( noper );
1655         STRLEN foldlen = 0;
1656         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1657         STRLEN skiplen = 0;
1658         const U8 *scan = (U8*)NULL;
1659         U32 wordlen      = 0;         /* required init */
1660         STRLEN chars = 0;
1661         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1662
1663         if (OP(noper) == NOTHING) {
1664             regnode *noper_next= regnext(noper);
1665             if (noper_next != tail && OP(noper_next) == flags) {
1666                 noper = noper_next;
1667                 uc= (U8*)STRING(noper);
1668                 e= uc + STR_LEN(noper);
1669                 trie->minlen= STR_LEN(noper);
1670             } else {
1671                 trie->minlen= 0;
1672                 continue;
1673             }
1674         }
1675
1676         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1677             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1678                                           regardless of encoding */
1679             if (OP( noper ) == EXACTFU_SS) {
1680                 /* false positives are ok, so just set this */
1681                 TRIE_BITMAP_SET(trie,0xDF);
1682             }
1683         }
1684         for ( ; uc < e ; uc += len ) {
1685             TRIE_CHARCOUNT(trie)++;
1686             TRIE_READ_CHAR;
1687             chars++;
1688             if ( uvc < 256 ) {
1689                 if ( folder ) {
1690                     U8 folded= folder[ (U8) uvc ];
1691                     if ( !trie->charmap[ folded ] ) {
1692                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1693                         TRIE_STORE_REVCHAR( folded );
1694                     }
1695                 }
1696                 if ( !trie->charmap[ uvc ] ) {
1697                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1698                     TRIE_STORE_REVCHAR( uvc );
1699                 }
1700                 if ( set_bit ) {
1701                     /* store the codepoint in the bitmap, and its folded
1702                      * equivalent. */
1703                     TRIE_BITMAP_SET(trie, uvc);
1704
1705                     /* store the folded codepoint */
1706                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1707
1708                     if ( !UTF ) {
1709                         /* store first byte of utf8 representation of
1710                            variant codepoints */
1711                         if (! UNI_IS_INVARIANT(uvc)) {
1712                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1713                         }
1714                     }
1715                     set_bit = 0; /* We've done our bit :-) */
1716                 }
1717             } else {
1718                 SV** svpp;
1719                 if ( !widecharmap )
1720                     widecharmap = newHV();
1721
1722                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1723
1724                 if ( !svpp )
1725                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1726
1727                 if ( !SvTRUE( *svpp ) ) {
1728                     sv_setiv( *svpp, ++trie->uniquecharcount );
1729                     TRIE_STORE_REVCHAR(uvc);
1730                 }
1731             }
1732         }
1733         if( cur == first ) {
1734             trie->minlen = chars;
1735             trie->maxlen = chars;
1736         } else if (chars < trie->minlen) {
1737             trie->minlen = chars;
1738         } else if (chars > trie->maxlen) {
1739             trie->maxlen = chars;
1740         }
1741         if (OP( noper ) == EXACTFU_SS) {
1742             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1743             if (trie->minlen > 1)
1744                 trie->minlen= 1;
1745         }
1746         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1747             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1748              *                - We assume that any such sequence might match a 2 byte string */
1749             if (trie->minlen > 2 )
1750                 trie->minlen= 2;
1751         }
1752
1753     } /* end first pass */
1754     DEBUG_TRIE_COMPILE_r(
1755         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1756                 (int)depth * 2 + 2,"",
1757                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1758                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1759                 (int)trie->minlen, (int)trie->maxlen )
1760     );
1761
1762     /*
1763         We now know what we are dealing with in terms of unique chars and
1764         string sizes so we can calculate how much memory a naive
1765         representation using a flat table  will take. If it's over a reasonable
1766         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1767         conservative but potentially much slower representation using an array
1768         of lists.
1769
1770         At the end we convert both representations into the same compressed
1771         form that will be used in regexec.c for matching with. The latter
1772         is a form that cannot be used to construct with but has memory
1773         properties similar to the list form and access properties similar
1774         to the table form making it both suitable for fast searches and
1775         small enough that its feasable to store for the duration of a program.
1776
1777         See the comment in the code where the compressed table is produced
1778         inplace from the flat tabe representation for an explanation of how
1779         the compression works.
1780
1781     */
1782
1783
1784     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1785     prev_states[1] = 0;
1786
1787     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1788         /*
1789             Second Pass -- Array Of Lists Representation
1790
1791             Each state will be represented by a list of charid:state records
1792             (reg_trie_trans_le) the first such element holds the CUR and LEN
1793             points of the allocated array. (See defines above).
1794
1795             We build the initial structure using the lists, and then convert
1796             it into the compressed table form which allows faster lookups
1797             (but cant be modified once converted).
1798         */
1799
1800         STRLEN transcount = 1;
1801
1802         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1803             "%*sCompiling trie using list compiler\n",
1804             (int)depth * 2 + 2, ""));
1805
1806         trie->states = (reg_trie_state *)
1807             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1808                                   sizeof(reg_trie_state) );
1809         TRIE_LIST_NEW(1);
1810         next_alloc = 2;
1811
1812         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1813
1814             regnode *noper   = NEXTOPER( cur );
1815             U8 *uc           = (U8*)STRING( noper );
1816             const U8 *e      = uc + STR_LEN( noper );
1817             U32 state        = 1;         /* required init */
1818             U16 charid       = 0;         /* sanity init */
1819             U8 *scan         = (U8*)NULL; /* sanity init */
1820             STRLEN foldlen   = 0;         /* required init */
1821             U32 wordlen      = 0;         /* required init */
1822             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1823             STRLEN skiplen   = 0;
1824
1825             if (OP(noper) == NOTHING) {
1826                 regnode *noper_next= regnext(noper);
1827                 if (noper_next != tail && OP(noper_next) == flags) {
1828                     noper = noper_next;
1829                     uc= (U8*)STRING(noper);
1830                     e= uc + STR_LEN(noper);
1831                 }
1832             }
1833
1834             if (OP(noper) != NOTHING) {
1835                 for ( ; uc < e ; uc += len ) {
1836
1837                     TRIE_READ_CHAR;
1838
1839                     if ( uvc < 256 ) {
1840                         charid = trie->charmap[ uvc ];
1841                     } else {
1842                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1843                         if ( !svpp ) {
1844                             charid = 0;
1845                         } else {
1846                             charid=(U16)SvIV( *svpp );
1847                         }
1848                     }
1849                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1850                     if ( charid ) {
1851
1852                         U16 check;
1853                         U32 newstate = 0;
1854
1855                         charid--;
1856                         if ( !trie->states[ state ].trans.list ) {
1857                             TRIE_LIST_NEW( state );
1858                         }
1859                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1860                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1861                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1862                                 break;
1863                             }
1864                         }
1865                         if ( ! newstate ) {
1866                             newstate = next_alloc++;
1867                             prev_states[newstate] = state;
1868                             TRIE_LIST_PUSH( state, charid, newstate );
1869                             transcount++;
1870                         }
1871                         state = newstate;
1872                     } else {
1873                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1874                     }
1875                 }
1876             }
1877             TRIE_HANDLE_WORD(state);
1878
1879         } /* end second pass */
1880
1881         /* next alloc is the NEXT state to be allocated */
1882         trie->statecount = next_alloc;
1883         trie->states = (reg_trie_state *)
1884             PerlMemShared_realloc( trie->states,
1885                                    next_alloc
1886                                    * sizeof(reg_trie_state) );
1887
1888         /* and now dump it out before we compress it */
1889         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1890                                                          revcharmap, next_alloc,
1891                                                          depth+1)
1892         );
1893
1894         trie->trans = (reg_trie_trans *)
1895             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1896         {
1897             U32 state;
1898             U32 tp = 0;
1899             U32 zp = 0;
1900
1901
1902             for( state=1 ; state < next_alloc ; state ++ ) {
1903                 U32 base=0;
1904
1905                 /*
1906                 DEBUG_TRIE_COMPILE_MORE_r(
1907                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1908                 );
1909                 */
1910
1911                 if (trie->states[state].trans.list) {
1912                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1913                     U16 maxid=minid;
1914                     U16 idx;
1915
1916                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1917                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1918                         if ( forid < minid ) {
1919                             minid=forid;
1920                         } else if ( forid > maxid ) {
1921                             maxid=forid;
1922                         }
1923                     }
1924                     if ( transcount < tp + maxid - minid + 1) {
1925                         transcount *= 2;
1926                         trie->trans = (reg_trie_trans *)
1927                             PerlMemShared_realloc( trie->trans,
1928                                                      transcount
1929                                                      * sizeof(reg_trie_trans) );
1930                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1931                     }
1932                     base = trie->uniquecharcount + tp - minid;
1933                     if ( maxid == minid ) {
1934                         U32 set = 0;
1935                         for ( ; zp < tp ; zp++ ) {
1936                             if ( ! trie->trans[ zp ].next ) {
1937                                 base = trie->uniquecharcount + zp - minid;
1938                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1939                                 trie->trans[ zp ].check = state;
1940                                 set = 1;
1941                                 break;
1942                             }
1943                         }
1944                         if ( !set ) {
1945                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1946                             trie->trans[ tp ].check = state;
1947                             tp++;
1948                             zp = tp;
1949                         }
1950                     } else {
1951                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1952                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1953                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1954                             trie->trans[ tid ].check = state;
1955                         }
1956                         tp += ( maxid - minid + 1 );
1957                     }
1958                     Safefree(trie->states[ state ].trans.list);
1959                 }
1960                 /*
1961                 DEBUG_TRIE_COMPILE_MORE_r(
1962                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1963                 );
1964                 */
1965                 trie->states[ state ].trans.base=base;
1966             }
1967             trie->lasttrans = tp + 1;
1968         }
1969     } else {
1970         /*
1971            Second Pass -- Flat Table Representation.
1972
1973            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1974            We know that we will need Charcount+1 trans at most to store the data
1975            (one row per char at worst case) So we preallocate both structures
1976            assuming worst case.
1977
1978            We then construct the trie using only the .next slots of the entry
1979            structs.
1980
1981            We use the .check field of the first entry of the node temporarily to
1982            make compression both faster and easier by keeping track of how many non
1983            zero fields are in the node.
1984
1985            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1986            transition.
1987
1988            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1989            number representing the first entry of the node, and state as a
1990            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1991            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1992            are 2 entrys per node. eg:
1993
1994              A B       A B
1995           1. 2 4    1. 3 7
1996           2. 0 3    3. 0 5
1997           3. 0 0    5. 0 0
1998           4. 0 0    7. 0 0
1999
2000            The table is internally in the right hand, idx form. However as we also
2001            have to deal with the states array which is indexed by nodenum we have to
2002            use TRIE_NODENUM() to convert.
2003
2004         */
2005         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2006             "%*sCompiling trie using table compiler\n",
2007             (int)depth * 2 + 2, ""));
2008
2009         trie->trans = (reg_trie_trans *)
2010             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2011                                   * trie->uniquecharcount + 1,
2012                                   sizeof(reg_trie_trans) );
2013         trie->states = (reg_trie_state *)
2014             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2015                                   sizeof(reg_trie_state) );
2016         next_alloc = trie->uniquecharcount + 1;
2017
2018
2019         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2020
2021             regnode *noper   = NEXTOPER( cur );
2022             const U8 *uc     = (U8*)STRING( noper );
2023             const U8 *e      = uc + STR_LEN( noper );
2024
2025             U32 state        = 1;         /* required init */
2026
2027             U16 charid       = 0;         /* sanity init */
2028             U32 accept_state = 0;         /* sanity init */
2029             U8 *scan         = (U8*)NULL; /* sanity init */
2030
2031             STRLEN foldlen   = 0;         /* required init */
2032             U32 wordlen      = 0;         /* required init */
2033             STRLEN skiplen   = 0;
2034             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2035
2036             if (OP(noper) == NOTHING) {
2037                 regnode *noper_next= regnext(noper);
2038                 if (noper_next != tail && OP(noper_next) == flags) {
2039                     noper = noper_next;
2040                     uc= (U8*)STRING(noper);
2041                     e= uc + STR_LEN(noper);
2042                 }
2043             }
2044
2045             if ( OP(noper) != NOTHING ) {
2046                 for ( ; uc < e ; uc += len ) {
2047
2048                     TRIE_READ_CHAR;
2049
2050                     if ( uvc < 256 ) {
2051                         charid = trie->charmap[ uvc ];
2052                     } else {
2053                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2054                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2055                     }
2056                     if ( charid ) {
2057                         charid--;
2058                         if ( !trie->trans[ state + charid ].next ) {
2059                             trie->trans[ state + charid ].next = next_alloc;
2060                             trie->trans[ state ].check++;
2061                             prev_states[TRIE_NODENUM(next_alloc)]
2062                                     = TRIE_NODENUM(state);
2063                             next_alloc += trie->uniquecharcount;
2064                         }
2065                         state = trie->trans[ state + charid ].next;
2066                     } else {
2067                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2068                     }
2069                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2070                 }
2071             }
2072             accept_state = TRIE_NODENUM( state );
2073             TRIE_HANDLE_WORD(accept_state);
2074
2075         } /* end second pass */
2076
2077         /* and now dump it out before we compress it */
2078         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2079                                                           revcharmap,
2080                                                           next_alloc, depth+1));
2081
2082         {
2083         /*
2084            * Inplace compress the table.*
2085
2086            For sparse data sets the table constructed by the trie algorithm will
2087            be mostly 0/FAIL transitions or to put it another way mostly empty.
2088            (Note that leaf nodes will not contain any transitions.)
2089
2090            This algorithm compresses the tables by eliminating most such
2091            transitions, at the cost of a modest bit of extra work during lookup:
2092
2093            - Each states[] entry contains a .base field which indicates the
2094            index in the state[] array wheres its transition data is stored.
2095
2096            - If .base is 0 there are no valid transitions from that node.
2097
2098            - If .base is nonzero then charid is added to it to find an entry in
2099            the trans array.
2100
2101            -If trans[states[state].base+charid].check!=state then the
2102            transition is taken to be a 0/Fail transition. Thus if there are fail
2103            transitions at the front of the node then the .base offset will point
2104            somewhere inside the previous nodes data (or maybe even into a node
2105            even earlier), but the .check field determines if the transition is
2106            valid.
2107
2108            XXX - wrong maybe?
2109            The following process inplace converts the table to the compressed
2110            table: We first do not compress the root node 1,and mark all its
2111            .check pointers as 1 and set its .base pointer as 1 as well. This
2112            allows us to do a DFA construction from the compressed table later,
2113            and ensures that any .base pointers we calculate later are greater
2114            than 0.
2115
2116            - We set 'pos' to indicate the first entry of the second node.
2117
2118            - We then iterate over the columns of the node, finding the first and
2119            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2120            and set the .check pointers accordingly, and advance pos
2121            appropriately and repreat for the next node. Note that when we copy
2122            the next pointers we have to convert them from the original
2123            NODEIDX form to NODENUM form as the former is not valid post
2124            compression.
2125
2126            - If a node has no transitions used we mark its base as 0 and do not
2127            advance the pos pointer.
2128
2129            - If a node only has one transition we use a second pointer into the
2130            structure to fill in allocated fail transitions from other states.
2131            This pointer is independent of the main pointer and scans forward
2132            looking for null transitions that are allocated to a state. When it
2133            finds one it writes the single transition into the "hole".  If the
2134            pointer doesnt find one the single transition is appended as normal.
2135
2136            - Once compressed we can Renew/realloc the structures to release the
2137            excess space.
2138
2139            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2140            specifically Fig 3.47 and the associated pseudocode.
2141
2142            demq
2143         */
2144         const U32 laststate = TRIE_NODENUM( next_alloc );
2145         U32 state, charid;
2146         U32 pos = 0, zp=0;
2147         trie->statecount = laststate;
2148
2149         for ( state = 1 ; state < laststate ; state++ ) {
2150             U8 flag = 0;
2151             const U32 stateidx = TRIE_NODEIDX( state );
2152             const U32 o_used = trie->trans[ stateidx ].check;
2153             U32 used = trie->trans[ stateidx ].check;
2154             trie->trans[ stateidx ].check = 0;
2155
2156             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2157                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2158                     if ( trie->trans[ stateidx + charid ].next ) {
2159                         if (o_used == 1) {
2160                             for ( ; zp < pos ; zp++ ) {
2161                                 if ( ! trie->trans[ zp ].next ) {
2162                                     break;
2163                                 }
2164                             }
2165                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2166                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2167                             trie->trans[ zp ].check = state;
2168                             if ( ++zp > pos ) pos = zp;
2169                             break;
2170                         }
2171                         used--;
2172                     }
2173                     if ( !flag ) {
2174                         flag = 1;
2175                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2176                     }
2177                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2178                     trie->trans[ pos ].check = state;
2179                     pos++;
2180                 }
2181             }
2182         }
2183         trie->lasttrans = pos + 1;
2184         trie->states = (reg_trie_state *)
2185             PerlMemShared_realloc( trie->states, laststate
2186                                    * sizeof(reg_trie_state) );
2187         DEBUG_TRIE_COMPILE_MORE_r(
2188                 PerlIO_printf( Perl_debug_log,
2189                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2190                     (int)depth * 2 + 2,"",
2191                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2192                     (IV)next_alloc,
2193                     (IV)pos,
2194                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2195             );
2196
2197         } /* end table compress */
2198     }
2199     DEBUG_TRIE_COMPILE_MORE_r(
2200             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2201                 (int)depth * 2 + 2, "",
2202                 (UV)trie->statecount,
2203                 (UV)trie->lasttrans)
2204     );
2205     /* resize the trans array to remove unused space */
2206     trie->trans = (reg_trie_trans *)
2207         PerlMemShared_realloc( trie->trans, trie->lasttrans
2208                                * sizeof(reg_trie_trans) );
2209
2210     {   /* Modify the program and insert the new TRIE node */
2211         U8 nodetype =(U8)(flags & 0xFF);
2212         char *str=NULL;
2213
2214 #ifdef DEBUGGING
2215         regnode *optimize = NULL;
2216 #ifdef RE_TRACK_PATTERN_OFFSETS
2217
2218         U32 mjd_offset = 0;
2219         U32 mjd_nodelen = 0;
2220 #endif /* RE_TRACK_PATTERN_OFFSETS */
2221 #endif /* DEBUGGING */
2222         /*
2223            This means we convert either the first branch or the first Exact,
2224            depending on whether the thing following (in 'last') is a branch
2225            or not and whther first is the startbranch (ie is it a sub part of
2226            the alternation or is it the whole thing.)
2227            Assuming its a sub part we convert the EXACT otherwise we convert
2228            the whole branch sequence, including the first.
2229          */
2230         /* Find the node we are going to overwrite */
2231         if ( first != startbranch || OP( last ) == BRANCH ) {
2232             /* branch sub-chain */
2233             NEXT_OFF( first ) = (U16)(last - first);
2234 #ifdef RE_TRACK_PATTERN_OFFSETS
2235             DEBUG_r({
2236                 mjd_offset= Node_Offset((convert));
2237                 mjd_nodelen= Node_Length((convert));
2238             });
2239 #endif
2240             /* whole branch chain */
2241         }
2242 #ifdef RE_TRACK_PATTERN_OFFSETS
2243         else {
2244             DEBUG_r({
2245                 const  regnode *nop = NEXTOPER( convert );
2246                 mjd_offset= Node_Offset((nop));
2247                 mjd_nodelen= Node_Length((nop));
2248             });
2249         }
2250         DEBUG_OPTIMISE_r(
2251             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2252                 (int)depth * 2 + 2, "",
2253                 (UV)mjd_offset, (UV)mjd_nodelen)
2254         );
2255 #endif
2256         /* But first we check to see if there is a common prefix we can
2257            split out as an EXACT and put in front of the TRIE node.  */
2258         trie->startstate= 1;
2259         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2260             U32 state;
2261             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2262                 U32 ofs = 0;
2263                 I32 idx = -1;
2264                 U32 count = 0;
2265                 const U32 base = trie->states[ state ].trans.base;
2266
2267                 if ( trie->states[state].wordnum )
2268                         count = 1;
2269
2270                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2271                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2272                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2273                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2274                     {
2275                         if ( ++count > 1 ) {
2276                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2277                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2278                             if ( state == 1 ) break;
2279                             if ( count == 2 ) {
2280                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2281                                 DEBUG_OPTIMISE_r(
2282                                     PerlIO_printf(Perl_debug_log,
2283                                         "%*sNew Start State=%"UVuf" Class: [",
2284                                         (int)depth * 2 + 2, "",
2285                                         (UV)state));
2286                                 if (idx >= 0) {
2287                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2288                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2289
2290                                     TRIE_BITMAP_SET(trie,*ch);
2291                                     if ( folder )
2292                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2293                                     DEBUG_OPTIMISE_r(
2294                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2295                                     );
2296                                 }
2297                             }
2298                             TRIE_BITMAP_SET(trie,*ch);
2299                             if ( folder )
2300                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2301                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2302                         }
2303                         idx = ofs;
2304                     }
2305                 }
2306                 if ( count == 1 ) {
2307                     SV **tmp = av_fetch( revcharmap, idx, 0);
2308                     STRLEN len;
2309                     char *ch = SvPV( *tmp, len );
2310                     DEBUG_OPTIMISE_r({
2311                         SV *sv=sv_newmortal();
2312                         PerlIO_printf( Perl_debug_log,
2313                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2314                             (int)depth * 2 + 2, "",
2315                             (UV)state, (UV)idx,
2316                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2317                                 PL_colors[0], PL_colors[1],
2318                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2319                                 PERL_PV_ESCAPE_FIRSTCHAR
2320                             )
2321                         );
2322                     });
2323                     if ( state==1 ) {
2324                         OP( convert ) = nodetype;
2325                         str=STRING(convert);
2326                         STR_LEN(convert)=0;
2327                     }
2328                     STR_LEN(convert) += len;
2329                     while (len--)
2330                         *str++ = *ch++;
2331                 } else {
2332 #ifdef DEBUGGING
2333                     if (state>1)
2334                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2335 #endif
2336                     break;
2337                 }
2338             }
2339             trie->prefixlen = (state-1);
2340             if (str) {
2341                 regnode *n = convert+NODE_SZ_STR(convert);
2342                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2343                 trie->startstate = state;
2344                 trie->minlen -= (state - 1);
2345                 trie->maxlen -= (state - 1);
2346 #ifdef DEBUGGING
2347                /* At least the UNICOS C compiler choked on this
2348                 * being argument to DEBUG_r(), so let's just have
2349                 * it right here. */
2350                if (
2351 #ifdef PERL_EXT_RE_BUILD
2352                    1
2353 #else
2354                    DEBUG_r_TEST
2355 #endif
2356                    ) {
2357                    regnode *fix = convert;
2358                    U32 word = trie->wordcount;
2359                    mjd_nodelen++;
2360                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2361                    while( ++fix < n ) {
2362                        Set_Node_Offset_Length(fix, 0, 0);
2363                    }
2364                    while (word--) {
2365                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2366                        if (tmp) {
2367                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2368                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2369                            else
2370                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2371                        }
2372                    }
2373                }
2374 #endif
2375                 if (trie->maxlen) {
2376                     convert = n;
2377                 } else {
2378                     NEXT_OFF(convert) = (U16)(tail - convert);
2379                     DEBUG_r(optimize= n);
2380                 }
2381             }
2382         }
2383         if (!jumper)
2384             jumper = last;
2385         if ( trie->maxlen ) {
2386             NEXT_OFF( convert ) = (U16)(tail - convert);
2387             ARG_SET( convert, data_slot );
2388             /* Store the offset to the first unabsorbed branch in
2389                jump[0], which is otherwise unused by the jump logic.
2390                We use this when dumping a trie and during optimisation. */
2391             if (trie->jump)
2392                 trie->jump[0] = (U16)(nextbranch - convert);
2393
2394             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2395              *   and there is a bitmap
2396              *   and the first "jump target" node we found leaves enough room
2397              * then convert the TRIE node into a TRIEC node, with the bitmap
2398              * embedded inline in the opcode - this is hypothetically faster.
2399              */
2400             if ( !trie->states[trie->startstate].wordnum
2401                  && trie->bitmap
2402                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2403             {
2404                 OP( convert ) = TRIEC;
2405                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2406                 PerlMemShared_free(trie->bitmap);
2407                 trie->bitmap= NULL;
2408             } else
2409                 OP( convert ) = TRIE;
2410
2411             /* store the type in the flags */
2412             convert->flags = nodetype;
2413             DEBUG_r({
2414             optimize = convert
2415                       + NODE_STEP_REGNODE
2416                       + regarglen[ OP( convert ) ];
2417             });
2418             /* XXX We really should free up the resource in trie now,
2419                    as we won't use them - (which resources?) dmq */
2420         }
2421         /* needed for dumping*/
2422         DEBUG_r(if (optimize) {
2423             regnode *opt = convert;
2424
2425             while ( ++opt < optimize) {
2426                 Set_Node_Offset_Length(opt,0,0);
2427             }
2428             /*
2429                 Try to clean up some of the debris left after the
2430                 optimisation.
2431              */
2432             while( optimize < jumper ) {
2433                 mjd_nodelen += Node_Length((optimize));
2434                 OP( optimize ) = OPTIMIZED;
2435                 Set_Node_Offset_Length(optimize,0,0);
2436                 optimize++;
2437             }
2438             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2439         });
2440     } /* end node insert */
2441
2442     /*  Finish populating the prev field of the wordinfo array.  Walk back
2443      *  from each accept state until we find another accept state, and if
2444      *  so, point the first word's .prev field at the second word. If the
2445      *  second already has a .prev field set, stop now. This will be the
2446      *  case either if we've already processed that word's accept state,
2447      *  or that state had multiple words, and the overspill words were
2448      *  already linked up earlier.
2449      */
2450     {
2451         U16 word;
2452         U32 state;
2453         U16 prev;
2454
2455         for (word=1; word <= trie->wordcount; word++) {
2456             prev = 0;
2457             if (trie->wordinfo[word].prev)
2458                 continue;
2459             state = trie->wordinfo[word].accept;
2460             while (state) {
2461                 state = prev_states[state];
2462                 if (!state)
2463                     break;
2464                 prev = trie->states[state].wordnum;
2465                 if (prev)
2466                     break;
2467             }
2468             trie->wordinfo[word].prev = prev;
2469         }
2470         Safefree(prev_states);
2471     }
2472
2473
2474     /* and now dump out the compressed format */
2475     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2476
2477     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2478 #ifdef DEBUGGING
2479     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2480     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2481 #else
2482     SvREFCNT_dec_NN(revcharmap);
2483 #endif
2484     return trie->jump
2485            ? MADE_JUMP_TRIE
2486            : trie->startstate>1
2487              ? MADE_EXACT_TRIE
2488              : MADE_TRIE;
2489 }
2490
2491 STATIC void
2492 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2493 {
2494 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2495
2496    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2497    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2498    ISBN 0-201-10088-6
2499
2500    We find the fail state for each state in the trie, this state is the longest proper
2501    suffix of the current state's 'word' that is also a proper prefix of another word in our
2502    trie. State 1 represents the word '' and is thus the default fail state. This allows
2503    the DFA not to have to restart after its tried and failed a word at a given point, it
2504    simply continues as though it had been matching the other word in the first place.
2505    Consider
2506       'abcdgu'=~/abcdefg|cdgu/
2507    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2508    fail, which would bring us to the state representing 'd' in the second word where we would
2509    try 'g' and succeed, proceeding to match 'cdgu'.
2510  */
2511  /* add a fail transition */
2512     const U32 trie_offset = ARG(source);
2513     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2514     U32 *q;
2515     const U32 ucharcount = trie->uniquecharcount;
2516     const U32 numstates = trie->statecount;
2517     const U32 ubound = trie->lasttrans + ucharcount;
2518     U32 q_read = 0;
2519     U32 q_write = 0;
2520     U32 charid;
2521     U32 base = trie->states[ 1 ].trans.base;
2522     U32 *fail;
2523     reg_ac_data *aho;
2524     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2525     GET_RE_DEBUG_FLAGS_DECL;
2526
2527     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2528 #ifndef DEBUGGING
2529     PERL_UNUSED_ARG(depth);
2530 #endif
2531
2532
2533     ARG_SET( stclass, data_slot );
2534     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2535     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2536     aho->trie=trie_offset;
2537     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2538     Copy( trie->states, aho->states, numstates, reg_trie_state );
2539     Newxz( q, numstates, U32);
2540     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2541     aho->refcount = 1;
2542     fail = aho->fail;
2543     /* initialize fail[0..1] to be 1 so that we always have
2544        a valid final fail state */
2545     fail[ 0 ] = fail[ 1 ] = 1;
2546
2547     for ( charid = 0; charid < ucharcount ; charid++ ) {
2548         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2549         if ( newstate ) {
2550             q[ q_write ] = newstate;
2551             /* set to point at the root */
2552             fail[ q[ q_write++ ] ]=1;
2553         }
2554     }
2555     while ( q_read < q_write) {
2556         const U32 cur = q[ q_read++ % numstates ];
2557         base = trie->states[ cur ].trans.base;
2558
2559         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2560             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2561             if (ch_state) {
2562                 U32 fail_state = cur;
2563                 U32 fail_base;
2564                 do {
2565                     fail_state = fail[ fail_state ];
2566                     fail_base = aho->states[ fail_state ].trans.base;
2567                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2568
2569                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2570                 fail[ ch_state ] = fail_state;
2571                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2572                 {
2573                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2574                 }
2575                 q[ q_write++ % numstates] = ch_state;
2576             }
2577         }
2578     }
2579     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2580        when we fail in state 1, this allows us to use the
2581        charclass scan to find a valid start char. This is based on the principle
2582        that theres a good chance the string being searched contains lots of stuff
2583        that cant be a start char.
2584      */
2585     fail[ 0 ] = fail[ 1 ] = 0;
2586     DEBUG_TRIE_COMPILE_r({
2587         PerlIO_printf(Perl_debug_log,
2588                       "%*sStclass Failtable (%"UVuf" states): 0",
2589                       (int)(depth * 2), "", (UV)numstates
2590         );
2591         for( q_read=1; q_read<numstates; q_read++ ) {
2592             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2593         }
2594         PerlIO_printf(Perl_debug_log, "\n");
2595     });
2596     Safefree(q);
2597     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2598 }
2599
2600
2601 /*
2602  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2603  * These need to be revisited when a newer toolchain becomes available.
2604  */
2605 #if defined(__sparc64__) && defined(__GNUC__)
2606 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2607 #       undef  SPARC64_GCC_WORKAROUND
2608 #       define SPARC64_GCC_WORKAROUND 1
2609 #   endif
2610 #endif
2611
2612 #define DEBUG_PEEP(str,scan,depth) \
2613     DEBUG_OPTIMISE_r({if (scan){ \
2614        SV * const mysv=sv_newmortal(); \
2615        regnode *Next = regnext(scan); \
2616        regprop(RExC_rx, mysv, scan); \
2617        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2618        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2619        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2620    }});
2621
2622
2623 /* The below joins as many adjacent EXACTish nodes as possible into a single
2624  * one.  The regop may be changed if the node(s) contain certain sequences that
2625  * require special handling.  The joining is only done if:
2626  * 1) there is room in the current conglomerated node to entirely contain the
2627  *    next one.
2628  * 2) they are the exact same node type
2629  *
2630  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2631  * these get optimized out
2632  *
2633  * If a node is to match under /i (folded), the number of characters it matches
2634  * can be different than its character length if it contains a multi-character
2635  * fold.  *min_subtract is set to the total delta of the input nodes.
2636  *
2637  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2638  * and contains LATIN SMALL LETTER SHARP S
2639  *
2640  * This is as good a place as any to discuss the design of handling these
2641  * multi-character fold sequences.  It's been wrong in Perl for a very long
2642  * time.  There are three code points in Unicode whose multi-character folds
2643  * were long ago discovered to mess things up.  The previous designs for
2644  * dealing with these involved assigning a special node for them.  This
2645  * approach doesn't work, as evidenced by this example:
2646  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2647  * Both these fold to "sss", but if the pattern is parsed to create a node that
2648  * would match just the \xDF, it won't be able to handle the case where a
2649  * successful match would have to cross the node's boundary.  The new approach
2650  * that hopefully generally solves the problem generates an EXACTFU_SS node
2651  * that is "sss".
2652  *
2653  * It turns out that there are problems with all multi-character folds, and not
2654  * just these three.  Now the code is general, for all such cases, but the
2655  * three still have some special handling.  The approach taken is:
2656  * 1)   This routine examines each EXACTFish node that could contain multi-
2657  *      character fold sequences.  It returns in *min_subtract how much to
2658  *      subtract from the the actual length of the string to get a real minimum
2659  *      match length; it is 0 if there are no multi-char folds.  This delta is
2660  *      used by the caller to adjust the min length of the match, and the delta
2661  *      between min and max, so that the optimizer doesn't reject these
2662  *      possibilities based on size constraints.
2663  * 2)   Certain of these sequences require special handling by the trie code,
2664  *      so, if found, this code changes the joined node type to special ops:
2665  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2666  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2667  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2668  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2669  *      there is a possible fold length change.  That means that a regular
2670  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2671  *      with length changes, and so can be processed faster.  regexec.c takes
2672  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2673  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2674  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2675  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2676  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2677  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2678  *      possibilities for the non-UTF8 patterns are quite simple, except for
2679  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2680  *      members of a fold-pair, and arrays are set up for all of them so that
2681  *      the other member of the pair can be found quickly.  Code elsewhere in
2682  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2683  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2684  *      described in the next item.
2685  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2686  *      'ss' or not is not knowable at compile time.  It will match iff the
2687  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2688  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2689  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2690  *      described in item 3).  An assumption that the optimizer part of
2691  *      regexec.c (probably unwittingly) makes is that a character in the
2692  *      pattern corresponds to at most a single character in the target string.
2693  *      (And I do mean character, and not byte here, unlike other parts of the
2694  *      documentation that have never been updated to account for multibyte
2695  *      Unicode.)  This assumption is wrong only in this case, as all other
2696  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2697  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2698  *      reluctant to try to change this assumption, so instead the code punts.
2699  *      This routine examines EXACTF nodes for the sharp s, and returns a
2700  *      boolean indicating whether or not the node is an EXACTF node that
2701  *      contains a sharp s.  When it is true, the caller sets a flag that later
2702  *      causes the optimizer in this file to not set values for the floating
2703  *      and fixed string lengths, and thus avoids the optimizer code in
2704  *      regexec.c that makes the invalid assumption.  Thus, there is no
2705  *      optimization based on string lengths for EXACTF nodes that contain the
2706  *      sharp s.  This only happens for /id rules (which means the pattern
2707  *      isn't in UTF-8).
2708  */
2709
2710 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2711     if (PL_regkind[OP(scan)] == EXACT) \
2712         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2713
2714 STATIC U32
2715 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2716     /* Merge several consecutive EXACTish nodes into one. */
2717     regnode *n = regnext(scan);
2718     U32 stringok = 1;
2719     regnode *next = scan + NODE_SZ_STR(scan);
2720     U32 merged = 0;
2721     U32 stopnow = 0;
2722 #ifdef DEBUGGING
2723     regnode *stop = scan;
2724     GET_RE_DEBUG_FLAGS_DECL;
2725 #else
2726     PERL_UNUSED_ARG(depth);
2727 #endif
2728
2729     PERL_ARGS_ASSERT_JOIN_EXACT;
2730 #ifndef EXPERIMENTAL_INPLACESCAN
2731     PERL_UNUSED_ARG(flags);
2732     PERL_UNUSED_ARG(val);
2733 #endif
2734     DEBUG_PEEP("join",scan,depth);
2735
2736     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2737      * EXACT ones that are mergeable to the current one. */
2738     while (n
2739            && (PL_regkind[OP(n)] == NOTHING
2740                || (stringok && OP(n) == OP(scan)))
2741            && NEXT_OFF(n)
2742            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2743     {
2744
2745         if (OP(n) == TAIL || n > next)
2746             stringok = 0;
2747         if (PL_regkind[OP(n)] == NOTHING) {
2748             DEBUG_PEEP("skip:",n,depth);
2749             NEXT_OFF(scan) += NEXT_OFF(n);
2750             next = n + NODE_STEP_REGNODE;
2751 #ifdef DEBUGGING
2752             if (stringok)
2753                 stop = n;
2754 #endif
2755             n = regnext(n);
2756         }
2757         else if (stringok) {
2758             const unsigned int oldl = STR_LEN(scan);
2759             regnode * const nnext = regnext(n);
2760
2761             /* XXX I (khw) kind of doubt that this works on platforms where
2762              * U8_MAX is above 255 because of lots of other assumptions */
2763             /* Don't join if the sum can't fit into a single node */
2764             if (oldl + STR_LEN(n) > U8_MAX)
2765                 break;
2766
2767             DEBUG_PEEP("merg",n,depth);
2768             merged++;
2769
2770             NEXT_OFF(scan) += NEXT_OFF(n);
2771             STR_LEN(scan) += STR_LEN(n);
2772             next = n + NODE_SZ_STR(n);
2773             /* Now we can overwrite *n : */
2774             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2775 #ifdef DEBUGGING
2776             stop = next - 1;
2777 #endif
2778             n = nnext;
2779             if (stopnow) break;
2780         }
2781
2782 #ifdef EXPERIMENTAL_INPLACESCAN
2783         if (flags && !NEXT_OFF(n)) {
2784             DEBUG_PEEP("atch", val, depth);
2785             if (reg_off_by_arg[OP(n)]) {
2786                 ARG_SET(n, val - n);
2787             }
2788             else {
2789                 NEXT_OFF(n) = val - n;
2790             }
2791             stopnow = 1;
2792         }
2793 #endif
2794     }
2795
2796     *min_subtract = 0;
2797     *has_exactf_sharp_s = FALSE;
2798
2799     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2800      * can now analyze for sequences of problematic code points.  (Prior to
2801      * this final joining, sequences could have been split over boundaries, and
2802      * hence missed).  The sequences only happen in folding, hence for any
2803      * non-EXACT EXACTish node */
2804     if (OP(scan) != EXACT) {
2805         const U8 * const s0 = (U8*) STRING(scan);
2806         const U8 * s = s0;
2807         const U8 * const s_end = s0 + STR_LEN(scan);
2808
2809         /* One pass is made over the node's string looking for all the
2810          * possibilities.  to avoid some tests in the loop, there are two main
2811          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2812          * non-UTF-8 */
2813         if (UTF) {
2814
2815             /* Examine the string for a multi-character fold sequence.  UTF-8
2816              * patterns have all characters pre-folded by the time this code is
2817              * executed */
2818             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2819                                      length sequence we are looking for is 2 */
2820             {
2821                 int count = 0;
2822                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2823                 if (! len) {    /* Not a multi-char fold: get next char */
2824                     s += UTF8SKIP(s);
2825                     continue;
2826                 }
2827
2828                 /* Nodes with 'ss' require special handling, except for EXACTFL
2829                  * and EXACTFA for which there is no multi-char fold to this */
2830                 if (len == 2 && *s == 's' && *(s+1) == 's'
2831                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2832                 {
2833                     count = 2;
2834                     OP(scan) = EXACTFU_SS;
2835                     s += 2;
2836                 }
2837                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2838                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2839                                       COMBINING_DIAERESIS_UTF8
2840                                       COMBINING_ACUTE_ACCENT_UTF8,
2841                                    6)
2842                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2843                                          COMBINING_DIAERESIS_UTF8
2844                                          COMBINING_ACUTE_ACCENT_UTF8,
2845                                      6)))
2846                 {
2847                     count = 3;
2848
2849                     /* These two folds require special handling by trie's, so
2850                      * change the node type to indicate this.  If EXACTFA and
2851                      * EXACTFL were ever to be handled by trie's, this would
2852                      * have to be changed.  If this node has already been
2853                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2854                      * (khw) think it doesn't matter in regexec.c for UTF
2855                      * patterns, but no need to change it */
2856                     if (OP(scan) == EXACTFU) {
2857                         OP(scan) = EXACTFU_TRICKYFOLD;
2858                     }
2859                     s += 6;
2860                 }
2861                 else { /* Here is a generic multi-char fold. */
2862                     const U8* multi_end  = s + len;
2863
2864                     /* Count how many characters in it.  In the case of /l and
2865                      * /aa, no folds which contain ASCII code points are
2866                      * allowed, so check for those, and skip if found.  (In
2867                      * EXACTFL, no folds are allowed to any Latin1 code point,
2868                      * not just ASCII.  But there aren't any of these
2869                      * currently, nor ever likely, so don't take the time to
2870                      * test for them.  The code that generates the
2871                      * is_MULTI_foo() macros croaks should one actually get put
2872                      * into Unicode .) */
2873                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2874                         count = utf8_length(s, multi_end);
2875                         s = multi_end;
2876                     }
2877                     else {
2878                         while (s < multi_end) {
2879                             if (isASCII(*s)) {
2880                                 s++;
2881                                 goto next_iteration;
2882                             }
2883                             else {
2884                                 s += UTF8SKIP(s);
2885                             }
2886                             count++;
2887                         }
2888                     }
2889                 }
2890
2891                 /* The delta is how long the sequence is minus 1 (1 is how long
2892                  * the character that folds to the sequence is) */
2893                 *min_subtract += count - 1;
2894             next_iteration: ;
2895             }
2896         }
2897         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2898
2899             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2900              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2901              * nodes can't have multi-char folds to this range (and there are
2902              * no existing ones in the upper latin1 range).  In the EXACTF
2903              * case we look also for the sharp s, which can be in the final
2904              * position.  Otherwise we can stop looking 1 byte earlier because
2905              * have to find at least two characters for a multi-fold */
2906             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2907
2908             /* The below is perhaps overboard, but this allows us to save a
2909              * test each time through the loop at the expense of a mask.  This
2910              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2911              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2912              * are 64.  This uses an exclusive 'or' to find that bit and then
2913              * inverts it to form a mask, with just a single 0, in the bit
2914              * position where 'S' and 's' differ. */
2915             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2916             const U8 s_masked = 's' & S_or_s_mask;
2917
2918             while (s < upper) {
2919                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2920                 if (! len) {    /* Not a multi-char fold. */
2921                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2922                     {
2923                         *has_exactf_sharp_s = TRUE;
2924                     }
2925                     s++;
2926                     continue;
2927                 }
2928
2929                 if (len == 2
2930                     && ((*s & S_or_s_mask) == s_masked)
2931                     && ((*(s+1) & S_or_s_mask) == s_masked))
2932                 {
2933
2934                     /* EXACTF nodes need to know that the minimum length
2935                      * changed so that a sharp s in the string can match this
2936                      * ss in the pattern, but they remain EXACTF nodes, as they
2937                      * won't match this unless the target string is is UTF-8,
2938                      * which we don't know until runtime */
2939                     if (OP(scan) != EXACTF) {
2940                         OP(scan) = EXACTFU_SS;
2941                     }
2942                 }
2943
2944                 *min_subtract += len - 1;
2945                 s += len;
2946             }
2947         }
2948     }
2949
2950 #ifdef DEBUGGING
2951     /* Allow dumping but overwriting the collection of skipped
2952      * ops and/or strings with fake optimized ops */
2953     n = scan + NODE_SZ_STR(scan);
2954     while (n <= stop) {
2955         OP(n) = OPTIMIZED;
2956         FLAGS(n) = 0;
2957         NEXT_OFF(n) = 0;
2958         n++;
2959     }
2960 #endif
2961     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2962     return stopnow;
2963 }
2964
2965 /* REx optimizer.  Converts nodes into quicker variants "in place".
2966    Finds fixed substrings.  */
2967
2968 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2969    to the position after last scanned or to NULL. */
2970
2971 #define INIT_AND_WITHP \
2972     assert(!and_withp); \
2973     Newx(and_withp,1,struct regnode_charclass_class); \
2974     SAVEFREEPV(and_withp)
2975
2976 /* this is a chain of data about sub patterns we are processing that
2977    need to be handled separately/specially in study_chunk. Its so
2978    we can simulate recursion without losing state.  */
2979 struct scan_frame;
2980 typedef struct scan_frame {
2981     regnode *last;  /* last node to process in this frame */
2982     regnode *next;  /* next node to process when last is reached */
2983     struct scan_frame *prev; /*previous frame*/
2984     I32 stop; /* what stopparen do we use */
2985 } scan_frame;
2986
2987
2988 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2989
2990 STATIC I32
2991 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2992                         I32 *minlenp, I32 *deltap,
2993                         regnode *last,
2994                         scan_data_t *data,
2995                         I32 stopparen,
2996                         U8* recursed,
2997                         struct regnode_charclass_class *and_withp,
2998                         U32 flags, U32 depth)
2999                         /* scanp: Start here (read-write). */
3000                         /* deltap: Write maxlen-minlen here. */
3001                         /* last: Stop before this one. */
3002                         /* data: string data about the pattern */
3003                         /* stopparen: treat close N as END */
3004                         /* recursed: which subroutines have we recursed into */
3005                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3006 {
3007     dVAR;
3008     I32 min = 0;    /* There must be at least this number of characters to match */
3009     I32 pars = 0, code;
3010     regnode *scan = *scanp, *next;
3011     I32 delta = 0;
3012     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3013     int is_inf_internal = 0;            /* The studied chunk is infinite */
3014     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3015     scan_data_t data_fake;
3016     SV *re_trie_maxbuff = NULL;
3017     regnode *first_non_open = scan;
3018     I32 stopmin = I32_MAX;
3019     scan_frame *frame = NULL;
3020     GET_RE_DEBUG_FLAGS_DECL;
3021
3022     PERL_ARGS_ASSERT_STUDY_CHUNK;
3023
3024 #ifdef DEBUGGING
3025     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3026 #endif
3027
3028     if ( depth == 0 ) {
3029         while (first_non_open && OP(first_non_open) == OPEN)
3030             first_non_open=regnext(first_non_open);
3031     }
3032
3033
3034   fake_study_recurse:
3035     while ( scan && OP(scan) != END && scan < last ){
3036         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3037                                    node length to get a real minimum (because
3038                                    the folded version may be shorter) */
3039         bool has_exactf_sharp_s = FALSE;
3040         /* Peephole optimizer: */
3041         DEBUG_STUDYDATA("Peep:", data,depth);
3042         DEBUG_PEEP("Peep",scan,depth);
3043
3044         /* Its not clear to khw or hv why this is done here, and not in the
3045          * clauses that deal with EXACT nodes.  khw's guess is that it's
3046          * because of a previous design */
3047         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3048
3049         /* Follow the next-chain of the current node and optimize
3050            away all the NOTHINGs from it.  */
3051         if (OP(scan) != CURLYX) {
3052             const int max = (reg_off_by_arg[OP(scan)]
3053                        ? I32_MAX
3054                        /* I32 may be smaller than U16 on CRAYs! */
3055                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3056             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3057             int noff;
3058             regnode *n = scan;
3059
3060             /* Skip NOTHING and LONGJMP. */
3061             while ((n = regnext(n))
3062                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3063                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3064                    && off + noff < max)
3065                 off += noff;
3066             if (reg_off_by_arg[OP(scan)])
3067                 ARG(scan) = off;
3068             else
3069                 NEXT_OFF(scan) = off;
3070         }
3071
3072
3073
3074         /* The principal pseudo-switch.  Cannot be a switch, since we
3075            look into several different things.  */
3076         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3077                    || OP(scan) == IFTHEN) {
3078             next = regnext(scan);
3079             code = OP(scan);
3080             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3081
3082             if (OP(next) == code || code == IFTHEN) {
3083                 /* NOTE - There is similar code to this block below for handling
3084                    TRIE nodes on a re-study.  If you change stuff here check there
3085                    too. */
3086                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3087                 struct regnode_charclass_class accum;
3088                 regnode * const startbranch=scan;
3089
3090                 if (flags & SCF_DO_SUBSTR)
3091                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3092                 if (flags & SCF_DO_STCLASS)
3093                     cl_init_zero(pRExC_state, &accum);
3094
3095                 while (OP(scan) == code) {
3096                     I32 deltanext, minnext, f = 0, fake;
3097                     struct regnode_charclass_class this_class;
3098
3099                     num++;
3100                     data_fake.flags = 0;
3101                     if (data) {
3102                         data_fake.whilem_c = data->whilem_c;
3103                         data_fake.last_closep = data->last_closep;
3104                     }
3105                     else
3106                         data_fake.last_closep = &fake;
3107
3108                     data_fake.pos_delta = delta;
3109                     next = regnext(scan);
3110                     scan = NEXTOPER(scan);
3111                     if (code != BRANCH)
3112                         scan = NEXTOPER(scan);
3113                     if (flags & SCF_DO_STCLASS) {
3114                         cl_init(pRExC_state, &this_class);
3115                         data_fake.start_class = &this_class;
3116                         f = SCF_DO_STCLASS_AND;
3117                     }
3118                     if (flags & SCF_WHILEM_VISITED_POS)
3119                         f |= SCF_WHILEM_VISITED_POS;
3120
3121                     /* we suppose the run is continuous, last=next...*/
3122                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3123                                           next, &data_fake,
3124                                           stopparen, recursed, NULL, f,depth+1);
3125                     if (min1 > minnext)
3126                         min1 = minnext;
3127                     if (max1 < minnext + deltanext)
3128                         max1 = minnext + deltanext;
3129                     if (deltanext == I32_MAX)
3130                         is_inf = is_inf_internal = 1;
3131                     scan = next;
3132                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3133                         pars++;
3134                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3135                         if ( stopmin > minnext)
3136                             stopmin = min + min1;
3137                         flags &= ~SCF_DO_SUBSTR;
3138                         if (data)
3139                             data->flags |= SCF_SEEN_ACCEPT;
3140                     }
3141                     if (data) {
3142                         if (data_fake.flags & SF_HAS_EVAL)
3143                             data->flags |= SF_HAS_EVAL;
3144                         data->whilem_c = data_fake.whilem_c;
3145                     }
3146                     if (flags & SCF_DO_STCLASS)
3147                         cl_or(pRExC_state, &accum, &this_class);
3148                 }
3149                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3150                     min1 = 0;
3151                 if (flags & SCF_DO_SUBSTR) {
3152                     data->pos_min += min1;
3153                     data->pos_delta += max1 - min1;
3154                     if (max1 != min1 || is_inf)
3155                         data->longest = &(data->longest_float);
3156                 }
3157                 min += min1;
3158                 delta += max1 - min1;
3159                 if (flags & SCF_DO_STCLASS_OR) {
3160                     cl_or(pRExC_state, data->start_class, &accum);
3161                     if (min1) {
3162                         cl_and(data->start_class, and_withp);
3163                         flags &= ~SCF_DO_STCLASS;
3164                     }
3165                 }
3166                 else if (flags & SCF_DO_STCLASS_AND) {
3167                     if (min1) {
3168                         cl_and(data->start_class, &accum);
3169                         flags &= ~SCF_DO_STCLASS;
3170                     }
3171                     else {
3172                         /* Switch to OR mode: cache the old value of
3173                          * data->start_class */
3174                         INIT_AND_WITHP;
3175                         StructCopy(data->start_class, and_withp,
3176                                    struct regnode_charclass_class);
3177                         flags &= ~SCF_DO_STCLASS_AND;
3178                         StructCopy(&accum, data->start_class,
3179                                    struct regnode_charclass_class);
3180                         flags |= SCF_DO_STCLASS_OR;
3181                         SET_SSC_EOS(data->start_class);
3182                     }
3183                 }
3184
3185                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3186                 /* demq.
3187
3188                    Assuming this was/is a branch we are dealing with: 'scan' now
3189                    points at the item that follows the branch sequence, whatever
3190                    it is. We now start at the beginning of the sequence and look
3191                    for subsequences of
3192
3193                    BRANCH->EXACT=>x1
3194                    BRANCH->EXACT=>x2
3195                    tail
3196
3197                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3198
3199                    If we can find such a subsequence we need to turn the first
3200                    element into a trie and then add the subsequent branch exact
3201                    strings to the trie.
3202
3203                    We have two cases
3204
3205                      1. patterns where the whole set of branches can be converted.
3206
3207                      2. patterns where only a subset can be converted.
3208
3209                    In case 1 we can replace the whole set with a single regop
3210                    for the trie. In case 2 we need to keep the start and end
3211                    branches so
3212
3213                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3214                      becomes BRANCH TRIE; BRANCH X;
3215
3216                   There is an additional case, that being where there is a
3217                   common prefix, which gets split out into an EXACT like node
3218                   preceding the TRIE node.
3219
3220                   If x(1..n)==tail then we can do a simple trie, if not we make
3221                   a "jump" trie, such that when we match the appropriate word
3222                   we "jump" to the appropriate tail node. Essentially we turn
3223                   a nested if into a case structure of sorts.
3224
3225                 */
3226
3227                     int made=0;
3228                     if (!re_trie_maxbuff) {
3229                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3230                         if (!SvIOK(re_trie_maxbuff))
3231                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3232                     }
3233                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3234                         regnode *cur;
3235                         regnode *first = (regnode *)NULL;
3236                         regnode *last = (regnode *)NULL;
3237                         regnode *tail = scan;
3238                         U8 trietype = 0;
3239                         U32 count=0;
3240
3241 #ifdef DEBUGGING
3242                         SV * const mysv = sv_newmortal();       /* for dumping */
3243 #endif
3244                         /* var tail is used because there may be a TAIL
3245                            regop in the way. Ie, the exacts will point to the
3246                            thing following the TAIL, but the last branch will
3247                            point at the TAIL. So we advance tail. If we
3248                            have nested (?:) we may have to move through several
3249                            tails.
3250                          */
3251
3252                         while ( OP( tail ) == TAIL ) {
3253                             /* this is the TAIL generated by (?:) */
3254                             tail = regnext( tail );
3255                         }
3256
3257
3258                         DEBUG_TRIE_COMPILE_r({
3259                             regprop(RExC_rx, mysv, tail );
3260                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3261                                 (int)depth * 2 + 2, "",
3262                                 "Looking for TRIE'able sequences. Tail node is: ",
3263                                 SvPV_nolen_const( mysv )
3264                             );
3265                         });
3266
3267                         /*
3268
3269                             Step through the branches
3270                                 cur represents each branch,
3271                                 noper is the first thing to be matched as part of that branch
3272                                 noper_next is the regnext() of that node.
3273
3274                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3275                             via a "jump trie" but we also support building with NOJUMPTRIE,
3276                             which restricts the trie logic to structures like /FOO|BAR/.
3277
3278                             If noper is a trieable nodetype then the branch is a possible optimization
3279                             target. If we are building under NOJUMPTRIE then we require that noper_next
3280                             is the same as scan (our current position in the regex program).
3281
3282                             Once we have two or more consecutive such branches we can create a
3283                             trie of the EXACT's contents and stitch it in place into the program.
3284
3285                             If the sequence represents all of the branches in the alternation we
3286                             replace the entire thing with a single TRIE node.
3287
3288                             Otherwise when it is a subsequence we need to stitch it in place and
3289                             replace only the relevant branches. This means the first branch has
3290                             to remain as it is used by the alternation logic, and its next pointer,
3291                             and needs to be repointed at the item on the branch chain following
3292                             the last branch we have optimized away.
3293
3294                             This could be either a BRANCH, in which case the subsequence is internal,
3295                             or it could be the item following the branch sequence in which case the
3296                             subsequence is at the end (which does not necessarily mean the first node
3297                             is the start of the alternation).
3298
3299                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3300
3301                                 optype          |  trietype
3302                                 ----------------+-----------
3303                                 NOTHING         | NOTHING
3304                                 EXACT           | EXACT
3305                                 EXACTFU         | EXACTFU
3306                                 EXACTFU_SS      | EXACTFU
3307                                 EXACTFU_TRICKYFOLD | EXACTFU
3308                                 EXACTFA         | 0
3309
3310
3311                         */
3312 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3313                        ( EXACT == (X) )   ? EXACT :        \
3314                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3315                        0 )
3316
3317                         /* dont use tail as the end marker for this traverse */
3318                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3319                             regnode * const noper = NEXTOPER( cur );
3320                             U8 noper_type = OP( noper );
3321                             U8 noper_trietype = TRIE_TYPE( noper_type );
3322 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3323                             regnode * const noper_next = regnext( noper );
3324                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3325                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3326 #endif
3327
3328                             DEBUG_TRIE_COMPILE_r({
3329                                 regprop(RExC_rx, mysv, cur);
3330                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3331                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3332
3333                                 regprop(RExC_rx, mysv, noper);
3334                                 PerlIO_printf( Perl_debug_log, " -> %s",
3335                                     SvPV_nolen_const(mysv));
3336
3337                                 if ( noper_next ) {
3338                                   regprop(RExC_rx, mysv, noper_next );
3339                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3340                                     SvPV_nolen_const(mysv));
3341                                 }
3342                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3343                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3344                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3345                                 );
3346                             });
3347
3348                             /* Is noper a trieable nodetype that can be merged with the
3349                              * current trie (if there is one)? */
3350                             if ( noper_trietype
3351                                   &&
3352                                   (
3353                                         ( noper_trietype == NOTHING)
3354                                         || ( trietype == NOTHING )
3355                                         || ( trietype == noper_trietype )
3356                                   )
3357 #ifdef NOJUMPTRIE
3358                                   && noper_next == tail
3359 #endif
3360                                   && count < U16_MAX)
3361                             {
3362                                 /* Handle mergable triable node
3363                                  * Either we are the first node in a new trieable sequence,
3364                                  * in which case we do some bookkeeping, otherwise we update
3365                                  * the end pointer. */
3366                                 if ( !first ) {
3367                                     first = cur;
3368                                     if ( noper_trietype == NOTHING ) {
3369 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3370                                         regnode * const noper_next = regnext( noper );
3371                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3372                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3373 #endif
3374
3375                                         if ( noper_next_trietype ) {
3376                                             trietype = noper_next_trietype;
3377                                         } else if (noper_next_type)  {
3378                                             /* a NOTHING regop is 1 regop wide. We need at least two
3379                                              * for a trie so we can't merge this in */
3380                                             first = NULL;
3381                                         }
3382                                     } else {
3383                                         trietype = noper_trietype;
3384                                     }
3385                                 } else {
3386                                     if ( trietype == NOTHING )
3387                                         trietype = noper_trietype;
3388                                     last = cur;
3389                                 }
3390                                 if (first)
3391                                     count++;
3392                             } /* end handle mergable triable node */
3393                             else {
3394                                 /* handle unmergable node -
3395                                  * noper may either be a triable node which can not be tried
3396                                  * together with the current trie, or a non triable node */
3397                                 if ( last ) {
3398                                     /* If last is set and trietype is not NOTHING then we have found
3399                                      * at least two triable branch sequences in a row of a similar
3400                                      * trietype so we can turn them into a trie. If/when we
3401                                      * allow NOTHING to start a trie sequence this condition will be
3402                                      * required, and it isn't expensive so we leave it in for now. */
3403                                     if ( trietype && trietype != NOTHING )
3404                                         make_trie( pRExC_state,
3405                                                 startbranch, first, cur, tail, count,
3406                                                 trietype, depth+1 );
3407                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3408                                 }
3409                                 if ( noper_trietype
3410 #ifdef NOJUMPTRIE
3411                                      && noper_next == tail
3412 #endif
3413                                 ){
3414                                     /* noper is triable, so we can start a new trie sequence */
3415                                     count = 1;
3416                                     first = cur;
3417                                     trietype = noper_trietype;
3418                                 } else if (first) {
3419                                     /* if we already saw a first but the current node is not triable then we have
3420                                      * to reset the first information. */
3421                                     count = 0;
3422                                     first = NULL;
3423                                     trietype = 0;
3424                                 }
3425                             } /* end handle unmergable node */
3426                         } /* loop over branches */
3427                         DEBUG_TRIE_COMPILE_r({
3428                             regprop(RExC_rx, mysv, cur);
3429                             PerlIO_printf( Perl_debug_log,
3430                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3431                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3432
3433                         });
3434                         if ( last && trietype ) {
3435                             if ( trietype != NOTHING ) {
3436                                 /* the last branch of the sequence was part of a trie,
3437                                  * so we have to construct it here outside of the loop
3438                                  */
3439                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3440 #ifdef TRIE_STUDY_OPT
3441                                 if ( ((made == MADE_EXACT_TRIE &&
3442                                      startbranch == first)
3443                                      || ( first_non_open == first )) &&
3444                                      depth==0 ) {
3445                                     flags |= SCF_TRIE_RESTUDY;
3446                                     if ( startbranch == first
3447                                          && scan == tail )
3448                                     {
3449                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3450                                     }
3451                                 }
3452 #endif
3453                             } else {
3454                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3455                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3456                                  */
3457                                 if ( startbranch == first ) {
3458                                     regnode *opt;
3459                                     /* the entire thing is a NOTHING sequence, something like this:
3460                                      * (?:|) So we can turn it into a plain NOTHING op. */
3461                                     DEBUG_TRIE_COMPILE_r({
3462                                         regprop(RExC_rx, mysv, cur);
3463                                         PerlIO_printf( Perl_debug_log,
3464                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3465                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3466
3467                                     });
3468                                     OP(startbranch)= NOTHING;
3469                                     NEXT_OFF(startbranch)= tail - startbranch;
3470                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3471                                         OP(opt)= OPTIMIZED;
3472                                 }
3473                             }
3474                         } /* end if ( last) */
3475                     } /* TRIE_MAXBUF is non zero */
3476
3477                 } /* do trie */
3478
3479             }
3480             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3481                 scan = NEXTOPER(NEXTOPER(scan));
3482             } else                      /* single branch is optimized. */
3483                 scan = NEXTOPER(scan);
3484             continue;
3485         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3486             scan_frame *newframe = NULL;
3487             I32 paren;
3488             regnode *start;
3489             regnode *end;
3490
3491             if (OP(scan) != SUSPEND) {
3492             /* set the pointer */
3493                 if (OP(scan) == GOSUB) {
3494                     paren = ARG(scan);
3495                     RExC_recurse[ARG2L(scan)] = scan;
3496                     start = RExC_open_parens[paren-1];
3497                     end   = RExC_close_parens[paren-1];
3498                 } else {
3499                     paren = 0;
3500                     start = RExC_rxi->program + 1;
3501                     end   = RExC_opend;
3502                 }
3503                 if (!recursed) {
3504                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3505                     SAVEFREEPV(recursed);
3506                 }
3507                 if (!PAREN_TEST(recursed,paren+1)) {
3508                     PAREN_SET(recursed,paren+1);
3509                     Newx(newframe,1,scan_frame);
3510                 } else {
3511                     if (flags & SCF_DO_SUBSTR) {
3512                         SCAN_COMMIT(pRExC_state,data,minlenp);
3513                         data->longest = &(data->longest_float);
3514                     }
3515                     is_inf = is_inf_internal = 1;
3516                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3517                         cl_anything(pRExC_state, data->start_class);
3518                     flags &= ~SCF_DO_STCLASS;
3519                 }
3520             } else {
3521                 Newx(newframe,1,scan_frame);
3522                 paren = stopparen;
3523                 start = scan+2;
3524                 end = regnext(scan);
3525             }
3526             if (newframe) {
3527                 assert(start);
3528                 assert(end);
3529                 SAVEFREEPV(newframe);
3530                 newframe->next = regnext(scan);
3531                 newframe->last = last;
3532                 newframe->stop = stopparen;
3533                 newframe->prev = frame;
3534
3535                 frame = newframe;
3536                 scan =  start;
3537                 stopparen = paren;
3538                 last = end;
3539
3540                 continue;
3541             }
3542         }
3543         else if (OP(scan) == EXACT) {
3544             I32 l = STR_LEN(scan);
3545             UV uc;
3546             if (UTF) {
3547                 const U8 * const s = (U8*)STRING(scan);
3548                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3549                 l = utf8_length(s, s + l);
3550             } else {
3551                 uc = *((U8*)STRING(scan));
3552             }
3553             min += l;
3554             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3555                 /* The code below prefers earlier match for fixed
3556                    offset, later match for variable offset.  */
3557                 if (data->last_end == -1) { /* Update the start info. */
3558                     data->last_start_min = data->pos_min;
3559                     data->last_start_max = is_inf
3560                         ? I32_MAX : data->pos_min + data->pos_delta;
3561                 }
3562                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3563                 if (UTF)
3564                     SvUTF8_on(data->last_found);
3565                 {
3566                     SV * const sv = data->last_found;
3567                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3568                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3569                     if (mg && mg->mg_len >= 0)
3570                         mg->mg_len += utf8_length((U8*)STRING(scan),
3571                                                   (U8*)STRING(scan)+STR_LEN(scan));
3572                 }
3573                 data->last_end = data->pos_min + l;
3574                 data->pos_min += l; /* As in the first entry. */
3575                 data->flags &= ~SF_BEFORE_EOL;
3576             }
3577             if (flags & SCF_DO_STCLASS_AND) {
3578                 /* Check whether it is compatible with what we know already! */
3579                 int compat = 1;
3580
3581
3582                 /* If compatible, we or it in below.  It is compatible if is
3583                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3584                  * it's for a locale.  Even if there isn't unicode semantics
3585                  * here, at runtime there may be because of matching against a
3586                  * utf8 string, so accept a possible false positive for
3587                  * latin1-range folds */
3588                 if (uc >= 0x100 ||
3589                     (!(data->start_class->flags & ANYOF_LOCALE)
3590                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3591                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3592                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3593                     )
3594                 {
3595                     compat = 0;
3596                 }
3597                 ANYOF_CLASS_ZERO(data->start_class);
3598                 ANYOF_BITMAP_ZERO(data->start_class);
3599                 if (compat)
3600                     ANYOF_BITMAP_SET(data->start_class, uc);
3601                 else if (uc >= 0x100) {
3602                     int i;
3603
3604                     /* Some Unicode code points fold to the Latin1 range; as
3605                      * XXX temporary code, instead of figuring out if this is
3606                      * one, just assume it is and set all the start class bits
3607                      * that could be some such above 255 code point's fold
3608                      * which will generate fals positives.  As the code
3609                      * elsewhere that does compute the fold settles down, it
3610                      * can be extracted out and re-used here */
3611                     for (i = 0; i < 256; i++){
3612                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3613                             ANYOF_BITMAP_SET(data->start_class, i);
3614                         }
3615                     }
3616                 }
3617                 CLEAR_SSC_EOS(data->start_class);
3618                 if (uc < 0x100)
3619                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3620             }
3621             else if (flags & SCF_DO_STCLASS_OR) {
3622                 /* false positive possible if the class is case-folded */
3623                 if (uc < 0x100)
3624                     ANYOF_BITMAP_SET(data->start_class, uc);
3625                 else
3626                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3627                 CLEAR_SSC_EOS(data->start_class);
3628                 cl_and(data->start_class, and_withp);
3629             }
3630             flags &= ~SCF_DO_STCLASS;
3631         }
3632         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3633             I32 l = STR_LEN(scan);
3634             UV uc = *((U8*)STRING(scan));
3635
3636             /* Search for fixed substrings supports EXACT only. */
3637             if (flags & SCF_DO_SUBSTR) {
3638                 assert(data);
3639                 SCAN_COMMIT(pRExC_state, data, minlenp);
3640             }
3641             if (UTF) {
3642                 const U8 * const s = (U8 *)STRING(scan);
3643                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3644                 l = utf8_length(s, s + l);
3645             }
3646             if (has_exactf_sharp_s) {
3647                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3648             }
3649             min += l - min_subtract;
3650             assert (min >= 0);
3651             delta += min_subtract;
3652             if (flags & SCF_DO_SUBSTR) {
3653                 data->pos_min += l - min_subtract;
3654                 if (data->pos_min < 0) {
3655                     data->pos_min = 0;
3656                 }
3657                 data->pos_delta += min_subtract;
3658                 if (min_subtract) {
3659                     data->longest = &(data->longest_float);
3660                 }
3661             }
3662             if (flags & SCF_DO_STCLASS_AND) {
3663                 /* Check whether it is compatible with what we know already! */
3664                 int compat = 1;
3665                 if (uc >= 0x100 ||
3666                  (!(data->start_class->flags & ANYOF_LOCALE)
3667                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3668                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3669                 {
3670                     compat = 0;
3671                 }
3672                 ANYOF_CLASS_ZERO(data->start_class);
3673                 ANYOF_BITMAP_ZERO(data->start_class);
3674                 if (compat) {
3675                     ANYOF_BITMAP_SET(data->start_class, uc);
3676                     CLEAR_SSC_EOS(data->start_class);
3677                     if (OP(scan) == EXACTFL) {
3678                         /* XXX This set is probably no longer necessary, and
3679                          * probably wrong as LOCALE now is on in the initial
3680                          * state */
3681                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3682                     }
3683                     else {
3684
3685                         /* Also set the other member of the fold pair.  In case
3686                          * that unicode semantics is called for at runtime, use
3687                          * the full latin1 fold.  (Can't do this for locale,
3688                          * because not known until runtime) */
3689                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3690
3691                         /* All other (EXACTFL handled above) folds except under
3692                          * /iaa that include s, S, and sharp_s also may include
3693                          * the others */
3694                         if (OP(scan) != EXACTFA) {
3695                             if (uc == 's' || uc == 'S') {
3696                                 ANYOF_BITMAP_SET(data->start_class,
3697                                                  LATIN_SMALL_LETTER_SHARP_S);
3698                             }
3699                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3700                                 ANYOF_BITMAP_SET(data->start_class, 's');
3701                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3702                             }
3703                         }
3704                     }
3705                 }
3706                 else if (uc >= 0x100) {
3707                     int i;
3708                     for (i = 0; i < 256; i++){
3709                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3710                             ANYOF_BITMAP_SET(data->start_class, i);
3711                         }
3712                     }
3713                 }
3714             }
3715             else if (flags & SCF_DO_STCLASS_OR) {
3716                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3717                     /* false positive possible if the class is case-folded.
3718                        Assume that the locale settings are the same... */
3719                     if (uc < 0x100) {
3720                         ANYOF_BITMAP_SET(data->start_class, uc);
3721                         if (OP(scan) != EXACTFL) {
3722
3723                             /* And set the other member of the fold pair, but
3724                              * can't do that in locale because not known until
3725                              * run-time */
3726                             ANYOF_BITMAP_SET(data->start_class,
3727                                              PL_fold_latin1[uc]);
3728
3729                             /* All folds except under /iaa that include s, S,
3730                              * and sharp_s also may include the others */
3731                             if (OP(scan) != EXACTFA) {
3732                                 if (uc == 's' || uc == 'S') {
3733                                     ANYOF_BITMAP_SET(data->start_class,
3734                                                    LATIN_SMALL_LETTER_SHARP_S);
3735                                 }
3736                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3737                                     ANYOF_BITMAP_SET(data->start_class, 's');
3738                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3739                                 }
3740                             }
3741                         }
3742                     }
3743                     CLEAR_SSC_EOS(data->start_class);
3744                 }
3745                 cl_and(data->start_class, and_withp);
3746             }
3747             flags &= ~SCF_DO_STCLASS;
3748         }
3749         else if (REGNODE_VARIES(OP(scan))) {
3750             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3751             I32 f = flags, pos_before = 0;
3752             regnode * const oscan = scan;
3753             struct regnode_charclass_class this_class;
3754             struct regnode_charclass_class *oclass = NULL;
3755             I32 next_is_eval = 0;
3756
3757             switch (PL_regkind[OP(scan)]) {
3758             case WHILEM:                /* End of (?:...)* . */
3759                 scan = NEXTOPER(scan);
3760                 goto finish;
3761             case PLUS:
3762                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3763                     next = NEXTOPER(scan);
3764                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3765                         mincount = 1;
3766                         maxcount = REG_INFTY;
3767                         next = regnext(scan);
3768                         scan = NEXTOPER(scan);
3769                         goto do_curly;
3770                     }
3771                 }
3772                 if (flags & SCF_DO_SUBSTR)
3773                     data->pos_min++;
3774                 min++;
3775                 /* Fall through. */
3776             case STAR:
3777                 if (flags & SCF_DO_STCLASS) {
3778                     mincount = 0;
3779                     maxcount = REG_INFTY;
3780                     next = regnext(scan);
3781                     scan = NEXTOPER(scan);
3782                     goto do_curly;
3783                 }
3784                 is_inf = is_inf_internal = 1;
3785                 scan = regnext(scan);
3786                 if (flags & SCF_DO_SUBSTR) {
3787                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3788                     data->longest = &(data->longest_float);
3789                 }
3790                 goto optimize_curly_tail;
3791             case CURLY:
3792                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3793                     && (scan->flags == stopparen))
3794                 {
3795                     mincount = 1;
3796                     maxcount = 1;
3797                 } else {
3798                     mincount = ARG1(scan);
3799                     maxcount = ARG2(scan);
3800                 }
3801                 next = regnext(scan);
3802                 if (OP(scan) == CURLYX) {
3803                     I32 lp = (data ? *(data->last_closep) : 0);
3804                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3805                 }
3806                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3807                 next_is_eval = (OP(scan) == EVAL);
3808               do_curly:
3809                 if (flags & SCF_DO_SUBSTR) {
3810                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3811                     pos_before = data->pos_min;
3812                 }
3813                 if (data) {
3814                     fl = data->flags;
3815                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3816                     if (is_inf)
3817                         data->flags |= SF_IS_INF;
3818                 }
3819                 if (flags & SCF_DO_STCLASS) {
3820                     cl_init(pRExC_state, &this_class);
3821                     oclass = data->start_class;
3822                     data->start_class = &this_class;
3823                     f |= SCF_DO_STCLASS_AND;
3824                     f &= ~SCF_DO_STCLASS_OR;
3825                 }
3826                 /* Exclude from super-linear cache processing any {n,m}
3827                    regops for which the combination of input pos and regex
3828                    pos is not enough information to determine if a match
3829                    will be possible.
3830
3831                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3832                    regex pos at the \s*, the prospects for a match depend not
3833                    only on the input position but also on how many (bar\s*)
3834                    repeats into the {4,8} we are. */
3835                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3836                     f &= ~SCF_WHILEM_VISITED_POS;
3837
3838                 /* This will finish on WHILEM, setting scan, or on NULL: */
3839                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3840                                       last, data, stopparen, recursed, NULL,
3841                                       (mincount == 0
3842                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3843
3844                 if (flags & SCF_DO_STCLASS)
3845                     data->start_class = oclass;
3846                 if (mincount == 0 || minnext == 0) {
3847                     if (flags & SCF_DO_STCLASS_OR) {
3848                         cl_or(pRExC_state, data->start_class, &this_class);
3849                     }
3850                     else if (flags & SCF_DO_STCLASS_AND) {
3851                         /* Switch to OR mode: cache the old value of
3852                          * data->start_class */
3853                         INIT_AND_WITHP;
3854                         StructCopy(data->start_class, and_withp,
3855                                    struct regnode_charclass_class);
3856                         flags &= ~SCF_DO_STCLASS_AND;
3857                         StructCopy(&this_class, data->start_class,
3858                                    struct regnode_charclass_class);
3859                         flags |= SCF_DO_STCLASS_OR;
3860                         SET_SSC_EOS(data->start_class);
3861                     }
3862                 } else {                /* Non-zero len */
3863                     if (flags & SCF_DO_STCLASS_OR) {
3864                         cl_or(pRExC_state, data->start_class, &this_class);
3865                         cl_and(data->start_class, and_withp);
3866                     }
3867                     else if (flags & SCF_DO_STCLASS_AND)
3868                         cl_and(data->start_class, &this_class);
3869                     flags &= ~SCF_DO_STCLASS;
3870                 }
3871                 if (!scan)              /* It was not CURLYX, but CURLY. */
3872                     scan = next;
3873                 if ( /* ? quantifier ok, except for (?{ ... }) */
3874                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3875                     && (minnext == 0) && (deltanext == 0)
3876                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3877                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3878                 {
3879                     /* Fatal warnings may leak the regexp without this: */
3880                     SAVEFREESV(RExC_rx_sv);
3881                     ckWARNreg(RExC_parse,
3882                               "Quantifier unexpected on zero-length expression");
3883                     (void)ReREFCNT_inc(RExC_rx_sv);
3884                 }
3885
3886                 min += minnext * mincount;
3887                 is_inf_internal |= ((maxcount == REG_INFTY
3888                                      && (minnext + deltanext) > 0)
3889                                     || deltanext == I32_MAX);
3890                 is_inf |= is_inf_internal;
3891                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3892
3893                 /* Try powerful optimization CURLYX => CURLYN. */
3894                 if (  OP(oscan) == CURLYX && data
3895                       && data->flags & SF_IN_PAR
3896                       && !(data->flags & SF_HAS_EVAL)
3897                       && !deltanext && minnext == 1 ) {
3898                     /* Try to optimize to CURLYN.  */
3899                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3900                     regnode * const nxt1 = nxt;
3901 #ifdef DEBUGGING
3902                     regnode *nxt2;
3903 #endif
3904
3905                     /* Skip open. */
3906                     nxt = regnext(nxt);
3907                     if (!REGNODE_SIMPLE(OP(nxt))
3908                         && !(PL_regkind[OP(nxt)] == EXACT
3909                              && STR_LEN(nxt) == 1))
3910                         goto nogo;
3911 #ifdef DEBUGGING
3912                     nxt2 = nxt;
3913 #endif
3914                     nxt = regnext(nxt);
3915                     if (OP(nxt) != CLOSE)
3916                         goto nogo;
3917                     if (RExC_open_parens) {
3918                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3919                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3920                     }
3921                     /* Now we know that nxt2 is the only contents: */
3922                     oscan->flags = (U8)ARG(nxt);
3923                     OP(oscan) = CURLYN;
3924                     OP(nxt1) = NOTHING; /* was OPEN. */
3925
3926 #ifdef DEBUGGING
3927                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3928                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3929                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3930                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3931                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3932                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3933 #endif
3934                 }
3935               nogo:
3936
3937                 /* Try optimization CURLYX => CURLYM. */
3938                 if (  OP(oscan) == CURLYX && data
3939                       && !(data->flags & SF_HAS_PAR)
3940                       && !(data->flags & SF_HAS_EVAL)
3941                       && !deltanext     /* atom is fixed width */
3942                       && minnext != 0   /* CURLYM can't handle zero width */
3943                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3944                 ) {
3945                     /* XXXX How to optimize if data == 0? */
3946                     /* Optimize to a simpler form.  */
3947                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3948                     regnode *nxt2;
3949
3950                     OP(oscan) = CURLYM;
3951                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3952                             && (OP(nxt2) != WHILEM))
3953                         nxt = nxt2;
3954                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3955                     /* Need to optimize away parenths. */
3956                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3957                         /* Set the parenth number.  */
3958                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3959
3960                         oscan->flags = (U8)ARG(nxt);
3961                         if (RExC_open_parens) {
3962                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3963                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3964                         }
3965                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3966                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3967
3968 #ifdef DEBUGGING
3969                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3970                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3971                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3972                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3973 #endif
3974 #if 0
3975                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3976                             regnode *nnxt = regnext(nxt1);
3977                             if (nnxt == nxt) {
3978                                 if (reg_off_by_arg[OP(nxt1)])
3979                                     ARG_SET(nxt1, nxt2 - nxt1);
3980                                 else if (nxt2 - nxt1 < U16_MAX)
3981                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3982                                 else
3983                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3984                             }
3985                             nxt1 = nnxt;
3986                         }
3987 #endif
3988                         /* Optimize again: */
3989                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3990                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3991                     }
3992                     else
3993                         oscan->flags = 0;
3994                 }
3995                 else if ((OP(oscan) == CURLYX)
3996                          && (flags & SCF_WHILEM_VISITED_POS)
3997                          /* See the comment on a similar expression above.
3998                             However, this time it's not a subexpression
3999                             we care about, but the expression itself. */
4000                          && (maxcount == REG_INFTY)
4001                          && data && ++data->whilem_c < 16) {
4002                     /* This stays as CURLYX, we can put the count/of pair. */
4003                     /* Find WHILEM (as in regexec.c) */
4004                     regnode *nxt = oscan + NEXT_OFF(oscan);
4005
4006                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4007                         nxt += ARG(nxt);
4008                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4009                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4010                 }
4011                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4012                     pars++;
4013                 if (flags & SCF_DO_SUBSTR) {
4014                     SV *last_str = NULL;
4015                     int counted = mincount != 0;
4016
4017                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4018 #if defined(SPARC64_GCC_WORKAROUND)
4019                         I32 b = 0;
4020                         STRLEN l = 0;
4021                         const char *s = NULL;
4022                         I32 old = 0;
4023
4024                         if (pos_before >= data->last_start_min)
4025                             b = pos_before;
4026                         else
4027                             b = data->last_start_min;
4028
4029                         l = 0;
4030                         s = SvPV_const(data->last_found, l);
4031                         old = b - data->last_start_min;
4032
4033 #else
4034                         I32 b = pos_before >= data->last_start_min
4035                             ? pos_before : data->last_start_min;
4036                         STRLEN l;
4037                         const char * const s = SvPV_const(data->last_found, l);
4038                         I32 old = b - data->last_start_min;
4039 #endif
4040
4041                         if (UTF)
4042                             old = utf8_hop((U8*)s, old) - (U8*)s;
4043                         l -= old;
4044                         /* Get the added string: */
4045                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4046                         if (deltanext == 0 && pos_before == b) {
4047                             /* What was added is a constant string */
4048                             if (mincount > 1) {
4049                                 SvGROW(last_str, (mincount * l) + 1);
4050                                 repeatcpy(SvPVX(last_str) + l,
4051                                           SvPVX_const(last_str), l, mincount - 1);
4052                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4053                                 /* Add additional parts. */
4054                                 SvCUR_set(data->last_found,
4055                                           SvCUR(data->last_found) - l);
4056                                 sv_catsv(data->last_found, last_str);
4057                                 {
4058                                     SV * sv = data->last_found;
4059                                     MAGIC *mg =
4060                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4061                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4062                                     if (mg && mg->mg_len >= 0)
4063                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4064                                 }
4065                                 data->last_end += l * (mincount - 1);
4066                             }
4067                         } else {
4068                             /* start offset must point into the last copy */
4069                             data->last_start_min += minnext * (mincount - 1);
4070                             data->last_start_max += is_inf ? I32_MAX
4071                                 : (maxcount - 1) * (minnext + data->pos_delta);
4072                         }
4073                     }
4074                     /* It is counted once already... */
4075                     data->pos_min += minnext * (mincount - counted);
4076                     data->pos_delta += - counted * deltanext +
4077                         (minnext + deltanext) * maxcount - minnext * mincount;
4078                     if (mincount != maxcount) {
4079                          /* Cannot extend fixed substrings found inside
4080                             the group.  */
4081                         SCAN_COMMIT(pRExC_state,data,minlenp);
4082                         if (mincount && last_str) {
4083                             SV * const sv = data->last_found;
4084                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4085                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4086
4087                             if (mg)
4088                                 mg->mg_len = -1;
4089                             sv_setsv(sv, last_str);
4090                             data->last_end = data->pos_min;
4091                             data->last_start_min =
4092                                 data->pos_min - CHR_SVLEN(last_str);
4093                             data->last_start_max = is_inf
4094                                 ? I32_MAX
4095                                 : data->pos_min + data->pos_delta
4096                                 - CHR_SVLEN(last_str);
4097                         }
4098                         data->longest = &(data->longest_float);
4099                     }
4100                     SvREFCNT_dec(last_str);
4101                 }
4102                 if (data && (fl & SF_HAS_EVAL))
4103                     data->flags |= SF_HAS_EVAL;
4104               optimize_curly_tail:
4105                 if (OP(oscan) != CURLYX) {
4106                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4107                            && NEXT_OFF(next))
4108                         NEXT_OFF(oscan) += NEXT_OFF(next);
4109                 }
4110                 continue;
4111             default:                    /* REF, ANYOFV, and CLUMP only? */
4112                 if (flags & SCF_DO_SUBSTR) {
4113                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4114                     data->longest = &(data->longest_float);
4115                 }
4116                 is_inf = is_inf_internal = 1;
4117                 if (flags & SCF_DO_STCLASS_OR)
4118                     cl_anything(pRExC_state, data->start_class);
4119                 flags &= ~SCF_DO_STCLASS;
4120                 break;
4121             }
4122         }
4123         else if (OP(scan) == LNBREAK) {
4124             if (flags & SCF_DO_STCLASS) {
4125                 int value = 0;
4126                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4127                 if (flags & SCF_DO_STCLASS_AND) {
4128                     for (value = 0; value < 256; value++)
4129                         if (!is_VERTWS_cp(value))
4130                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4131                 }
4132                 else {
4133                     for (value = 0; value < 256; value++)
4134                         if (is_VERTWS_cp(value))
4135                             ANYOF_BITMAP_SET(data->start_class, value);
4136                 }
4137                 if (flags & SCF_DO_STCLASS_OR)
4138                     cl_and(data->start_class, and_withp);
4139                 flags &= ~SCF_DO_STCLASS;
4140             }
4141             min++;
4142             delta++;    /* Because of the 2 char string cr-lf */
4143             if (flags & SCF_DO_SUBSTR) {
4144                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4145                 data->pos_min += 1;
4146                 data->pos_delta += 1;
4147                 data->longest = &(data->longest_float);
4148             }
4149         }
4150         else if (REGNODE_SIMPLE(OP(scan))) {
4151             int value = 0;
4152
4153             if (flags & SCF_DO_SUBSTR) {
4154                 SCAN_COMMIT(pRExC_state,data,minlenp);
4155                 data->pos_min++;
4156             }
4157             min++;
4158             if (flags & SCF_DO_STCLASS) {
4159                 int loop_max = 256;
4160                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4161
4162                 /* Some of the logic below assumes that switching
4163                    locale on will only add false positives. */
4164                 switch (PL_regkind[OP(scan)]) {
4165                     U8 classnum;
4166
4167                 case SANY:
4168                 default:
4169 #ifdef DEBUGGING
4170                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4171 #endif
4172                  do_default:
4173                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4174                         cl_anything(pRExC_state, data->start_class);
4175                     break;
4176                 case REG_ANY:
4177                     if (OP(scan) == SANY)
4178                         goto do_default;
4179                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4180                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4181                                 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4182                         cl_anything(pRExC_state, data->start_class);
4183                     }
4184                     if (flags & SCF_DO_STCLASS_AND || !value)
4185                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4186                     break;
4187                 case ANYOF:
4188                     if (flags & SCF_DO_STCLASS_AND)
4189                         cl_and(data->start_class,
4190                                (struct regnode_charclass_class*)scan);
4191                     else
4192                         cl_or(pRExC_state, data->start_class,
4193                               (struct regnode_charclass_class*)scan);
4194                     break;
4195                 case POSIXA:
4196                     loop_max = 128;
4197                     /* FALL THROUGH */
4198                 case POSIXL:
4199                 case POSIXD:
4200                 case POSIXU:
4201                     classnum = FLAGS(scan);
4202                     if (flags & SCF_DO_STCLASS_AND) {
4203                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4204                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4205                             for (value = 0; value < loop_max; value++) {
4206                                 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4207                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4208                                 }
4209                             }
4210                         }
4211                     }
4212                     else {
4213                         if (data->start_class->flags & ANYOF_LOCALE) {
4214                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4215                         }
4216                         else {
4217
4218                         /* Even if under locale, set the bits for non-locale
4219                          * in case it isn't a true locale-node.  This will
4220                          * create false positives if it truly is locale */
4221                         for (value = 0; value < loop_max; value++) {
4222                             if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4223                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4224                             }
4225                         }
4226                         }
4227                     }
4228                     break;
4229                 case NPOSIXA:
4230                     loop_max = 128;
4231                     /* FALL THROUGH */
4232                 case NPOSIXL:
4233                 case NPOSIXU:
4234                 case NPOSIXD:
4235                     classnum = FLAGS(scan);
4236                     if (flags & SCF_DO_STCLASS_AND) {
4237                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4238                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4239                             for (value = 0; value < loop_max; value++) {
4240                                 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4241                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4242                                 }
4243                             }
4244                         }
4245                     }
4246                     else {
4247                         if (data->start_class->flags & ANYOF_LOCALE) {
4248                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4249                         }
4250                         else {
4251
4252                         /* Even if under locale, set the bits for non-locale in
4253                          * case it isn't a true locale-node.  This will create
4254                          * false positives if it truly is locale */
4255                         for (value = 0; value < loop_max; value++) {
4256                             if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4257                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4258                             }
4259                         }
4260                         if (PL_regkind[OP(scan)] == NPOSIXD) {
4261                             data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4262                         }
4263                         }
4264                     }
4265                     break;
4266                 }
4267                 if (flags & SCF_DO_STCLASS_OR)
4268                     cl_and(data->start_class, and_withp);
4269                 flags &= ~SCF_DO_STCLASS;
4270             }
4271         }
4272         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4273             data->flags |= (OP(scan) == MEOL
4274                             ? SF_BEFORE_MEOL
4275                             : SF_BEFORE_SEOL);
4276             SCAN_COMMIT(pRExC_state, data, minlenp);
4277
4278         }
4279         else if (  PL_regkind[OP(scan)] == BRANCHJ
4280                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4281                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4282                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4283             if ( OP(scan) == UNLESSM &&
4284                  scan->flags == 0 &&
4285                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4286                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4287             ) {
4288                 regnode *opt;
4289                 regnode *upto= regnext(scan);
4290                 DEBUG_PARSE_r({
4291                     SV * const mysv_val=sv_newmortal();
4292                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4293
4294                     /*DEBUG_PARSE_MSG("opfail");*/
4295                     regprop(RExC_rx, mysv_val, upto);
4296                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4297                                   SvPV_nolen_const(mysv_val),
4298                                   (IV)REG_NODE_NUM(upto),
4299                                   (IV)(upto - scan)
4300                     );
4301                 });
4302                 OP(scan) = OPFAIL;
4303                 NEXT_OFF(scan) = upto - scan;
4304                 for (opt= scan + 1; opt < upto ; opt++)
4305                     OP(opt) = OPTIMIZED;
4306                 scan= upto;
4307                 continue;
4308             }
4309             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4310                 || OP(scan) == UNLESSM )
4311             {
4312                 /* Negative Lookahead/lookbehind
4313                    In this case we can't do fixed string optimisation.
4314                 */
4315
4316                 I32 deltanext, minnext, fake = 0;
4317                 regnode *nscan;
4318                 struct regnode_charclass_class intrnl;
4319                 int f = 0;
4320
4321                 data_fake.flags = 0;
4322                 if (data) {
4323                     data_fake.whilem_c = data->whilem_c;
4324                     data_fake.last_closep = data->last_closep;
4325                 }
4326                 else
4327                     data_fake.last_closep = &fake;
4328                 data_fake.pos_delta = delta;
4329                 if ( flags & SCF_DO_STCLASS && !scan->flags
4330                      && OP(scan) == IFMATCH ) { /* Lookahead */
4331                     cl_init(pRExC_state, &intrnl);
4332                     data_fake.start_class = &intrnl;
4333                     f |= SCF_DO_STCLASS_AND;
4334                 }
4335                 if (flags & SCF_WHILEM_VISITED_POS)
4336                     f |= SCF_WHILEM_VISITED_POS;
4337                 next = regnext(scan);
4338                 nscan = NEXTOPER(NEXTOPER(scan));
4339                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4340                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4341                 if (scan->flags) {
4342                     if (deltanext) {
4343                         FAIL("Variable length lookbehind not implemented");
4344                     }
4345                     else if (minnext > (I32)U8_MAX) {
4346                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4347                     }
4348                     scan->flags = (U8)minnext;
4349                 }
4350                 if (data) {
4351                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4352                         pars++;
4353                     if (data_fake.flags & SF_HAS_EVAL)
4354                         data->flags |= SF_HAS_EVAL;
4355                     data->whilem_c = data_fake.whilem_c;
4356                 }
4357                 if (f & SCF_DO_STCLASS_AND) {
4358                     if (flags & SCF_DO_STCLASS_OR) {
4359                         /* OR before, AND after: ideally we would recurse with
4360                          * data_fake to get the AND applied by study of the
4361                          * remainder of the pattern, and then derecurse;
4362                          * *** HACK *** for now just treat as "no information".
4363                          * See [perl #56690].
4364                          */
4365                         cl_init(pRExC_state, data->start_class);
4366                     }  else {
4367                         /* AND before and after: combine and continue */
4368                         const int was = TEST_SSC_EOS(data->start_class);
4369
4370                         cl_and(data->start_class, &intrnl);
4371                         if (was)
4372                             SET_SSC_EOS(data->start_class);
4373                     }
4374                 }
4375             }
4376 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4377             else {
4378                 /* Positive Lookahead/lookbehind
4379                    In this case we can do fixed string optimisation,
4380                    but we must be careful about it. Note in the case of
4381                    lookbehind the positions will be offset by the minimum
4382                    length of the pattern, something we won't know about
4383                    until after the recurse.
4384                 */
4385                 I32 deltanext, fake = 0;
4386                 regnode *nscan;
4387                 struct regnode_charclass_class intrnl;
4388                 int f = 0;
4389                 /* We use SAVEFREEPV so that when the full compile
4390                     is finished perl will clean up the allocated
4391                     minlens when it's all done. This way we don't
4392                     have to worry about freeing them when we know
4393                     they wont be used, which would be a pain.
4394                  */
4395                 I32 *minnextp;
4396                 Newx( minnextp, 1, I32 );
4397                 SAVEFREEPV(minnextp);
4398
4399                 if (data) {
4400                     StructCopy(data, &data_fake, scan_data_t);
4401                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4402                         f |= SCF_DO_SUBSTR;
4403                         if (scan->flags)
4404                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4405                         data_fake.last_found=newSVsv(data->last_found);
4406                     }
4407                 }
4408                 else
4409                     data_fake.last_closep = &fake;
4410                 data_fake.flags = 0;
4411                 data_fake.pos_delta = delta;
4412                 if (is_inf)
4413                     data_fake.flags |= SF_IS_INF;
4414                 if ( flags & SCF_DO_STCLASS && !scan->flags
4415                      && OP(scan) == IFMATCH ) { /* Lookahead */
4416                     cl_init(pRExC_state, &intrnl);
4417                     data_fake.start_class = &intrnl;
4418                     f |= SCF_DO_STCLASS_AND;
4419                 }
4420                 if (flags & SCF_WHILEM_VISITED_POS)
4421                     f |= SCF_WHILEM_VISITED_POS;
4422                 next = regnext(scan);
4423                 nscan = NEXTOPER(NEXTOPER(scan));
4424
4425                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4426                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4427                 if (scan->flags) {
4428                     if (deltanext) {
4429                         FAIL("Variable length lookbehind not implemented");
4430                     }
4431                     else if (*minnextp > (I32)U8_MAX) {
4432                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4433                     }
4434                     scan->flags = (U8)*minnextp;
4435                 }
4436
4437                 *minnextp += min;
4438
4439                 if (f & SCF_DO_STCLASS_AND) {
4440                     const int was = TEST_SSC_EOS(data.start_class);
4441
4442                     cl_and(data->start_class, &intrnl);
4443                     if (was)
4444                         SET_SSC_EOS(data->start_class);
4445                 }
4446                 if (data) {
4447                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4448                         pars++;
4449                     if (data_fake.flags & SF_HAS_EVAL)
4450                         data->flags |= SF_HAS_EVAL;
4451                     data->whilem_c = data_fake.whilem_c;
4452                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4453                         if (RExC_rx->minlen<*minnextp)
4454                             RExC_rx->minlen=*minnextp;
4455                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4456                         SvREFCNT_dec_NN(data_fake.last_found);
4457
4458                         if ( data_fake.minlen_fixed != minlenp )
4459                         {
4460                             data->offset_fixed= data_fake.offset_fixed;
4461                             data->minlen_fixed= data_fake.minlen_fixed;
4462                             data->lookbehind_fixed+= scan->flags;
4463                         }
4464                         if ( data_fake.minlen_float != minlenp )
4465                         {
4466                             data->minlen_float= data_fake.minlen_float;
4467                             data->offset_float_min=data_fake.offset_float_min;
4468                             data->offset_float_max=data_fake.offset_float_max;
4469                             data->lookbehind_float+= scan->flags;
4470                         }
4471                     }
4472                 }
4473             }
4474 #endif
4475         }
4476         else if (OP(scan) == OPEN) {
4477             if (stopparen != (I32)ARG(scan))
4478                 pars++;
4479         }
4480         else if (OP(scan) == CLOSE) {
4481             if (stopparen == (I32)ARG(scan)) {
4482                 break;
4483             }
4484             if ((I32)ARG(scan) == is_par) {
4485                 next = regnext(scan);
4486
4487                 if ( next && (OP(next) != WHILEM) && next < last)
4488                     is_par = 0;         /* Disable optimization */
4489             }
4490             if (data)
4491                 *(data->last_closep) = ARG(scan);
4492         }
4493         else if (OP(scan) == EVAL) {
4494                 if (data)
4495                     data->flags |= SF_HAS_EVAL;
4496         }
4497         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4498             if (flags & SCF_DO_SUBSTR) {
4499                 SCAN_COMMIT(pRExC_state,data,minlenp);
4500                 flags &= ~SCF_DO_SUBSTR;
4501             }
4502             if (data && OP(scan)==ACCEPT) {
4503                 data->flags |= SCF_SEEN_ACCEPT;
4504                 if (stopmin > min)
4505                     stopmin = min;
4506             }
4507         }
4508         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4509         {
4510                 if (flags & SCF_DO_SUBSTR) {
4511                     SCAN_COMMIT(pRExC_state,data,minlenp);
4512                     data->longest = &(data->longest_float);
4513                 }
4514                 is_inf = is_inf_internal = 1;
4515                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4516                     cl_anything(pRExC_state, data->start_class);
4517                 flags &= ~SCF_DO_STCLASS;
4518         }
4519         else if (OP(scan) == GPOS) {
4520             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4521                 !(delta || is_inf || (data && data->pos_delta)))
4522             {
4523                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4524                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4525                 if (RExC_rx->gofs < (U32)min)
4526                     RExC_rx->gofs = min;
4527             } else {
4528                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4529                 RExC_rx->gofs = 0;
4530             }
4531         }
4532 #ifdef TRIE_STUDY_OPT
4533 #ifdef FULL_TRIE_STUDY
4534         else if (PL_regkind[OP(scan)] == TRIE) {
4535             /* NOTE - There is similar code to this block above for handling
4536                BRANCH nodes on the initial study.  If you change stuff here
4537                check there too. */
4538             regnode *trie_node= scan;
4539             regnode *tail= regnext(scan);
4540             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4541             I32 max1 = 0, min1 = I32_MAX;
4542             struct regnode_charclass_class accum;
4543
4544             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4545                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4546             if (flags & SCF_DO_STCLASS)
4547                 cl_init_zero(pRExC_state, &accum);
4548
4549             if (!trie->jump) {
4550                 min1= trie->minlen;
4551                 max1= trie->maxlen;
4552             } else {
4553                 const regnode *nextbranch= NULL;
4554                 U32 word;
4555
4556                 for ( word=1 ; word <= trie->wordcount ; word++)
4557                 {
4558                     I32 deltanext=0, minnext=0, f = 0, fake;
4559                     struct regnode_charclass_class this_class;
4560
4561                     data_fake.flags = 0;
4562                     if (data) {
4563                         data_fake.whilem_c = data->whilem_c;
4564                         data_fake.last_closep = data->last_closep;
4565                     }
4566                     else
4567                         data_fake.last_closep = &fake;
4568                     data_fake.pos_delta = delta;
4569                     if (flags & SCF_DO_STCLASS) {
4570                         cl_init(pRExC_state, &this_class);
4571                         data_fake.start_class = &this_class;
4572                         f = SCF_DO_STCLASS_AND;
4573                     }
4574                     if (flags & SCF_WHILEM_VISITED_POS)
4575                         f |= SCF_WHILEM_VISITED_POS;
4576
4577                     if (trie->jump[word]) {
4578                         if (!nextbranch)
4579                             nextbranch = trie_node + trie->jump[0];
4580                         scan= trie_node + trie->jump[word];
4581                         /* We go from the jump point to the branch that follows
4582                            it. Note this means we need the vestigal unused branches
4583                            even though they arent otherwise used.
4584                          */
4585                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4586                             &deltanext, (regnode *)nextbranch, &data_fake,
4587                             stopparen, recursed, NULL, f,depth+1);
4588                     }
4589                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4590                         nextbranch= regnext((regnode*)nextbranch);
4591
4592                     if (min1 > (I32)(minnext + trie->minlen))
4593                         min1 = minnext + trie->minlen;
4594                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4595                         max1 = minnext + deltanext + trie->maxlen;
4596                     if (deltanext == I32_MAX)
4597                         is_inf = is_inf_internal = 1;
4598
4599                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4600                         pars++;
4601                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4602                         if ( stopmin > min + min1)
4603                             stopmin = min + min1;
4604                         flags &= ~SCF_DO_SUBSTR;
4605                         if (data)
4606                             data->flags |= SCF_SEEN_ACCEPT;
4607                     }
4608                     if (data) {
4609                         if (data_fake.flags & SF_HAS_EVAL)
4610                             data->flags |= SF_HAS_EVAL;
4611                         data->whilem_c = data_fake.whilem_c;
4612                     }
4613                     if (flags & SCF_DO_STCLASS)
4614                         cl_or(pRExC_state, &accum, &this_class);
4615                 }
4616             }
4617             if (flags & SCF_DO_SUBSTR) {
4618                 data->pos_min += min1;
4619                 data->pos_delta += max1 - min1;
4620                 if (max1 != min1 || is_inf)
4621                     data->longest = &(data->longest_float);
4622             }
4623             min += min1;
4624             delta += max1 - min1;
4625             if (flags & SCF_DO_STCLASS_OR) {
4626                 cl_or(pRExC_state, data->start_class, &accum);
4627                 if (min1) {
4628                     cl_and(data->start_class, and_withp);
4629                     flags &= ~SCF_DO_STCLASS;
4630                 }
4631             }
4632             else if (flags & SCF_DO_STCLASS_AND) {
4633                 if (min1) {
4634                     cl_and(data->start_class, &accum);
4635                     flags &= ~SCF_DO_STCLASS;
4636                 }
4637                 else {
4638                     /* Switch to OR mode: cache the old value of
4639                      * data->start_class */
4640                     INIT_AND_WITHP;
4641                     StructCopy(data->start_class, and_withp,
4642                                struct regnode_charclass_class);
4643                     flags &= ~SCF_DO_STCLASS_AND;
4644                     StructCopy(&accum, data->start_class,
4645                                struct regnode_charclass_class);
4646                     flags |= SCF_DO_STCLASS_OR;
4647                     SET_SSC_EOS(data->start_class);
4648                 }
4649             }
4650             scan= tail;
4651             continue;
4652         }
4653 #else
4654         else if (PL_regkind[OP(scan)] == TRIE) {
4655             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4656             U8*bang=NULL;
4657
4658             min += trie->minlen;
4659             delta += (trie->maxlen - trie->minlen);
4660             flags &= ~SCF_DO_STCLASS; /* xxx */
4661             if (flags & SCF_DO_SUBSTR) {
4662                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4663                 data->pos_min += trie->minlen;
4664                 data->pos_delta += (trie->maxlen - trie->minlen);
4665                 if (trie->maxlen != trie->minlen)
4666                     data->longest = &(data->longest_float);
4667             }
4668             if (trie->jump) /* no more substrings -- for now /grr*/
4669                 flags &= ~SCF_DO_SUBSTR;
4670         }
4671 #endif /* old or new */
4672 #endif /* TRIE_STUDY_OPT */
4673
4674         /* Else: zero-length, ignore. */
4675         scan = regnext(scan);
4676     }
4677     if (frame) {
4678         last = frame->last;
4679         scan = frame->next;
4680         stopparen = frame->stop;
4681         frame = frame->prev;
4682         goto fake_study_recurse;
4683     }
4684
4685   finish:
4686     assert(!frame);
4687     DEBUG_STUDYDATA("pre-fin:",data,depth);
4688
4689     *scanp = scan;
4690     *deltap = is_inf_internal ? I32_MAX : delta;
4691     if (flags & SCF_DO_SUBSTR && is_inf)
4692         data->pos_delta = I32_MAX - data->pos_min;
4693     if (is_par > (I32)U8_MAX)
4694         is_par = 0;
4695     if (is_par && pars==1 && data) {
4696         data->flags |= SF_IN_PAR;
4697         data->flags &= ~SF_HAS_PAR;
4698     }
4699     else if (pars && data) {
4700         data->flags |= SF_HAS_PAR;
4701         data->flags &= ~SF_IN_PAR;
4702     }
4703     if (flags & SCF_DO_STCLASS_OR)
4704         cl_and(data->start_class, and_withp);
4705     if (flags & SCF_TRIE_RESTUDY)
4706         data->flags |=  SCF_TRIE_RESTUDY;
4707
4708     DEBUG_STUDYDATA("post-fin:",data,depth);
4709
4710     return min < stopmin ? min : stopmin;
4711 }
4712
4713 STATIC U32
4714 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4715 {
4716     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4717
4718     PERL_ARGS_ASSERT_ADD_DATA;
4719
4720     Renewc(RExC_rxi->data,
4721            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4722            char, struct reg_data);
4723     if(count)
4724         Renew(RExC_rxi->data->what, count + n, U8);
4725     else
4726         Newx(RExC_rxi->data->what, n, U8);
4727     RExC_rxi->data->count = count + n;
4728     Copy(s, RExC_rxi->data->what + count, n, U8);
4729     return count;
4730 }
4731
4732 /*XXX: todo make this not included in a non debugging perl */
4733 #ifndef PERL_IN_XSUB_RE
4734 void
4735 Perl_reginitcolors(pTHX)
4736 {
4737     dVAR;
4738     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4739     if (s) {
4740         char *t = savepv(s);
4741         int i = 0;
4742         PL_colors[0] = t;
4743         while (++i < 6) {
4744             t = strchr(t, '\t');
4745             if (t) {
4746                 *t = '\0';
4747                 PL_colors[i] = ++t;
4748             }
4749             else
4750                 PL_colors[i] = t = (char *)"";
4751         }
4752     } else {
4753         int i = 0;
4754         while (i < 6)
4755             PL_colors[i++] = (char *)"";
4756     }
4757     PL_colorset = 1;
4758 }
4759 #endif
4760
4761
4762 #ifdef TRIE_STUDY_OPT
4763 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4764     STMT_START {                                            \
4765         if (                                                \
4766               (data.flags & SCF_TRIE_RESTUDY)               \
4767               && ! restudied++                              \
4768         ) {                                                 \
4769             dOsomething;                                    \
4770             goto reStudy;                                   \
4771         }                                                   \
4772     } STMT_END
4773 #else
4774 #define CHECK_RESTUDY_GOTO_butfirst
4775 #endif
4776
4777 /*
4778  * pregcomp - compile a regular expression into internal code
4779  *
4780  * Decides which engine's compiler to call based on the hint currently in
4781  * scope
4782  */
4783
4784 #ifndef PERL_IN_XSUB_RE
4785
4786 /* return the currently in-scope regex engine (or the default if none)  */
4787
4788 regexp_engine const *
4789 Perl_current_re_engine(pTHX)
4790 {
4791     dVAR;
4792
4793     if (IN_PERL_COMPILETIME) {
4794         HV * const table = GvHV(PL_hintgv);
4795         SV **ptr;
4796
4797         if (!table)
4798             return &PL_core_reg_engine;
4799         ptr = hv_fetchs(table, "regcomp", FALSE);
4800         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4801             return &PL_core_reg_engine;
4802         return INT2PTR(regexp_engine*,SvIV(*ptr));
4803     }
4804     else {
4805         SV *ptr;
4806         if (!PL_curcop->cop_hints_hash)
4807             return &PL_core_reg_engine;
4808         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4809         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4810             return &PL_core_reg_engine;
4811         return INT2PTR(regexp_engine*,SvIV(ptr));
4812     }
4813 }
4814
4815
4816 REGEXP *
4817 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4818 {
4819     dVAR;
4820     regexp_engine const *eng = current_re_engine();
4821     GET_RE_DEBUG_FLAGS_DECL;
4822
4823     PERL_ARGS_ASSERT_PREGCOMP;
4824
4825     /* Dispatch a request to compile a regexp to correct regexp engine. */
4826     DEBUG_COMPILE_r({
4827         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4828                         PTR2UV(eng));
4829     });
4830     return CALLREGCOMP_ENG(eng, pattern, flags);
4831 }
4832 #endif
4833
4834 /* public(ish) entry point for the perl core's own regex compiling code.
4835  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4836  * pattern rather than a list of OPs, and uses the internal engine rather
4837  * than the current one */
4838
4839 REGEXP *
4840 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4841 {
4842     SV *pat = pattern; /* defeat constness! */
4843     PERL_ARGS_ASSERT_RE_COMPILE;
4844     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4845 #ifdef PERL_IN_XSUB_RE
4846                                 &my_reg_engine,
4847 #else
4848                                 &PL_core_reg_engine,
4849 #endif
4850                                 NULL, NULL, rx_flags, 0);
4851 }
4852
4853 /* see if there are any run-time code blocks in the pattern.
4854  * False positives are allowed */
4855
4856 static bool
4857 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4858                     U32 pm_flags, char *pat, STRLEN plen)
4859 {
4860     int n = 0;
4861     STRLEN s;
4862
4863     /* avoid infinitely recursing when we recompile the pattern parcelled up
4864      * as qr'...'. A single constant qr// string can't have have any
4865      * run-time component in it, and thus, no runtime code. (A non-qr
4866      * string, however, can, e.g. $x =~ '(?{})') */
4867     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4868         return 0;
4869
4870     for (s = 0; s < plen; s++) {
4871         if (n < pRExC_state->num_code_blocks
4872             && s == pRExC_state->code_blocks[n].start)
4873         {
4874             s = pRExC_state->code_blocks[n].end;
4875             n++;
4876             continue;
4877         }
4878         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4879          * positives here */
4880         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
4881             (pat[s+2] == '{'
4882                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
4883         )
4884             return 1;
4885     }
4886     return 0;
4887 }
4888
4889 /* Handle run-time code blocks. We will already have compiled any direct
4890  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4891  * copy of it, but with any literal code blocks blanked out and
4892  * appropriate chars escaped; then feed it into
4893  *
4894  *    eval "qr'modified_pattern'"
4895  *
4896  * For example,
4897  *
4898  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4899  *
4900  * becomes
4901  *
4902  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4903  *
4904  * After eval_sv()-ing that, grab any new code blocks from the returned qr
4905  * and merge them with any code blocks of the original regexp.
4906  *
4907  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4908  * instead, just save the qr and return FALSE; this tells our caller that
4909  * the original pattern needs upgrading to utf8.
4910  */
4911
4912 static bool
4913 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4914     char *pat, STRLEN plen)
4915 {
4916     SV *qr;
4917
4918     GET_RE_DEBUG_FLAGS_DECL;
4919
4920     if (pRExC_state->runtime_code_qr) {
4921         /* this is the second time we've been called; this should
4922          * only happen if the main pattern got upgraded to utf8
4923          * during compilation; re-use the qr we compiled first time
4924          * round (which should be utf8 too)
4925          */
4926         qr = pRExC_state->runtime_code_qr;
4927         pRExC_state->runtime_code_qr = NULL;
4928         assert(RExC_utf8 && SvUTF8(qr));
4929     }
4930     else {
4931         int n = 0;
4932         STRLEN s;
4933         char *p, *newpat;
4934         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4935         SV *sv, *qr_ref;
4936         dSP;
4937
4938         /* determine how many extra chars we need for ' and \ escaping */
4939         for (s = 0; s < plen; s++) {
4940             if (pat[s] == '\'' || pat[s] == '\\')
4941                 newlen++;
4942         }
4943
4944         Newx(newpat, newlen, char);
4945         p = newpat;
4946         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4947
4948         for (s = 0; s < plen; s++) {
4949             if (n < pRExC_state->num_code_blocks
4950                 && s == pRExC_state->code_blocks[n].start)
4951             {
4952                 /* blank out literal code block */
4953                 assert(pat[s] == '(');
4954                 while (s <= pRExC_state->code_blocks[n].end) {
4955                     *p++ = '_';
4956                     s++;
4957                 }
4958                 s--;
4959                 n++;
4960                 continue;
4961             }
4962             if (pat[s] == '\'' || pat[s] == '\\')
4963                 *p++ = '\\';
4964             *p++ = pat[s];
4965         }
4966         *p++ = '\'';
4967         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4968             *p++ = 'x';
4969         *p++ = '\0';
4970         DEBUG_COMPILE_r({
4971             PerlIO_printf(Perl_debug_log,
4972                 "%sre-parsing pattern for runtime code:%s %s\n",
4973                 PL_colors[4],PL_colors[5],newpat);
4974         });
4975
4976         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
4977         Safefree(newpat);
4978
4979         ENTER;
4980         SAVETMPS;
4981         save_re_context();
4982         PUSHSTACKi(PERLSI_REQUIRE);
4983         /* this causes the toker to collapse \\ into \ when parsing
4984          * qr''; normally only q'' does this. It also alters hints
4985          * handling */
4986         PL_reg_state.re_reparsing = TRUE;
4987         eval_sv(sv, G_SCALAR);
4988         SvREFCNT_dec_NN(sv);
4989         SPAGAIN;
4990         qr_ref = POPs;
4991         PUTBACK;
4992         {
4993             SV * const errsv = ERRSV;
4994             if (SvTRUE_NN(errsv))
4995             {
4996                 Safefree(pRExC_state->code_blocks);
4997                 /* use croak_sv ? */
4998                 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
4999             }
5000         }
5001         assert(SvROK(qr_ref));
5002         qr = SvRV(qr_ref);
5003         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5004         /* the leaving below frees the tmp qr_ref.
5005          * Give qr a life of its own */
5006         SvREFCNT_inc(qr);
5007         POPSTACK;
5008         FREETMPS;
5009         LEAVE;
5010
5011     }
5012
5013     if (!RExC_utf8 && SvUTF8(qr)) {
5014         /* first time through; the pattern got upgraded; save the
5015          * qr for the next time through */
5016         assert(!pRExC_state->runtime_code_qr);
5017         pRExC_state->runtime_code_qr = qr;
5018         return 0;
5019     }
5020
5021
5022     /* extract any code blocks within the returned qr//  */
5023
5024
5025     /* merge the main (r1) and run-time (r2) code blocks into one */
5026     {
5027         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5028         struct reg_code_block *new_block, *dst;
5029         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5030         int i1 = 0, i2 = 0;
5031
5032         if (!r2->num_code_blocks) /* we guessed wrong */
5033         {
5034             SvREFCNT_dec_NN(qr);
5035             return 1;
5036         }
5037
5038         Newx(new_block,
5039             r1->num_code_blocks + r2->num_code_blocks,
5040             struct reg_code_block);
5041         dst = new_block;
5042
5043         while (    i1 < r1->num_code_blocks
5044                 || i2 < r2->num_code_blocks)
5045         {
5046             struct reg_code_block *src;
5047             bool is_qr = 0;
5048
5049             if (i1 == r1->num_code_blocks) {
5050                 src = &r2->code_blocks[i2++];
5051                 is_qr = 1;
5052             }
5053             else if (i2 == r2->num_code_blocks)
5054                 src = &r1->code_blocks[i1++];
5055             else if (  r1->code_blocks[i1].start
5056                      < r2->code_blocks[i2].start)
5057             {
5058                 src = &r1->code_blocks[i1++];
5059                 assert(src->end < r2->code_blocks[i2].start);
5060             }
5061             else {
5062                 assert(  r1->code_blocks[i1].start
5063                        > r2->code_blocks[i2].start);
5064                 src = &r2->code_blocks[i2++];
5065                 is_qr = 1;
5066                 assert(src->end < r1->code_blocks[i1].start);
5067             }
5068
5069             assert(pat[src->start] == '(');
5070             assert(pat[src->end]   == ')');
5071             dst->start      = src->start;
5072             dst->end        = src->end;
5073             dst->block      = src->block;
5074             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5075                                     : src->src_regex;
5076             dst++;
5077         }
5078         r1->num_code_blocks += r2->num_code_blocks;
5079         Safefree(r1->code_blocks);
5080         r1->code_blocks = new_block;
5081     }
5082
5083     SvREFCNT_dec_NN(qr);
5084     return 1;
5085 }
5086
5087
5088 STATIC bool
5089 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5090 {
5091     /* This is the common code for setting up the floating and fixed length
5092      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5093      * as to whether succeeded or not */
5094
5095     I32 t,ml;
5096
5097     if (! (longest_length
5098            || (eol /* Can't have SEOL and MULTI */
5099                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5100           )
5101             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5102         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5103     {
5104         return FALSE;
5105     }
5106
5107     /* copy the information about the longest from the reg_scan_data
5108         over to the program. */
5109     if (SvUTF8(sv_longest)) {
5110         *rx_utf8 = sv_longest;
5111         *rx_substr = NULL;
5112     } else {
5113         *rx_substr = sv_longest;
5114         *rx_utf8 = NULL;
5115     }
5116     /* end_shift is how many chars that must be matched that
5117         follow this item. We calculate it ahead of time as once the
5118         lookbehind offset is added in we lose the ability to correctly
5119         calculate it.*/
5120     ml = minlen ? *(minlen) : (I32)longest_length;
5121     *rx_end_shift = ml - offset
5122         - longest_length + (SvTAIL(sv_longest) != 0)
5123         + lookbehind;
5124
5125     t = (eol/* Can't have SEOL and MULTI */
5126          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5127     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5128
5129     return TRUE;
5130 }
5131
5132 /*
5133  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5134  * regular expression into internal code.
5135  * The pattern may be passed either as:
5136  *    a list of SVs (patternp plus pat_count)
5137  *    a list of OPs (expr)
5138  * If both are passed, the SV list is used, but the OP list indicates
5139  * which SVs are actually pre-compiled code blocks
5140  *
5141  * The SVs in the list have magic and qr overloading applied to them (and
5142  * the list may be modified in-place with replacement SVs in the latter
5143  * case).
5144  *
5145  * If the pattern hasn't changed from old_re, then old_re will be
5146  * returned.
5147  *
5148  * eng is the current engine. If that engine has an op_comp method, then
5149  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5150  * do the initial concatenation of arguments and pass on to the external
5151  * engine.
5152  *
5153  * If is_bare_re is not null, set it to a boolean indicating whether the
5154  * arg list reduced (after overloading) to a single bare regex which has
5155  * been returned (i.e. /$qr/).
5156  *
5157  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5158  *
5159  * pm_flags contains the PMf_* flags, typically based on those from the
5160  * pm_flags field of the related PMOP. Currently we're only interested in
5161  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5162  *
5163  * We can't allocate space until we know how big the compiled form will be,
5164  * but we can't compile it (and thus know how big it is) until we've got a
5165  * place to put the code.  So we cheat:  we compile it twice, once with code
5166  * generation turned off and size counting turned on, and once "for real".
5167  * This also means that we don't allocate space until we are sure that the
5168  * thing really will compile successfully, and we never have to move the
5169  * code and thus invalidate pointers into it.  (Note that it has to be in
5170  * one piece because free() must be able to free it all.) [NB: not true in perl]
5171  *
5172  * Beware that the optimization-preparation code in here knows about some
5173  * of the structure of the compiled regexp.  [I'll say.]
5174  */
5175
5176 REGEXP *
5177 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5178                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5179                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5180 {
5181     dVAR;
5182     REGEXP *rx;
5183     struct regexp *r;
5184     regexp_internal *ri;
5185     STRLEN plen;
5186     char  * VOL exp;
5187     char* xend;
5188     regnode *scan;
5189     I32 flags;
5190     I32 minlen = 0;
5191     U32 rx_flags;
5192     SV * VOL pat;
5193     SV * VOL code_blocksv = NULL;
5194
5195     /* these are all flags - maybe they should be turned
5196      * into a single int with different bit masks */
5197     I32 sawlookahead = 0;
5198     I32 sawplus = 0;
5199     I32 sawopen = 0;
5200     bool used_setjump = FALSE;
5201     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5202     bool code_is_utf8 = 0;
5203     bool VOL recompile = 0;
5204     bool runtime_code = 0;
5205     U8 jump_ret = 0;
5206     dJMPENV;
5207     scan_data_t data;
5208     RExC_state_t RExC_state;
5209     RExC_state_t * const pRExC_state = &RExC_state;
5210 #ifdef TRIE_STUDY_OPT
5211     int restudied;
5212     RExC_state_t copyRExC_state;
5213 #endif
5214     GET_RE_DEBUG_FLAGS_DECL;
5215
5216     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5217
5218     DEBUG_r(if (!PL_colorset) reginitcolors());
5219
5220 #ifndef PERL_IN_XSUB_RE
5221     /* Initialize these here instead of as-needed, as is quick and avoids
5222      * having to test them each time otherwise */
5223     if (! PL_AboveLatin1) {
5224         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5225         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5226         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5227
5228         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5229                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5230         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5231                                 = _new_invlist_C_array(PosixAlnum_invlist);
5232
5233         PL_L1Posix_ptrs[_CC_ALPHA]
5234                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5235         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5236
5237         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5238         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5239
5240         /* Cased is the same as Alpha in the ASCII range */
5241         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5242         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5243
5244         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5245         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5246
5247         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5248         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5249
5250         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5251         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5252
5253         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5254         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5255
5256         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5257         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5258
5259         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5260         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5261
5262         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5263         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5264         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5265         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5266
5267         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5268         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5269
5270         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5271
5272         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5273         PL_L1Posix_ptrs[_CC_WORDCHAR]
5274                                 = _new_invlist_C_array(L1PosixWord_invlist);
5275
5276         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5277         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5278
5279         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5280     }
5281 #endif
5282
5283     pRExC_state->code_blocks = NULL;
5284     pRExC_state->num_code_blocks = 0;
5285
5286     if (is_bare_re)
5287         *is_bare_re = FALSE;
5288
5289     if (expr && (expr->op_type == OP_LIST ||
5290                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5291
5292         /* is the source UTF8, and how many code blocks are there? */
5293         OP *o;
5294         int ncode = 0;
5295
5296         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5297             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5298                 code_is_utf8 = 1;
5299             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5300                 /* count of DO blocks */
5301                 ncode++;
5302         }
5303         if (ncode) {
5304             pRExC_state->num_code_blocks = ncode;
5305             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5306         }
5307     }
5308
5309     if (pat_count) {
5310         /* handle a list of SVs */
5311
5312         SV **svp;
5313
5314         /* apply magic and RE overloading to each arg */
5315         for (svp = patternp; svp < patternp + pat_count; svp++) {
5316             SV *rx = *svp;
5317             SvGETMAGIC(rx);
5318             if (SvROK(rx) && SvAMAGIC(rx)) {
5319                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5320                 if (sv) {
5321                     if (SvROK(sv))
5322                         sv = SvRV(sv);
5323                     if (SvTYPE(sv) != SVt_REGEXP)
5324                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5325                     *svp = sv;
5326                 }
5327             }
5328         }
5329
5330         if (pat_count > 1) {
5331             /* concat multiple args and find any code block indexes */
5332
5333             OP *o = NULL;
5334             int n = 0;
5335             bool utf8 = 0;
5336             STRLEN orig_patlen = 0;
5337
5338             if (pRExC_state->num_code_blocks) {
5339                 o = cLISTOPx(expr)->op_first;
5340                 assert(   o->op_type == OP_PUSHMARK
5341                        || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5342                        || o->op_type == OP_PADRANGE);
5343                 o = o->op_sibling;
5344             }
5345
5346             pat = newSVpvn("", 0);
5347             SAVEFREESV(pat);
5348
5349             /* determine if the pattern is going to be utf8 (needed
5350              * in advance to align code block indices correctly).
5351              * XXX This could fail to be detected for an arg with
5352              * overloading but not concat overloading; but the main effect
5353              * in this obscure case is to need a 'use re eval' for a
5354              * literal code block */
5355             for (svp = patternp; svp < patternp + pat_count; svp++) {
5356                 if (SvUTF8(*svp))
5357                     utf8 = 1;
5358             }
5359             if (utf8)
5360                 SvUTF8_on(pat);
5361
5362             for (svp = patternp; svp < patternp + pat_count; svp++) {
5363                 SV *sv, *msv = *svp;
5364                 SV *rx;
5365                 bool code = 0;
5366                 /* we make the assumption here that each op in the list of
5367                  * op_siblings maps to one SV pushed onto the stack,
5368                  * except for code blocks, with have both an OP_NULL and
5369                  * and OP_CONST.
5370                  * This allows us to match up the list of SVs against the
5371                  * list of OPs to find the next code block.
5372                  *
5373                  * Note that       PUSHMARK PADSV PADSV ..
5374                  * is optimised to
5375                  *                 PADRANGE NULL  NULL  ..
5376                  * so the alignment still works. */
5377                 if (o) {
5378                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5379                         assert(n < pRExC_state->num_code_blocks);
5380                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5381                         pRExC_state->code_blocks[n].block = o;
5382                         pRExC_state->code_blocks[n].src_regex = NULL;
5383                         n++;
5384                         code = 1;
5385                         o = o->op_sibling; /* skip CONST */
5386                         assert(o);
5387                     }
5388                     o = o->op_sibling;;
5389                 }
5390
5391                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5392                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5393                 {
5394                     sv_setsv(pat, sv);
5395                     /* overloading involved: all bets are off over literal
5396                      * code. Pretend we haven't seen it */
5397                     pRExC_state->num_code_blocks -= n;
5398                     n = 0;
5399                     rx = NULL;
5400
5401                 }
5402                 else  {
5403                     while (SvAMAGIC(msv)
5404                             && (sv = AMG_CALLunary(msv, string_amg))
5405                             && sv != msv
5406                             &&  !(   SvROK(msv)
5407                                   && SvROK(sv)
5408                                   && SvRV(msv) == SvRV(sv))
5409                     ) {
5410                         msv = sv;
5411                         SvGETMAGIC(msv);
5412                     }
5413                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5414                         msv = SvRV(msv);
5415                     orig_patlen = SvCUR(pat);
5416                     sv_catsv_nomg(pat, msv);
5417                     rx = msv;
5418                     if (code)
5419                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5420                 }
5421
5422                 /* extract any code blocks within any embedded qr//'s */
5423                 if (rx && SvTYPE(rx) == SVt_REGEXP
5424                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5425                 {
5426
5427                     RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5428                     if (ri->num_code_blocks) {
5429                         int i;
5430                         /* the presence of an embedded qr// with code means
5431                          * we should always recompile: the text of the
5432                          * qr// may not have changed, but it may be a
5433                          * different closure than last time */
5434                         recompile = 1;
5435                         Renew(pRExC_state->code_blocks,
5436                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5437                             struct reg_code_block);
5438                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5439                         for (i=0; i < ri->num_code_blocks; i++) {
5440                             struct reg_code_block *src, *dst;
5441                             STRLEN offset =  orig_patlen
5442                                 + ReANY((REGEXP *)rx)->pre_prefix;
5443                             assert(n < pRExC_state->num_code_blocks);
5444                             src = &ri->code_blocks[i];
5445                             dst = &pRExC_state->code_blocks[n];
5446                             dst->start      = src->start + offset;
5447                             dst->end        = src->end   + offset;
5448                             dst->block      = src->block;
5449                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5450                                                     src->src_regex
5451                                                         ? src->src_regex
5452                                                         : (REGEXP*)rx);
5453                             n++;
5454                         }
5455                     }
5456                 }
5457             }
5458             SvSETMAGIC(pat);
5459         }
5460         else {
5461             SV *sv;
5462             pat = *patternp;
5463             while (SvAMAGIC(pat)
5464                     && (sv = AMG_CALLunary(pat, string_amg))
5465                     && sv != pat)
5466             {
5467                 pat = sv;
5468                 SvGETMAGIC(pat);
5469             }
5470         }
5471
5472         /* handle bare regex: foo =~ $re */
5473         {
5474             SV *re = pat;
5475             if (SvROK(re))
5476                 re = SvRV(re);
5477             if (SvTYPE(re) == SVt_REGEXP) {
5478                 if (is_bare_re)
5479                     *is_bare_re = TRUE;
5480                 SvREFCNT_inc(re);
5481                 Safefree(pRExC_state->code_blocks);
5482                 return (REGEXP*)re;
5483             }
5484         }
5485     }
5486     else {
5487         /* not a list of SVs, so must be a list of OPs */
5488         assert(expr);
5489         if (expr->op_type == OP_LIST) {
5490             int i = -1;
5491             bool is_code = 0;
5492             OP *o;
5493
5494             pat = newSVpvn("", 0);
5495             SAVEFREESV(pat);
5496             if (code_is_utf8)
5497                 SvUTF8_on(pat);
5498
5499             /* given a list of CONSTs and DO blocks in expr, append all
5500              * the CONSTs to pat, and record the start and end of each
5501              * code block in code_blocks[] (each DO{} op is followed by an
5502              * OP_CONST containing the corresponding literal '(?{...})
5503              * text)
5504              */
5505             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5506                 if (o->op_type == OP_CONST) {
5507                     sv_catsv(pat, cSVOPo_sv);
5508                     if (is_code) {
5509                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5510                         is_code = 0;
5511                     }
5512                 }
5513                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5514                     assert(i+1 < pRExC_state->num_code_blocks);
5515                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5516                     pRExC_state->code_blocks[i].block = o;
5517                     pRExC_state->code_blocks[i].src_regex = NULL;
5518                     is_code = 1;
5519                 }
5520             }
5521         }
5522         else {
5523             assert(expr->op_type == OP_CONST);
5524             pat = cSVOPx_sv(expr);
5525         }
5526     }
5527
5528     exp = SvPV_nomg(pat, plen);
5529
5530     if (!eng->op_comp) {
5531         if ((SvUTF8(pat) && IN_BYTES)
5532                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5533         {
5534             /* make a temporary copy; either to convert to bytes,
5535              * or to avoid repeating get-magic / overloaded stringify */
5536             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5537                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5538         }
5539         Safefree(pRExC_state->code_blocks);
5540         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5541     }
5542
5543     /* ignore the utf8ness if the pattern is 0 length */
5544     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5545     RExC_uni_semantics = 0;
5546     RExC_contains_locale = 0;
5547     pRExC_state->runtime_code_qr = NULL;
5548
5549     /****************** LONG JUMP TARGET HERE***********************/
5550     /* Longjmp back to here if have to switch in midstream to utf8 */
5551     if (! RExC_orig_utf8) {
5552         JMPENV_PUSH(jump_ret);
5553         used_setjump = TRUE;
5554     }
5555
5556     if (jump_ret == 0) {    /* First time through */
5557         xend = exp + plen;
5558
5559         DEBUG_COMPILE_r({
5560             SV *dsv= sv_newmortal();
5561             RE_PV_QUOTED_DECL(s, RExC_utf8,
5562                 dsv, exp, plen, 60);
5563             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5564                            PL_colors[4],PL_colors[5],s);
5565         });
5566     }
5567     else {  /* longjumped back */
5568         U8 *src, *dst;
5569         int n=0;
5570         STRLEN s = 0, d = 0;
5571         bool do_end = 0;
5572
5573         /* If the cause for the longjmp was other than changing to utf8, pop
5574          * our own setjmp, and longjmp to the correct handler */
5575         if (jump_ret != UTF8_LONGJMP) {
5576             JMPENV_POP;
5577             JMPENV_JUMP(jump_ret);
5578         }
5579
5580         GET_RE_DEBUG_FLAGS;
5581
5582         /* It's possible to write a regexp in ascii that represents Unicode
5583         codepoints outside of the byte range, such as via \x{100}. If we
5584         detect such a sequence we have to convert the entire pattern to utf8
5585         and then recompile, as our sizing calculation will have been based
5586         on 1 byte == 1 character, but we will need to use utf8 to encode
5587         at least some part of the pattern, and therefore must convert the whole
5588         thing.
5589         -- dmq */
5590         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5591             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5592
5593         /* upgrade pattern to UTF8, and if there are code blocks,
5594          * recalculate the indices.
5595          * This is essentially an unrolled Perl_bytes_to_utf8() */
5596
5597         src = (U8*)SvPV_nomg(pat, plen);
5598         Newx(dst, plen * 2 + 1, U8);
5599
5600         while (s < plen) {
5601             const UV uv = NATIVE_TO_ASCII(src[s]);
5602             if (UNI_IS_INVARIANT(uv))
5603                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5604             else {
5605                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5606                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5607             }
5608             if (n < pRExC_state->num_code_blocks) {
5609                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5610                     pRExC_state->code_blocks[n].start = d;
5611                     assert(dst[d] == '(');
5612                     do_end = 1;
5613                 }
5614                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5615                     pRExC_state->code_blocks[n].end = d;
5616                     assert(dst[d] == ')');
5617                     do_end = 0;
5618                     n++;
5619                 }
5620             }
5621             s++;
5622             d++;
5623         }
5624         dst[d] = '\0';
5625         plen = d;
5626         exp = (char*) dst;
5627         xend = exp + plen;
5628         SAVEFREEPV(exp);
5629         RExC_orig_utf8 = RExC_utf8 = 1;
5630     }
5631
5632     /* return old regex if pattern hasn't changed */
5633
5634     if (   old_re
5635         && !recompile
5636         && !!RX_UTF8(old_re) == !!RExC_utf8
5637         && RX_PRECOMP(old_re)
5638         && RX_PRELEN(old_re) == plen
5639         && memEQ(RX_PRECOMP(old_re), exp, plen))
5640     {
5641         /* with runtime code, always recompile */
5642         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5643                                             exp, plen);
5644         if (!runtime_code) {
5645             if (used_setjump) {
5646                 JMPENV_POP;
5647             }
5648             Safefree(pRExC_state->code_blocks);
5649             return old_re;
5650         }
5651     }
5652     else if ((pm_flags & PMf_USE_RE_EVAL)
5653                 /* this second condition covers the non-regex literal case,
5654                  * i.e.  $foo =~ '(?{})'. */
5655                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5656                     && (PL_hints & HINT_RE_EVAL))
5657     )
5658         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5659                             exp, plen);
5660
5661 #ifdef TRIE_STUDY_OPT
5662     restudied = 0;
5663 #endif
5664
5665     rx_flags = orig_rx_flags;
5666
5667     if (initial_charset == REGEX_LOCALE_CHARSET) {
5668         RExC_contains_locale = 1;
5669     }
5670     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5671
5672         /* Set to use unicode semantics if the pattern is in utf8 and has the
5673          * 'depends' charset specified, as it means unicode when utf8  */
5674         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5675     }
5676
5677     RExC_precomp = exp;
5678     RExC_flags = rx_flags;
5679     RExC_pm_flags = pm_flags;
5680
5681     if (runtime_code) {
5682         if (TAINTING_get && TAINT_get)
5683             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5684
5685         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5686             /* whoops, we have a non-utf8 pattern, whilst run-time code
5687              * got compiled as utf8. Try again with a utf8 pattern */
5688              JMPENV_JUMP(UTF8_LONGJMP);
5689         }
5690     }
5691     assert(!pRExC_state->runtime_code_qr);
5692
5693     RExC_sawback = 0;
5694
5695     RExC_seen = 0;
5696     RExC_in_lookbehind = 0;
5697     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5698     RExC_extralen = 0;
5699     RExC_override_recoding = 0;
5700     RExC_in_multi_char_class = 0;
5701
5702     /* First pass: determine size, legality. */
5703     RExC_parse = exp;
5704     RExC_start = exp;
5705     RExC_end = xend;
5706     RExC_naughty = 0;
5707     RExC_npar = 1;
5708     RExC_nestroot = 0;
5709     RExC_size = 0L;
5710     RExC_emit = &PL_regdummy;
5711     RExC_whilem_seen = 0;
5712     RExC_open_parens = NULL;
5713     RExC_close_parens = NULL;
5714     RExC_opend = NULL;
5715     RExC_paren_names = NULL;
5716 #ifdef DEBUGGING
5717     RExC_paren_name_list = NULL;
5718 #endif
5719     RExC_recurse = NULL;
5720     RExC_recurse_count = 0;
5721     pRExC_state->code_index = 0;
5722
5723 #if 0 /* REGC() is (currently) a NOP at the first pass.
5724        * Clever compilers notice this and complain. --jhi */
5725     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5726 #endif
5727     DEBUG_PARSE_r(
5728         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5729         RExC_lastnum=0;
5730         RExC_lastparse=NULL;
5731     );
5732     /* reg may croak on us, not giving us a chance to free
5733        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5734        need it to survive as long as the regexp (qr/(?{})/).
5735        We must check that code_blocksv is not already set, because we may
5736        have longjmped back. */
5737     if (pRExC_state->code_blocks && !code_blocksv) {
5738         code_blocksv = newSV_type(SVt_PV);
5739         SAVEFREESV(code_blocksv);
5740         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5741         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5742     }
5743     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5744         RExC_precomp = NULL;
5745         return(NULL);
5746     }
5747     if (code_blocksv)
5748         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5749
5750     /* Here, finished first pass.  Get rid of any added setjmp */
5751     if (used_setjump) {
5752         JMPENV_POP;
5753     }
5754
5755     DEBUG_PARSE_r({
5756         PerlIO_printf(Perl_debug_log,
5757             "Required size %"IVdf" nodes\n"
5758             "Starting second pass (creation)\n",
5759             (IV)RExC_size);
5760         RExC_lastnum=0;
5761         RExC_lastparse=NULL;
5762     });
5763
5764     /* The first pass could have found things that force Unicode semantics */
5765     if ((RExC_utf8 || RExC_uni_semantics)
5766          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5767     {
5768         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5769     }
5770
5771     /* Small enough for pointer-storage convention?
5772        If extralen==0, this means that we will not need long jumps. */
5773     if (RExC_size >= 0x10000L && RExC_extralen)
5774         RExC_size += RExC_extralen;
5775     else
5776         RExC_extralen = 0;
5777     if (RExC_whilem_seen > 15)
5778         RExC_whilem_seen = 15;
5779
5780     /* Allocate space and zero-initialize. Note, the two step process
5781        of zeroing when in debug mode, thus anything assigned has to
5782        happen after that */
5783     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5784     r = ReANY(rx);
5785     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5786          char, regexp_internal);
5787     if ( r == NULL || ri == NULL )
5788         FAIL("Regexp out of space");
5789 #ifdef DEBUGGING
5790     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5791     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5792 #else
5793     /* bulk initialize base fields with 0. */
5794     Zero(ri, sizeof(regexp_internal), char);
5795 #endif
5796
5797     /* non-zero initialization begins here */
5798     RXi_SET( r, ri );
5799     r->engine= eng;
5800     r->extflags = rx_flags;
5801     if (pm_flags & PMf_IS_QR) {
5802         ri->code_blocks = pRExC_state->code_blocks;
5803         ri->num_code_blocks = pRExC_state->num_code_blocks;
5804     }
5805     else
5806     {
5807         int n;
5808         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5809             if (pRExC_state->code_blocks[n].src_regex)
5810                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5811         SAVEFREEPV(pRExC_state->code_blocks);
5812     }
5813
5814     {
5815         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5816         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5817
5818         /* The caret is output if there are any defaults: if not all the STD
5819          * flags are set, or if no character set specifier is needed */
5820         bool has_default =
5821                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5822                     || ! has_charset);
5823         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5824         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5825                             >> RXf_PMf_STD_PMMOD_SHIFT);
5826         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5827         char *p;
5828         /* Allocate for the worst case, which is all the std flags are turned
5829          * on.  If more precision is desired, we could do a population count of
5830          * the flags set.  This could be done with a small lookup table, or by
5831          * shifting, masking and adding, or even, when available, assembly
5832          * language for a machine-language population count.
5833          * We never output a minus, as all those are defaults, so are
5834          * covered by the caret */
5835         const STRLEN wraplen = plen + has_p + has_runon
5836             + has_default       /* If needs a caret */
5837
5838                 /* If needs a character set specifier */
5839             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5840             + (sizeof(STD_PAT_MODS) - 1)
5841             + (sizeof("(?:)") - 1);
5842
5843         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5844         r->xpv_len_u.xpvlenu_pv = p;
5845         if (RExC_utf8)
5846             SvFLAGS(rx) |= SVf_UTF8;
5847         *p++='('; *p++='?';
5848
5849         /* If a default, cover it using the caret */
5850         if (has_default) {
5851             *p++= DEFAULT_PAT_MOD;
5852         }
5853         if (has_charset) {
5854             STRLEN len;
5855             const char* const name = get_regex_charset_name(r->extflags, &len);
5856             Copy(name, p, len, char);
5857             p += len;
5858         }
5859         if (has_p)
5860             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5861         {
5862             char ch;
5863             while((ch = *fptr++)) {
5864                 if(reganch & 1)
5865                     *p++ = ch;
5866                 reganch >>= 1;
5867             }
5868         }
5869
5870         *p++ = ':';
5871         Copy(RExC_precomp, p, plen, char);
5872         assert ((RX_WRAPPED(rx) - p) < 16);
5873         r->pre_prefix = p - RX_WRAPPED(rx);
5874         p += plen;
5875         if (has_runon)
5876             *p++ = '\n';
5877         *p++ = ')';
5878         *p = 0;
5879         SvCUR_set(rx, p - RX_WRAPPED(rx));
5880     }
5881
5882     r->intflags = 0;
5883     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5884
5885     if (RExC_seen & REG_SEEN_RECURSE) {
5886         Newxz(RExC_open_parens, RExC_npar,regnode *);
5887         SAVEFREEPV(RExC_open_parens);
5888         Newxz(RExC_close_parens,RExC_npar,regnode *);
5889         SAVEFREEPV(RExC_close_parens);
5890     }
5891
5892     /* Useful during FAIL. */
5893 #ifdef RE_TRACK_PATTERN_OFFSETS
5894     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5895     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5896                           "%s %"UVuf" bytes for offset annotations.\n",
5897                           ri->u.offsets ? "Got" : "Couldn't get",
5898                           (UV)((2*RExC_size+1) * sizeof(U32))));
5899 #endif
5900     SetProgLen(ri,RExC_size);
5901     RExC_rx_sv = rx;
5902     RExC_rx = r;
5903     RExC_rxi = ri;
5904
5905     /* Second pass: emit code. */
5906     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5907     RExC_pm_flags = pm_flags;
5908     RExC_parse = exp;
5909     RExC_end = xend;
5910     RExC_naughty = 0;
5911     RExC_npar = 1;
5912     RExC_emit_start = ri->program;
5913     RExC_emit = ri->program;
5914     RExC_emit_bound = ri->program + RExC_size + 1;
5915     pRExC_state->code_index = 0;
5916
5917     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5918     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5919         ReREFCNT_dec(rx);
5920         return(NULL);
5921     }
5922     /* XXXX To minimize changes to RE engine we always allocate
5923        3-units-long substrs field. */
5924     Newx(r->substrs, 1, struct reg_substr_data);
5925     if (RExC_recurse_count) {
5926         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5927         SAVEFREEPV(RExC_recurse);
5928     }
5929
5930 reStudy:
5931     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5932     Zero(r->substrs, 1, struct reg_substr_data);
5933
5934 #ifdef TRIE_STUDY_OPT
5935     if (!restudied) {
5936         StructCopy(&zero_scan_data, &data, scan_data_t);
5937         copyRExC_state = RExC_state;
5938     } else {
5939         U32 seen=RExC_seen;
5940         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5941
5942         RExC_state = copyRExC_state;
5943         if (seen & REG_TOP_LEVEL_BRANCHES)
5944             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5945         else
5946             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5947         StructCopy(&zero_scan_data, &data, scan_data_t);
5948     }
5949 #else
5950     StructCopy(&zero_scan_data, &data, scan_data_t);
5951 #endif
5952
5953     /* Dig out information for optimizations. */
5954     r->extflags = RExC_flags; /* was pm_op */
5955     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5956
5957     if (UTF)
5958         SvUTF8_on(rx);  /* Unicode in it? */
5959     ri->regstclass = NULL;
5960     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5961         r->intflags |= PREGf_NAUGHTY;
5962     scan = ri->program + 1;             /* First BRANCH. */
5963
5964     /* testing for BRANCH here tells us whether there is "must appear"
5965        data in the pattern. If there is then we can use it for optimisations */
5966     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5967         I32 fake;
5968         STRLEN longest_float_length, longest_fixed_length;
5969         struct regnode_charclass_class ch_class; /* pointed to by data */
5970         int stclass_flag;
5971         I32 last_close = 0; /* pointed to by data */
5972         regnode *first= scan;
5973         regnode *first_next= regnext(first);
5974         /*
5975          * Skip introductions and multiplicators >= 1
5976          * so that we can extract the 'meat' of the pattern that must
5977          * match in the large if() sequence following.
5978          * NOTE that EXACT is NOT covered here, as it is normally
5979          * picked up by the optimiser separately.
5980          *
5981          * This is unfortunate as the optimiser isnt handling lookahead
5982          * properly currently.
5983          *
5984          */
5985         while ((OP(first) == OPEN && (sawopen = 1)) ||
5986                /* An OR of *one* alternative - should not happen now. */
5987             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5988             /* for now we can't handle lookbehind IFMATCH*/
5989             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5990             (OP(first) == PLUS) ||
5991             (OP(first) == MINMOD) ||
5992                /* An {n,m} with n>0 */
5993             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5994             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5995         {
5996                 /*
5997                  * the only op that could be a regnode is PLUS, all the rest
5998                  * will be regnode_1 or regnode_2.
5999                  *
6000                  */
6001                 if (OP(first) == PLUS)
6002                     sawplus = 1;
6003                 else
6004                     first += regarglen[OP(first)];
6005
6006                 first = NEXTOPER(first);
6007                 first_next= regnext(first);
6008         }
6009
6010         /* Starting-point info. */
6011       again:
6012         DEBUG_PEEP("first:",first,0);
6013         /* Ignore EXACT as we deal with it later. */
6014         if (PL_regkind[OP(first)] == EXACT) {
6015             if (OP(first) == EXACT)
6016                 NOOP;   /* Empty, get anchored substr later. */
6017             else
6018                 ri->regstclass = first;
6019         }
6020 #ifdef TRIE_STCLASS
6021         else if (PL_regkind[OP(first)] == TRIE &&
6022                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6023         {
6024             regnode *trie_op;
6025             /* this can happen only on restudy */
6026             if ( OP(first) == TRIE ) {
6027                 struct regnode_1 *trieop = (struct regnode_1 *)
6028                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6029                 StructCopy(first,trieop,struct regnode_1);
6030                 trie_op=(regnode *)trieop;
6031             } else {
6032                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6033                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6034                 StructCopy(first,trieop,struct regnode_charclass);
6035                 trie_op=(regnode *)trieop;
6036             }
6037             OP(trie_op)+=2;
6038             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6039             ri->regstclass = trie_op;
6040         }
6041 #endif
6042         else if (REGNODE_SIMPLE(OP(first)))
6043             ri->regstclass = first;
6044         else if (PL_regkind[OP(first)] == BOUND ||
6045                  PL_regkind[OP(first)] == NBOUND)
6046             ri->regstclass = first;
6047         else if (PL_regkind[OP(first)] == BOL) {
6048             r->extflags |= (OP(first) == MBOL
6049                            ? RXf_ANCH_MBOL
6050                            : (OP(first) == SBOL
6051                               ? RXf_ANCH_SBOL
6052                               : RXf_ANCH_BOL));
6053             first = NEXTOPER(first);
6054             goto again;
6055         }
6056         else if (OP(first) == GPOS) {
6057             r->extflags |= RXf_ANCH_GPOS;
6058             first = NEXTOPER(first);
6059             goto again;
6060         }
6061         else if ((!sawopen || !RExC_sawback) &&
6062             (OP(first) == STAR &&
6063             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6064             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6065         {
6066             /* turn .* into ^.* with an implied $*=1 */
6067             const int type =
6068                 (OP(NEXTOPER(first)) == REG_ANY)
6069                     ? RXf_ANCH_MBOL
6070                     : RXf_ANCH_SBOL;
6071             r->extflags |= type;
6072             r->intflags |= PREGf_IMPLICIT;
6073             first = NEXTOPER(first);
6074             goto again;
6075         }
6076         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6077             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6078             /* x+ must match at the 1st pos of run of x's */
6079             r->intflags |= PREGf_SKIP;
6080
6081         /* Scan is after the zeroth branch, first is atomic matcher. */
6082 #ifdef TRIE_STUDY_OPT
6083         DEBUG_PARSE_r(
6084             if (!restudied)
6085                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6086                               (IV)(first - scan + 1))
6087         );
6088 #else
6089         DEBUG_PARSE_r(
6090             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6091                 (IV)(first - scan + 1))
6092         );
6093 #endif
6094
6095
6096         /*
6097         * If there's something expensive in the r.e., find the
6098         * longest literal string that must appear and make it the
6099         * regmust.  Resolve ties in favor of later strings, since
6100         * the regstart check works with the beginning of the r.e.
6101         * and avoiding duplication strengthens checking.  Not a
6102         * strong reason, but sufficient in the absence of others.
6103         * [Now we resolve ties in favor of the earlier string if
6104         * it happens that c_offset_min has been invalidated, since the
6105         * earlier string may buy us something the later one won't.]
6106         */
6107
6108         data.longest_fixed = newSVpvs("");
6109         data.longest_float = newSVpvs("");
6110         data.last_found = newSVpvs("");
6111         data.longest = &(data.longest_fixed);
6112         ENTER_with_name("study_chunk");
6113         SAVEFREESV(data.longest_fixed);
6114         SAVEFREESV(data.longest_float);
6115         SAVEFREESV(data.last_found);
6116         first = scan;
6117         if (!ri->regstclass) {
6118             cl_init(pRExC_state, &ch_class);
6119             data.start_class = &ch_class;
6120             stclass_flag = SCF_DO_STCLASS_AND;
6121         } else                          /* XXXX Check for BOUND? */
6122             stclass_flag = 0;
6123         data.last_closep = &last_close;
6124
6125         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6126             &data, -1, NULL, NULL,
6127             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6128
6129
6130         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6131
6132
6133         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6134              && data.last_start_min == 0 && data.last_end > 0
6135              && !RExC_seen_zerolen
6136              && !(RExC_seen & REG_SEEN_VERBARG)
6137              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6138             r->extflags |= RXf_CHECK_ALL;
6139         scan_commit(pRExC_state, &data,&minlen,0);
6140
6141         longest_float_length = CHR_SVLEN(data.longest_float);
6142
6143         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6144                    && data.offset_fixed == data.offset_float_min
6145                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6146             && S_setup_longest (aTHX_ pRExC_state,
6147                                     data.longest_float,
6148                                     &(r->float_utf8),
6149                                     &(r->float_substr),
6150                                     &(r->float_end_shift),
6151                                     data.lookbehind_float,
6152                                     data.offset_float_min,
6153                                     data.minlen_float,
6154                                     longest_float_length,
6155                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6156                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6157         {
6158             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6159             r->float_max_offset = data.offset_float_max;
6160             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6161                 r->float_max_offset -= data.lookbehind_float;
6162             SvREFCNT_inc_simple_void_NN(data.longest_float);
6163         }
6164         else {
6165             r->float_substr = r->float_utf8 = NULL;
6166             longest_float_length = 0;
6167         }
6168
6169         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6170
6171         if (S_setup_longest (aTHX_ pRExC_state,
6172                                 data.longest_fixed,
6173                                 &(r->anchored_utf8),
6174                                 &(r->anchored_substr),
6175                                 &(r->anchored_end_shift),
6176                                 data.lookbehind_fixed,
6177                                 data.offset_fixed,
6178                                 data.minlen_fixed,
6179                                 longest_fixed_length,
6180                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6181                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6182         {
6183             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6184             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6185         }
6186         else {
6187             r->anchored_substr = r->anchored_utf8 = NULL;
6188             longest_fixed_length = 0;
6189         }
6190         LEAVE_with_name("study_chunk");
6191
6192         if (ri->regstclass
6193             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6194             ri->regstclass = NULL;
6195
6196         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6197             && stclass_flag
6198             && ! TEST_SSC_EOS(data.start_class)
6199             && !cl_is_anything(data.start_class))
6200         {
6201             const U32 n = add_data(pRExC_state, 1, "f");
6202             OP(data.start_class) = ANYOF_SYNTHETIC;
6203
6204             Newx(RExC_rxi->data->data[n], 1,
6205                 struct regnode_charclass_class);
6206             StructCopy(data.start_class,
6207                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6208                        struct regnode_charclass_class);
6209             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6210             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6211             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6212                       regprop(r, sv, (regnode*)data.start_class);
6213                       PerlIO_printf(Perl_debug_log,
6214                                     "synthetic stclass \"%s\".\n",
6215                                     SvPVX_const(sv));});
6216         }
6217
6218         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6219         if (longest_fixed_length > longest_float_length) {
6220             r->check_end_shift = r->anchored_end_shift;
6221             r->check_substr = r->anchored_substr;
6222             r->check_utf8 = r->anchored_utf8;
6223             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6224             if (r->extflags & RXf_ANCH_SINGLE)
6225                 r->extflags |= RXf_NOSCAN;
6226         }
6227         else {
6228             r->check_end_shift = r->float_end_shift;
6229             r->check_substr = r->float_substr;
6230             r->check_utf8 = r->float_utf8;
6231             r->check_offset_min = r->float_min_offset;
6232             r->check_offset_max = r->float_max_offset;
6233         }
6234         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6235            This should be changed ASAP!  */
6236         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6237             r->extflags |= RXf_USE_INTUIT;
6238             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6239                 r->extflags |= RXf_INTUIT_TAIL;
6240         }
6241         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6242         if ( (STRLEN)minlen < longest_float_length )
6243             minlen= longest_float_length;
6244         if ( (STRLEN)minlen < longest_fixed_length )
6245             minlen= longest_fixed_length;
6246         */
6247     }
6248     else {
6249         /* Several toplevels. Best we can is to set minlen. */
6250         I32 fake;
6251         struct regnode_charclass_class ch_class;
6252         I32 last_close = 0;
6253
6254         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6255
6256         scan = ri->program + 1;
6257         cl_init(pRExC_state, &ch_class);
6258         data.start_class = &ch_class;
6259         data.last_closep = &last_close;
6260
6261
6262         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6263             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6264
6265         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6266
6267         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6268                 = r->float_substr = r->float_utf8 = NULL;
6269
6270         if (! TEST_SSC_EOS(data.start_class)
6271             && !cl_is_anything(data.start_class))
6272         {
6273             const U32 n = add_data(pRExC_state, 1, "f");
6274             OP(data.start_class) = ANYOF_SYNTHETIC;
6275
6276             Newx(RExC_rxi->data->data[n], 1,
6277                 struct regnode_charclass_class);
6278             StructCopy(data.start_class,
6279                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6280                        struct regnode_charclass_class);
6281             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6282             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6283             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6284                       regprop(r, sv, (regnode*)data.start_class);
6285                       PerlIO_printf(Perl_debug_log,
6286                                     "synthetic stclass \"%s\".\n",
6287                                     SvPVX_const(sv));});
6288         }
6289     }
6290
6291     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6292        the "real" pattern. */
6293     DEBUG_OPTIMISE_r({
6294         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6295                       (IV)minlen, (IV)r->minlen);
6296     });
6297     r->minlenret = minlen;
6298     if (r->minlen < minlen)
6299         r->minlen = minlen;
6300
6301     if (RExC_seen & REG_SEEN_GPOS)
6302         r->extflags |= RXf_GPOS_SEEN;
6303     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6304         r->extflags |= RXf_LOOKBEHIND_SEEN;
6305     if (pRExC_state->num_code_blocks)
6306         r->extflags |= RXf_EVAL_SEEN;
6307     if (RExC_seen & REG_SEEN_CANY)
6308         r->extflags |= RXf_CANY_SEEN;
6309     if (RExC_seen & REG_SEEN_VERBARG)
6310     {
6311         r->intflags |= PREGf_VERBARG_SEEN;
6312         r->extflags |= RXf_MODIFIES_VARS;
6313     }
6314     if (RExC_seen & REG_SEEN_CUTGROUP)
6315         r->intflags |= PREGf_CUTGROUP_SEEN;
6316     if (pm_flags & PMf_USE_RE_EVAL)
6317         r->intflags |= PREGf_USE_RE_EVAL;
6318     if (RExC_paren_names)
6319         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6320     else
6321         RXp_PAREN_NAMES(r) = NULL;
6322
6323 #ifdef STUPID_PATTERN_CHECKS
6324     if (RX_PRELEN(rx) == 0)
6325         r->extflags |= RXf_NULL;
6326     if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6327         r->extflags |= RXf_WHITE;
6328     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6329         r->extflags |= RXf_START_ONLY;
6330 #else
6331     {
6332         regnode *first = ri->program + 1;
6333         U8 fop = OP(first);
6334
6335         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6336             r->extflags |= RXf_NULL;
6337         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6338             r->extflags |= RXf_START_ONLY;
6339         else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6340                              && OP(regnext(first)) == END)
6341             r->extflags |= RXf_WHITE;
6342     }
6343 #endif
6344 #ifdef DEBUGGING
6345     if (RExC_paren_names) {
6346         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6347         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6348     } else
6349 #endif
6350         ri->name_list_idx = 0;
6351
6352     if (RExC_recurse_count) {
6353         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6354             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6355             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6356         }
6357     }
6358     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6359     /* assume we don't need to swap parens around before we match */
6360
6361     DEBUG_DUMP_r({
6362         PerlIO_printf(Perl_debug_log,"Final program:\n");
6363         regdump(r);
6364     });
6365 #ifdef RE_TRACK_PATTERN_OFFSETS
6366     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6367         const U32 len = ri->u.offsets[0];
6368         U32 i;
6369         GET_RE_DEBUG_FLAGS_DECL;
6370         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6371         for (i = 1; i <= len; i++) {
6372             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6373                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6374                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6375             }
6376         PerlIO_printf(Perl_debug_log, "\n");
6377     });
6378 #endif
6379
6380 #ifdef USE_ITHREADS
6381     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6382      * by setting the regexp SV to readonly-only instead. If the
6383      * pattern's been recompiled, the USEDness should remain. */
6384     if (old_re && SvREADONLY(old_re))
6385         SvREADONLY_on(rx);
6386 #endif
6387     return rx;
6388 }
6389
6390
6391 SV*
6392 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6393                     const U32 flags)
6394 {
6395     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6396
6397     PERL_UNUSED_ARG(value);
6398
6399     if (flags & RXapif_FETCH) {
6400         return reg_named_buff_fetch(rx, key, flags);
6401     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6402         Perl_croak_no_modify();
6403         return NULL;
6404     } else if (flags & RXapif_EXISTS) {
6405         return reg_named_buff_exists(rx, key, flags)
6406             ? &PL_sv_yes
6407             : &PL_sv_no;
6408     } else if (flags & RXapif_REGNAMES) {
6409         return reg_named_buff_all(rx, flags);
6410     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6411         return reg_named_buff_scalar(rx, flags);
6412     } else {
6413         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6414         return NULL;
6415     }
6416 }
6417
6418 SV*
6419 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6420                          const U32 flags)
6421 {
6422     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6423     PERL_UNUSED_ARG(lastkey);
6424
6425     if (flags & RXapif_FIRSTKEY)
6426         return reg_named_buff_firstkey(rx, flags);
6427     else if (flags & RXapif_NEXTKEY)
6428         return reg_named_buff_nextkey(rx, flags);
6429     else {
6430         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6431         return NULL;
6432     }
6433 }
6434
6435 SV*
6436 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6437                           const U32 flags)
6438 {
6439     AV *retarray = NULL;
6440     SV *ret;
6441     struct regexp *const rx = ReANY(r);
6442
6443     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6444
6445     if (flags & RXapif_ALL)
6446         retarray=newAV();
6447
6448     if (rx && RXp_PAREN_NAMES(rx)) {
6449         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6450         if (he_str) {
6451             IV i;
6452             SV* sv_dat=HeVAL(he_str);
6453             I32 *nums=(I32*)SvPVX(sv_dat);
6454             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6455                 if ((I32)(rx->nparens) >= nums[i]
6456                     && rx->offs[nums[i]].start != -1
6457                     && rx->offs[nums[i]].end != -1)
6458                 {
6459                     ret = newSVpvs("");
6460                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6461                     if (!retarray)
6462                         return ret;
6463                 } else {
6464                     if (retarray)
6465                         ret = newSVsv(&PL_sv_undef);
6466                 }
6467                 if (retarray)
6468                     av_push(retarray, ret);
6469             }
6470             if (retarray)
6471                 return newRV_noinc(MUTABLE_SV(retarray));
6472         }
6473     }
6474     return NULL;
6475 }
6476
6477 bool
6478 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6479                            const U32 flags)
6480 {
6481     struct regexp *const rx = ReANY(r);
6482
6483     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6484
6485     if (rx && RXp_PAREN_NAMES(rx)) {
6486         if (flags & RXapif_ALL) {
6487             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6488         } else {
6489             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6490             if (sv) {
6491                 SvREFCNT_dec_NN(sv);
6492                 return TRUE;
6493             } else {
6494                 return FALSE;
6495             }
6496         }
6497     } else {
6498         return FALSE;
6499     }
6500 }
6501
6502 SV*
6503 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6504 {
6505     struct regexp *const rx = ReANY(r);
6506
6507     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6508
6509     if ( rx && RXp_PAREN_NAMES(rx) ) {
6510         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6511
6512         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6513     } else {
6514         return FALSE;
6515     }
6516 }
6517
6518 SV*
6519 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6520 {
6521     struct regexp *const rx = ReANY(r);
6522     GET_RE_DEBUG_FLAGS_DECL;
6523
6524     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6525
6526     if (rx && RXp_PAREN_NAMES(rx)) {
6527         HV *hv = RXp_PAREN_NAMES(rx);
6528         HE *temphe;
6529         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6530             IV i;
6531             IV parno = 0;
6532             SV* sv_dat = HeVAL(temphe);
6533             I32 *nums = (I32*)SvPVX(sv_dat);
6534             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6535                 if ((I32)(rx->lastparen) >= nums[i] &&
6536                     rx->offs[nums[i]].start != -1 &&
6537                     rx->offs[nums[i]].end != -1)
6538                 {
6539                     parno = nums[i];
6540                     break;
6541                 }
6542             }
6543             if (parno || flags & RXapif_ALL) {
6544                 return newSVhek(HeKEY_hek(temphe));
6545             }
6546         }
6547     }
6548     return NULL;
6549 }
6550
6551 SV*
6552 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6553 {
6554     SV *ret;
6555     AV *av;
6556     I32 length;
6557     struct regexp *const rx = ReANY(r);
6558
6559     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6560
6561     if (rx && RXp_PAREN_NAMES(rx)) {
6562         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6563             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6564         } else if (flags & RXapif_ONE) {
6565             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6566             av = MUTABLE_AV(SvRV(ret));
6567             length = av_len(av);
6568             SvREFCNT_dec_NN(ret);
6569             return newSViv(length + 1);
6570         } else {
6571             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6572             return NULL;
6573         }
6574     }
6575     return &PL_sv_undef;
6576 }
6577
6578 SV*
6579 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6580 {
6581     struct regexp *const rx = ReANY(r);
6582     AV *av = newAV();
6583
6584     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6585
6586     if (rx && RXp_PAREN_NAMES(rx)) {
6587         HV *hv= RXp_PAREN_NAMES(rx);
6588         HE *temphe;
6589         (void)hv_iterinit(hv);
6590         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6591             IV i;
6592             IV parno = 0;
6593             SV* sv_dat = HeVAL(temphe);
6594             I32 *nums = (I32*)SvPVX(sv_dat);
6595             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6596                 if ((I32)(rx->lastparen) >= nums[i] &&
6597                     rx->offs[nums[i]].start != -1 &&
6598                     rx->offs[nums[i]].end != -1)
6599                 {
6600                     parno = nums[i];
6601                     break;
6602                 }
6603             }
6604             if (parno || flags & RXapif_ALL) {
6605                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6606             }
6607         }
6608     }
6609
6610     return newRV_noinc(MUTABLE_SV(av));
6611 }
6612
6613 void
6614 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6615                              SV * const sv)
6616 {
6617     struct regexp *const rx = ReANY(r);
6618     char *s = NULL;
6619     I32 i = 0;
6620     I32 s1, t1;
6621     I32 n = paren;
6622
6623     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6624
6625     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6626            || n == RX_BUFF_IDX_CARET_FULLMATCH
6627            || n == RX_BUFF_IDX_CARET_POSTMATCH
6628          )
6629          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6630     )
6631         goto ret_undef;
6632
6633     if (!rx->subbeg)
6634         goto ret_undef;
6635
6636     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6637         /* no need to distinguish between them any more */
6638         n = RX_BUFF_IDX_FULLMATCH;
6639
6640     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6641         && rx->offs[0].start != -1)
6642     {
6643         /* $`, ${^PREMATCH} */
6644         i = rx->offs[0].start;
6645         s = rx->subbeg;
6646     }
6647     else
6648     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6649         && rx->offs[0].end != -1)
6650     {
6651         /* $', ${^POSTMATCH} */
6652         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6653         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6654     }
6655     else
6656     if ( 0 <= n && n <= (I32)rx->nparens &&
6657         (s1 = rx->offs[n].start) != -1 &&
6658         (t1 = rx->offs[n].end) != -1)
6659     {
6660         /* $&, ${^MATCH},  $1 ... */
6661         i = t1 - s1;
6662         s = rx->subbeg + s1 - rx->suboffset;
6663     } else {
6664         goto ret_undef;
6665     }
6666
6667     assert(s >= rx->subbeg);
6668     assert(rx->sublen >= (s - rx->subbeg) + i );
6669     if (i >= 0) {
6670 #if NO_TAINT_SUPPORT
6671         sv_setpvn(sv, s, i);
6672 #else
6673         const int oldtainted = TAINT_get;
6674         TAINT_NOT;
6675         sv_setpvn(sv, s, i);
6676         TAINT_set(oldtainted);
6677 #endif
6678         if ( (rx->extflags & RXf_CANY_SEEN)
6679             ? (RXp_MATCH_UTF8(rx)
6680                         && (!i || is_utf8_string((U8*)s, i)))
6681             : (RXp_MATCH_UTF8(rx)) )
6682         {
6683             SvUTF8_on(sv);
6684         }
6685         else
6686             SvUTF8_off(sv);
6687         if (TAINTING_get) {
6688             if (RXp_MATCH_TAINTED(rx)) {
6689                 if (SvTYPE(sv) >= SVt_PVMG) {
6690                     MAGIC* const mg = SvMAGIC(sv);
6691                     MAGIC* mgt;
6692                     TAINT;
6693                     SvMAGIC_set(sv, mg->mg_moremagic);
6694                     SvTAINT(sv);
6695                     if ((mgt = SvMAGIC(sv))) {
6696                         mg->mg_moremagic = mgt;
6697                         SvMAGIC_set(sv, mg);
6698                     }
6699                 } else {
6700                     TAINT;
6701                     SvTAINT(sv);
6702                 }
6703             } else
6704                 SvTAINTED_off(sv);
6705         }
6706     } else {
6707       ret_undef:
6708         sv_setsv(sv,&PL_sv_undef);
6709         return;
6710     }
6711 }
6712
6713 void
6714 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6715                                                          SV const * const value)
6716 {
6717     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6718
6719     PERL_UNUSED_ARG(rx);
6720     PERL_UNUSED_ARG(paren);
6721     PERL_UNUSED_ARG(value);
6722
6723     if (!PL_localizing)
6724         Perl_croak_no_modify();
6725 }
6726
6727 I32
6728 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6729                               const I32 paren)
6730 {
6731     struct regexp *const rx = ReANY(r);
6732     I32 i;
6733     I32 s1, t1;
6734
6735     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6736
6737     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6738     switch (paren) {
6739       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6740          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6741             goto warn_undef;
6742         /*FALLTHROUGH*/
6743
6744       case RX_BUFF_IDX_PREMATCH:       /* $` */
6745         if (rx->offs[0].start != -1) {
6746                         i = rx->offs[0].start;
6747                         if (i > 0) {
6748                                 s1 = 0;
6749                                 t1 = i;
6750                                 goto getlen;
6751                         }
6752             }
6753         return 0;
6754
6755       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6756          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6757             goto warn_undef;
6758       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6759             if (rx->offs[0].end != -1) {
6760                         i = rx->sublen - rx->offs[0].end;
6761                         if (i > 0) {
6762                                 s1 = rx->offs[0].end;
6763                                 t1 = rx->sublen;
6764                                 goto getlen;
6765                         }
6766             }
6767         return 0;
6768
6769       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6770          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6771             goto warn_undef;
6772         /*FALLTHROUGH*/
6773
6774       /* $& / ${^MATCH}, $1, $2, ... */
6775       default:
6776             if (paren <= (I32)rx->nparens &&
6777             (s1 = rx->offs[paren].start) != -1 &&
6778             (t1 = rx->offs[paren].end) != -1)
6779             {
6780             i = t1 - s1;
6781             goto getlen;
6782         } else {
6783           warn_undef:
6784             if (ckWARN(WARN_UNINITIALIZED))
6785                 report_uninit((const SV *)sv);
6786             return 0;
6787         }
6788     }
6789   getlen:
6790     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6791         const char * const s = rx->subbeg - rx->suboffset + s1;
6792         const U8 *ep;
6793         STRLEN el;
6794
6795         i = t1 - s1;
6796         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6797                         i = el;
6798     }
6799     return i;
6800 }
6801
6802 SV*
6803 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6804 {
6805     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6806         PERL_UNUSED_ARG(rx);
6807         if (0)
6808             return NULL;
6809         else
6810             return newSVpvs("Regexp");
6811 }
6812
6813 /* Scans the name of a named buffer from the pattern.
6814  * If flags is REG_RSN_RETURN_NULL returns null.
6815  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6816  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6817  * to the parsed name as looked up in the RExC_paren_names hash.
6818  * If there is an error throws a vFAIL().. type exception.
6819  */
6820
6821 #define REG_RSN_RETURN_NULL    0
6822 #define REG_RSN_RETURN_NAME    1
6823 #define REG_RSN_RETURN_DATA    2
6824
6825 STATIC SV*
6826 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6827 {
6828     char *name_start = RExC_parse;
6829
6830     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6831
6832     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6833          /* skip IDFIRST by using do...while */
6834         if (UTF)
6835             do {
6836                 RExC_parse += UTF8SKIP(RExC_parse);
6837             } while (isWORDCHAR_utf8((U8*)RExC_parse));
6838         else
6839             do {
6840                 RExC_parse++;
6841             } while (isWORDCHAR(*RExC_parse));
6842     } else {
6843         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6844         vFAIL("Group name must start with a non-digit word character");
6845     }
6846     if ( flags ) {
6847         SV* sv_name
6848             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6849                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6850         if ( flags == REG_RSN_RETURN_NAME)
6851             return sv_name;
6852         else if (flags==REG_RSN_RETURN_DATA) {
6853             HE *he_str = NULL;
6854             SV *sv_dat = NULL;
6855             if ( ! sv_name )      /* should not happen*/
6856                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6857             if (RExC_paren_names)
6858                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6859             if ( he_str )
6860                 sv_dat = HeVAL(he_str);
6861             if ( ! sv_dat )
6862                 vFAIL("Reference to nonexistent named group");
6863             return sv_dat;
6864         }
6865         else {
6866             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6867                        (unsigned long) flags);
6868         }
6869         assert(0); /* NOT REACHED */
6870     }
6871     return NULL;
6872 }
6873
6874 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6875     int rem=(int)(RExC_end - RExC_parse);                       \
6876     int cut;                                                    \
6877     int num;                                                    \
6878     int iscut=0;                                                \
6879     if (rem>10) {                                               \
6880         rem=10;                                                 \
6881         iscut=1;                                                \
6882     }                                                           \
6883     cut=10-rem;                                                 \
6884     if (RExC_lastparse!=RExC_parse)                             \
6885         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6886             rem, RExC_parse,                                    \
6887             cut + 4,                                            \
6888             iscut ? "..." : "<"                                 \
6889         );                                                      \
6890     else                                                        \
6891         PerlIO_printf(Perl_debug_log,"%16s","");                \
6892                                                                 \
6893     if (SIZE_ONLY)                                              \
6894        num = RExC_size + 1;                                     \
6895     else                                                        \
6896        num=REG_NODE_NUM(RExC_emit);                             \
6897     if (RExC_lastnum!=num)                                      \
6898        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6899     else                                                        \
6900        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6901     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6902         (int)((depth*2)), "",                                   \
6903         (funcname)                                              \
6904     );                                                          \
6905     RExC_lastnum=num;                                           \
6906     RExC_lastparse=RExC_parse;                                  \
6907 })
6908
6909
6910
6911 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6912     DEBUG_PARSE_MSG((funcname));                            \
6913     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6914 })
6915 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6916     DEBUG_PARSE_MSG((funcname));                            \
6917     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6918 })
6919
6920 /* This section of code defines the inversion list object and its methods.  The
6921  * interfaces are highly subject to change, so as much as possible is static to
6922  * this file.  An inversion list is here implemented as a malloc'd C UV array
6923  * with some added info that is placed as UVs at the beginning in a header
6924  * portion.  An inversion list for Unicode is an array of code points, sorted
6925  * by ordinal number.  The zeroth element is the first code point in the list.
6926  * The 1th element is the first element beyond that not in the list.  In other
6927  * words, the first range is
6928  *  invlist[0]..(invlist[1]-1)
6929  * The other ranges follow.  Thus every element whose index is divisible by two
6930  * marks the beginning of a range that is in the list, and every element not
6931  * divisible by two marks the beginning of a range not in the list.  A single
6932  * element inversion list that contains the single code point N generally
6933  * consists of two elements
6934  *  invlist[0] == N
6935  *  invlist[1] == N+1
6936  * (The exception is when N is the highest representable value on the
6937  * machine, in which case the list containing just it would be a single
6938  * element, itself.  By extension, if the last range in the list extends to
6939  * infinity, then the first element of that range will be in the inversion list
6940  * at a position that is divisible by two, and is the final element in the
6941  * list.)
6942  * Taking the complement (inverting) an inversion list is quite simple, if the
6943  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6944  * This implementation reserves an element at the beginning of each inversion
6945  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
6946  * actual beginning of the list is either that element if 0, or the next one if
6947  * 1.
6948  *
6949  * More about inversion lists can be found in "Unicode Demystified"
6950  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6951  * More will be coming when functionality is added later.
6952  *
6953  * The inversion list data structure is currently implemented as an SV pointing
6954  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6955  * array of UV whose memory management is automatically handled by the existing
6956  * facilities for SV's.
6957  *
6958  * Some of the methods should always be private to the implementation, and some
6959  * should eventually be made public */
6960
6961 /* The header definitions are in F<inline_invlist.c> */
6962 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6963 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6964
6965 #define INVLIST_INITIAL_LEN 10
6966
6967 PERL_STATIC_INLINE UV*
6968 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6969 {
6970     /* Returns a pointer to the first element in the inversion list's array.
6971      * This is called upon initialization of an inversion list.  Where the
6972      * array begins depends on whether the list has the code point U+0000
6973      * in it or not.  The other parameter tells it whether the code that
6974      * follows this call is about to put a 0 in the inversion list or not.
6975      * The first element is either the element with 0, if 0, or the next one,
6976      * if 1 */
6977
6978     UV* zero = get_invlist_zero_addr(invlist);
6979
6980     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6981
6982     /* Must be empty */
6983     assert(! *_get_invlist_len_addr(invlist));
6984
6985     /* 1^1 = 0; 1^0 = 1 */
6986     *zero = 1 ^ will_have_0;
6987     return zero + *zero;
6988 }
6989
6990 PERL_STATIC_INLINE UV*
6991 S_invlist_array(pTHX_ SV* const invlist)
6992 {
6993     /* Returns the pointer to the inversion list's array.  Every time the
6994      * length changes, this needs to be called in case malloc or realloc moved
6995      * it */
6996
6997     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6998
6999     /* Must not be empty.  If these fail, you probably didn't check for <len>
7000      * being non-zero before trying to get the array */
7001     assert(*_get_invlist_len_addr(invlist));
7002     assert(*get_invlist_zero_addr(invlist) == 0
7003            || *get_invlist_zero_addr(invlist) == 1);
7004
7005     /* The array begins either at the element reserved for zero if the
7006      * list contains 0 (that element will be set to 0), or otherwise the next
7007      * element (in which case the reserved element will be set to 1). */
7008     return (UV *) (get_invlist_zero_addr(invlist)
7009                    + *get_invlist_zero_addr(invlist));
7010 }
7011
7012 PERL_STATIC_INLINE void
7013 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7014 {
7015     /* Sets the current number of elements stored in the inversion list */
7016
7017     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7018
7019     *_get_invlist_len_addr(invlist) = len;
7020
7021     assert(len <= SvLEN(invlist));
7022
7023     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7024     /* If the list contains U+0000, that element is part of the header,
7025      * and should not be counted as part of the array.  It will contain
7026      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7027      * subtract:
7028      *  SvCUR_set(invlist,
7029      *            TO_INTERNAL_SIZE(len
7030      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7031      * But, this is only valid if len is not 0.  The consequences of not doing
7032      * this is that the memory allocation code may think that 1 more UV is
7033      * being used than actually is, and so might do an unnecessary grow.  That
7034      * seems worth not bothering to make this the precise amount.
7035      *
7036      * Note that when inverting, SvCUR shouldn't change */
7037 }
7038
7039 PERL_STATIC_INLINE IV*
7040 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7041 {
7042     /* Return the address of the UV that is reserved to hold the cached index
7043      * */
7044
7045     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7046
7047     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7048 }
7049
7050 PERL_STATIC_INLINE IV
7051 S_invlist_previous_index(pTHX_ SV* const invlist)
7052 {
7053     /* Returns cached index of previous search */
7054
7055     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7056
7057     return *get_invlist_previous_index_addr(invlist);
7058 }
7059
7060 PERL_STATIC_INLINE void
7061 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7062 {
7063     /* Caches <index> for later retrieval */
7064
7065     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7066
7067     assert(index == 0 || index < (int) _invlist_len(invlist));
7068
7069     *get_invlist_previous_index_addr(invlist) = index;
7070 }
7071
7072 PERL_STATIC_INLINE UV
7073 S_invlist_max(pTHX_ SV* const invlist)
7074 {
7075     /* Returns the maximum number of elements storable in the inversion list's
7076      * array, without having to realloc() */
7077
7078     PERL_ARGS_ASSERT_INVLIST_MAX;
7079
7080     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7081            ? _invlist_len(invlist)
7082            : FROM_INTERNAL_SIZE(SvLEN(invlist));
7083 }
7084
7085 PERL_STATIC_INLINE UV*
7086 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7087 {
7088     /* Return the address of the UV that is reserved to hold 0 if the inversion
7089      * list contains 0.  This has to be the last element of the heading, as the
7090      * list proper starts with either it if 0, or the next element if not.
7091      * (But we force it to contain either 0 or 1) */
7092
7093     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7094
7095     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7096 }
7097
7098 #ifndef PERL_IN_XSUB_RE
7099 SV*
7100 Perl__new_invlist(pTHX_ IV initial_size)
7101 {
7102
7103     /* Return a pointer to a newly constructed inversion list, with enough
7104      * space to store 'initial_size' elements.  If that number is negative, a
7105      * system default is used instead */
7106
7107     SV* new_list;
7108
7109     if (initial_size < 0) {
7110         initial_size = INVLIST_INITIAL_LEN;
7111     }
7112
7113     /* Allocate the initial space */
7114     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7115     invlist_set_len(new_list, 0);
7116
7117     /* Force iterinit() to be used to get iteration to work */
7118     *get_invlist_iter_addr(new_list) = UV_MAX;
7119
7120     /* This should force a segfault if a method doesn't initialize this
7121      * properly */
7122     *get_invlist_zero_addr(new_list) = UV_MAX;
7123
7124     *get_invlist_previous_index_addr(new_list) = 0;
7125     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7126 #if HEADER_LENGTH != 5
7127 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7128 #endif
7129
7130     return new_list;
7131 }
7132 #endif
7133
7134 STATIC SV*
7135 S__new_invlist_C_array(pTHX_ UV* list)
7136 {
7137     /* Return a pointer to a newly constructed inversion list, initialized to
7138      * point to <list>, which has to be in the exact correct inversion list
7139      * form, including internal fields.  Thus this is a dangerous routine that
7140      * should not be used in the wrong hands */
7141
7142     SV* invlist = newSV_type(SVt_PV);
7143
7144     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7145
7146     SvPV_set(invlist, (char *) list);
7147     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7148                                shouldn't touch it */
7149     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7150
7151     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7152         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7153     }
7154
7155     /* Initialize the iteration pointer.
7156      * XXX This could be done at compile time in charclass_invlists.h, but I
7157      * (khw) am not confident that the suffixes for specifying the C constant
7158      * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7159      * to use 64 bits; might need a Configure probe */
7160     invlist_iterfinish(invlist);
7161
7162     return invlist;
7163 }
7164
7165 STATIC void
7166 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7167 {
7168     /* Grow the maximum size of an inversion list */
7169
7170     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7171
7172     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7173 }
7174
7175 PERL_STATIC_INLINE void
7176 S_invlist_trim(pTHX_ SV* const invlist)
7177 {
7178     PERL_ARGS_ASSERT_INVLIST_TRIM;
7179
7180     /* Change the length of the inversion list to how many entries it currently
7181      * has */
7182
7183     SvPV_shrink_to_cur((SV *) invlist);
7184 }
7185
7186 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7187
7188 STATIC void
7189 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7190 {
7191    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7192     * the end of the inversion list.  The range must be above any existing
7193     * ones. */
7194
7195     UV* array;
7196     UV max = invlist_max(invlist);
7197     UV len = _invlist_len(invlist);
7198
7199     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7200
7201     if (len == 0) { /* Empty lists must be initialized */
7202         array = _invlist_array_init(invlist, start == 0);
7203     }
7204     else {
7205         /* Here, the existing list is non-empty. The current max entry in the
7206          * list is generally the first value not in the set, except when the
7207          * set extends to the end of permissible values, in which case it is
7208          * the first entry in that final set, and so this call is an attempt to
7209          * append out-of-order */
7210
7211         UV final_element = len - 1;
7212         array = invlist_array(invlist);
7213         if (array[final_element] > start
7214             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7215         {
7216             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7217                        array[final_element], start,
7218                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7219         }
7220
7221         /* Here, it is a legal append.  If the new range begins with the first
7222          * value not in the set, it is extending the set, so the new first
7223          * value not in the set is one greater than the newly extended range.
7224          * */
7225         if (array[final_element] == start) {
7226             if (end != UV_MAX) {
7227                 array[final_element] = end + 1;
7228             }
7229             else {
7230                 /* But if the end is the maximum representable on the machine,
7231                  * just let the range that this would extend to have no end */
7232                 invlist_set_len(invlist, len - 1);
7233             }
7234             return;
7235         }
7236     }
7237
7238     /* Here the new range doesn't extend any existing set.  Add it */
7239
7240     len += 2;   /* Includes an element each for the start and end of range */
7241
7242     /* If overflows the existing space, extend, which may cause the array to be
7243      * moved */
7244     if (max < len) {
7245         invlist_extend(invlist, len);
7246         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7247                                            failure in invlist_array() */
7248         array = invlist_array(invlist);
7249     }
7250     else {
7251         invlist_set_len(invlist, len);
7252     }
7253
7254     /* The next item on the list starts the range, the one after that is
7255      * one past the new range.  */
7256     array[len - 2] = start;
7257     if (end != UV_MAX) {
7258         array[len - 1] = end + 1;
7259     }
7260     else {
7261         /* But if the end is the maximum representable on the machine, just let
7262          * the range have no end */
7263         invlist_set_len(invlist, len - 1);
7264     }
7265 }
7266
7267 #ifndef PERL_IN_XSUB_RE
7268
7269 IV
7270 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7271 {
7272     /* Searches the inversion list for the entry that contains the input code
7273      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7274      * return value is the index into the list's array of the range that
7275      * contains <cp> */
7276
7277     IV low = 0;
7278     IV mid;
7279     IV high = _invlist_len(invlist);
7280     const IV highest_element = high - 1;
7281     const UV* array;
7282
7283     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7284
7285     /* If list is empty, return failure. */
7286     if (high == 0) {
7287         return -1;
7288     }
7289
7290     /* (We can't get the array unless we know the list is non-empty) */
7291     array = invlist_array(invlist);
7292
7293     mid = invlist_previous_index(invlist);
7294     assert(mid >=0 && mid <= highest_element);
7295
7296     /* <mid> contains the cache of the result of the previous call to this
7297      * function (0 the first time).  See if this call is for the same result,
7298      * or if it is for mid-1.  This is under the theory that calls to this
7299      * function will often be for related code points that are near each other.
7300      * And benchmarks show that caching gives better results.  We also test
7301      * here if the code point is within the bounds of the list.  These tests
7302      * replace others that would have had to be made anyway to make sure that
7303      * the array bounds were not exceeded, and these give us extra information
7304      * at the same time */
7305     if (cp >= array[mid]) {
7306         if (cp >= array[highest_element]) {
7307             return highest_element;
7308         }
7309
7310         /* Here, array[mid] <= cp < array[highest_element].  This means that
7311          * the final element is not the answer, so can exclude it; it also
7312          * means that <mid> is not the final element, so can refer to 'mid + 1'
7313          * safely */
7314         if (cp < array[mid + 1]) {
7315             return mid;
7316         }
7317         high--;
7318         low = mid + 1;
7319     }
7320     else { /* cp < aray[mid] */
7321         if (cp < array[0]) { /* Fail if outside the array */
7322             return -1;
7323         }
7324         high = mid;
7325         if (cp >= array[mid - 1]) {
7326             goto found_entry;
7327         }
7328     }
7329
7330     /* Binary search.  What we are looking for is <i> such that
7331      *  array[i] <= cp < array[i+1]
7332      * The loop below converges on the i+1.  Note that there may not be an
7333      * (i+1)th element in the array, and things work nonetheless */
7334     while (low < high) {
7335         mid = (low + high) / 2;
7336         assert(mid <= highest_element);
7337         if (array[mid] <= cp) { /* cp >= array[mid] */
7338             low = mid + 1;
7339
7340             /* We could do this extra test to exit the loop early.
7341             if (cp < array[low]) {
7342                 return mid;
7343             }
7344             */
7345         }
7346         else { /* cp < array[mid] */
7347             high = mid;
7348         }
7349     }
7350
7351   found_entry:
7352     high--;
7353     invlist_set_previous_index(invlist, high);
7354     return high;
7355 }
7356
7357 void
7358 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7359 {
7360     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7361      * but is used when the swash has an inversion list.  This makes this much
7362      * faster, as it uses a binary search instead of a linear one.  This is
7363      * intimately tied to that function, and perhaps should be in utf8.c,
7364      * except it is intimately tied to inversion lists as well.  It assumes
7365      * that <swatch> is all 0's on input */
7366
7367     UV current = start;
7368     const IV len = _invlist_len(invlist);
7369     IV i;
7370     const UV * array;
7371
7372     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7373
7374     if (len == 0) { /* Empty inversion list */
7375         return;
7376     }
7377
7378     array = invlist_array(invlist);
7379
7380     /* Find which element it is */
7381     i = _invlist_search(invlist, start);
7382
7383     /* We populate from <start> to <end> */
7384     while (current < end) {
7385         UV upper;
7386
7387         /* The inversion list gives the results for every possible code point
7388          * after the first one in the list.  Only those ranges whose index is
7389          * even are ones that the inversion list matches.  For the odd ones,
7390          * and if the initial code point is not in the list, we have to skip
7391          * forward to the next element */
7392         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7393             i++;
7394             if (i >= len) { /* Finished if beyond the end of the array */
7395                 return;
7396             }
7397             current = array[i];
7398             if (current >= end) {   /* Finished if beyond the end of what we
7399                                        are populating */
7400                 if (LIKELY(end < UV_MAX)) {
7401                     return;
7402                 }
7403
7404                 /* We get here when the upper bound is the maximum
7405                  * representable on the machine, and we are looking for just
7406                  * that code point.  Have to special case it */
7407                 i = len;
7408                 goto join_end_of_list;
7409             }
7410         }
7411         assert(current >= start);
7412
7413         /* The current range ends one below the next one, except don't go past
7414          * <end> */
7415         i++;
7416         upper = (i < len && array[i] < end) ? array[i] : end;
7417
7418         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7419          * for each code point in it */
7420         for (; current < upper; current++) {
7421             const STRLEN offset = (STRLEN)(current - start);
7422             swatch[offset >> 3] |= 1 << (offset & 7);
7423         }
7424
7425     join_end_of_list:
7426
7427         /* Quit if at the end of the list */
7428         if (i >= len) {
7429
7430             /* But first, have to deal with the highest possible code point on
7431              * the platform.  The previous code assumes that <end> is one
7432              * beyond where we want to populate, but that is impossible at the
7433              * platform's infinity, so have to handle it specially */
7434             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7435             {
7436                 const STRLEN offset = (STRLEN)(end - start);
7437                 swatch[offset >> 3] |= 1 << (offset & 7);
7438             }
7439             return;
7440         }
7441
7442         /* Advance to the next range, which will be for code points not in the
7443          * inversion list */
7444         current = array[i];
7445     }
7446
7447     return;
7448 }
7449
7450 void
7451 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7452 {
7453     /* Take the union of two inversion lists and point <output> to it.  *output
7454      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7455      * the reference count to that list will be decremented.  The first list,
7456      * <a>, may be NULL, in which case a copy of the second list is returned.
7457      * If <complement_b> is TRUE, the union is taken of the complement
7458      * (inversion) of <b> instead of b itself.
7459      *
7460      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7461      * Richard Gillam, published by Addison-Wesley, and explained at some
7462      * length there.  The preface says to incorporate its examples into your
7463      * code at your own risk.
7464      *
7465      * The algorithm is like a merge sort.
7466      *
7467      * XXX A potential performance improvement is to keep track as we go along
7468      * if only one of the inputs contributes to the result, meaning the other
7469      * is a subset of that one.  In that case, we can skip the final copy and
7470      * return the larger of the input lists, but then outside code might need
7471      * to keep track of whether to free the input list or not */
7472
7473     UV* array_a;    /* a's array */
7474     UV* array_b;
7475     UV len_a;       /* length of a's array */
7476     UV len_b;
7477
7478     SV* u;                      /* the resulting union */
7479     UV* array_u;
7480     UV len_u;
7481
7482     UV i_a = 0;             /* current index into a's array */
7483     UV i_b = 0;
7484     UV i_u = 0;
7485
7486     /* running count, as explained in the algorithm source book; items are
7487      * stopped accumulating and are output when the count changes to/from 0.
7488      * The count is incremented when we start a range that's in the set, and
7489      * decremented when we start a range that's not in the set.  So its range
7490      * is 0 to 2.  Only when the count is zero is something not in the set.
7491      */
7492     UV count = 0;
7493
7494     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7495     assert(a != b);
7496
7497     /* If either one is empty, the union is the other one */
7498     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7499         if (*output == a) {
7500             if (a != NULL) {
7501                 SvREFCNT_dec_NN(a);
7502             }
7503         }
7504         if (*output != b) {
7505             *output = invlist_clone(b);
7506             if (complement_b) {
7507                 _invlist_invert(*output);
7508             }
7509         } /* else *output already = b; */
7510         return;
7511     }
7512     else if ((len_b = _invlist_len(b)) == 0) {
7513         if (*output == b) {
7514             SvREFCNT_dec_NN(b);
7515         }
7516
7517         /* The complement of an empty list is a list that has everything in it,
7518          * so the union with <a> includes everything too */
7519         if (complement_b) {
7520             if (a == *output) {
7521                 SvREFCNT_dec_NN(a);
7522             }
7523             *output = _new_invlist(1);
7524             _append_range_to_invlist(*output, 0, UV_MAX);
7525         }
7526         else if (*output != a) {
7527             *output = invlist_clone(a);
7528         }
7529         /* else *output already = a; */
7530         return;
7531     }
7532
7533     /* Here both lists exist and are non-empty */
7534     array_a = invlist_array(a);
7535     array_b = invlist_array(b);
7536
7537     /* If are to take the union of 'a' with the complement of b, set it
7538      * up so are looking at b's complement. */
7539     if (complement_b) {
7540
7541         /* To complement, we invert: if the first element is 0, remove it.  To
7542          * do this, we just pretend the array starts one later, and clear the
7543          * flag as we don't have to do anything else later */
7544         if (array_b[0] == 0) {
7545             array_b++;
7546             len_b--;
7547             complement_b = FALSE;
7548         }
7549         else {
7550
7551             /* But if the first element is not zero, we unshift a 0 before the
7552              * array.  The data structure reserves a space for that 0 (which
7553              * should be a '1' right now), so physical shifting is unneeded,
7554              * but temporarily change that element to 0.  Before exiting the
7555              * routine, we must restore the element to '1' */
7556             array_b--;
7557             len_b++;
7558             array_b[0] = 0;
7559         }
7560     }
7561
7562     /* Size the union for the worst case: that the sets are completely
7563      * disjoint */
7564     u = _new_invlist(len_a + len_b);
7565
7566     /* Will contain U+0000 if either component does */
7567     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7568                                       || (len_b > 0 && array_b[0] == 0));
7569
7570     /* Go through each list item by item, stopping when exhausted one of
7571      * them */
7572     while (i_a < len_a && i_b < len_b) {
7573         UV cp;      /* The element to potentially add to the union's array */
7574         bool cp_in_set;   /* is it in the the input list's set or not */
7575
7576         /* We need to take one or the other of the two inputs for the union.
7577          * Since we are merging two sorted lists, we take the smaller of the
7578          * next items.  In case of a tie, we take the one that is in its set
7579          * first.  If we took one not in the set first, it would decrement the
7580          * count, possibly to 0 which would cause it to be output as ending the
7581          * range, and the next time through we would take the same number, and
7582          * output it again as beginning the next range.  By doing it the
7583          * opposite way, there is no possibility that the count will be
7584          * momentarily decremented to 0, and thus the two adjoining ranges will
7585          * be seamlessly merged.  (In a tie and both are in the set or both not
7586          * in the set, it doesn't matter which we take first.) */
7587         if (array_a[i_a] < array_b[i_b]
7588             || (array_a[i_a] == array_b[i_b]
7589                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7590         {
7591             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7592             cp= array_a[i_a++];
7593         }
7594         else {
7595             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7596             cp= array_b[i_b++];
7597         }
7598
7599         /* Here, have chosen which of the two inputs to look at.  Only output
7600          * if the running count changes to/from 0, which marks the
7601          * beginning/end of a range in that's in the set */
7602         if (cp_in_set) {
7603             if (count == 0) {
7604                 array_u[i_u++] = cp;
7605             }
7606             count++;
7607         }
7608         else {
7609             count--;
7610             if (count == 0) {
7611                 array_u[i_u++] = cp;
7612             }
7613         }
7614     }
7615
7616     /* Here, we are finished going through at least one of the lists, which
7617      * means there is something remaining in at most one.  We check if the list
7618      * that hasn't been exhausted is positioned such that we are in the middle
7619      * of a range in its set or not.  (i_a and i_b point to the element beyond
7620      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7621      * is potentially more to output.
7622      * There are four cases:
7623      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7624      *     in the union is entirely from the non-exhausted set.
7625      *  2) Both were in their sets, count is 2.  Nothing further should
7626      *     be output, as everything that remains will be in the exhausted
7627      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7628      *     that
7629      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7630      *     Nothing further should be output because the union includes
7631      *     everything from the exhausted set.  Not decrementing ensures that.
7632      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7633      *     decrementing to 0 insures that we look at the remainder of the
7634      *     non-exhausted set */
7635     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7636         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7637     {
7638         count--;
7639     }
7640
7641     /* The final length is what we've output so far, plus what else is about to
7642      * be output.  (If 'count' is non-zero, then the input list we exhausted
7643      * has everything remaining up to the machine's limit in its set, and hence
7644      * in the union, so there will be no further output. */
7645     len_u = i_u;
7646     if (count == 0) {
7647         /* At most one of the subexpressions will be non-zero */
7648         len_u += (len_a - i_a) + (len_b - i_b);
7649     }
7650
7651     /* Set result to final length, which can change the pointer to array_u, so
7652      * re-find it */
7653     if (len_u != _invlist_len(u)) {
7654         invlist_set_len(u, len_u);
7655         invlist_trim(u);
7656         array_u = invlist_array(u);
7657     }
7658
7659     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7660      * the other) ended with everything above it not in its set.  That means
7661      * that the remaining part of the union is precisely the same as the
7662      * non-exhausted list, so can just copy it unchanged.  (If both list were
7663      * exhausted at the same time, then the operations below will be both 0.)
7664      */
7665     if (count == 0) {
7666         IV copy_count; /* At most one will have a non-zero copy count */
7667         if ((copy_count = len_a - i_a) > 0) {
7668             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7669         }
7670         else if ((copy_count = len_b - i_b) > 0) {
7671             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7672         }
7673     }
7674
7675     /* If we've changed b, restore it */
7676     if (complement_b) {
7677         array_b[0] = 1;
7678     }
7679
7680     /*  We may be removing a reference to one of the inputs */
7681     if (a == *output || b == *output) {
7682         assert(! invlist_is_iterating(*output));
7683         SvREFCNT_dec_NN(*output);
7684     }
7685
7686     *output = u;
7687     return;
7688 }
7689
7690 void
7691 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7692 {
7693     /* Take the intersection of two inversion lists and point <i> to it.  *i
7694      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7695      * the reference count to that list will be decremented.
7696      * If <complement_b> is TRUE, the result will be the intersection of <a>
7697      * and the complement (or inversion) of <b> instead of <b> directly.
7698      *
7699      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7700      * Richard Gillam, published by Addison-Wesley, and explained at some
7701      * length there.  The preface says to incorporate its examples into your
7702      * code at your own risk.  In fact, it had bugs
7703      *
7704      * The algorithm is like a merge sort, and is essentially the same as the
7705      * union above
7706      */
7707
7708     UV* array_a;                /* a's array */
7709     UV* array_b;
7710     UV len_a;   /* length of a's array */
7711     UV len_b;
7712
7713     SV* r;                   /* the resulting intersection */
7714     UV* array_r;
7715     UV len_r;
7716
7717     UV i_a = 0;             /* current index into a's array */
7718     UV i_b = 0;
7719     UV i_r = 0;
7720
7721     /* running count, as explained in the algorithm source book; items are
7722      * stopped accumulating and are output when the count changes to/from 2.
7723      * The count is incremented when we start a range that's in the set, and
7724      * decremented when we start a range that's not in the set.  So its range
7725      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7726      */
7727     UV count = 0;
7728
7729     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7730     assert(a != b);
7731
7732     /* Special case if either one is empty */
7733     len_a = _invlist_len(a);
7734     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7735
7736         if (len_a != 0 && complement_b) {
7737
7738             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7739              * be empty.  Here, also we are using 'b's complement, which hence
7740              * must be every possible code point.  Thus the intersection is
7741              * simply 'a'. */
7742             if (*i != a) {
7743                 *i = invlist_clone(a);
7744
7745                 if (*i == b) {
7746                     SvREFCNT_dec_NN(b);
7747                 }
7748             }
7749             /* else *i is already 'a' */
7750             return;
7751         }
7752
7753         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7754          * intersection must be empty */
7755         if (*i == a) {
7756             SvREFCNT_dec_NN(a);
7757         }
7758         else if (*i == b) {
7759             SvREFCNT_dec_NN(b);
7760         }
7761         *i = _new_invlist(0);
7762         return;
7763     }
7764
7765     /* Here both lists exist and are non-empty */
7766     array_a = invlist_array(a);
7767     array_b = invlist_array(b);
7768
7769     /* If are to take the intersection of 'a' with the complement of b, set it
7770      * up so are looking at b's complement. */
7771     if (complement_b) {
7772
7773         /* To complement, we invert: if the first element is 0, remove it.  To
7774          * do this, we just pretend the array starts one later, and clear the
7775          * flag as we don't have to do anything else later */
7776         if (array_b[0] == 0) {
7777             array_b++;
7778             len_b--;
7779             complement_b = FALSE;
7780         }
7781         else {
7782
7783             /* But if the first element is not zero, we unshift a 0 before the
7784              * array.  The data structure reserves a space for that 0 (which
7785              * should be a '1' right now), so physical shifting is unneeded,
7786              * but temporarily change that element to 0.  Before exiting the
7787              * routine, we must restore the element to '1' */
7788             array_b--;
7789             len_b++;
7790             array_b[0] = 0;
7791         }
7792     }
7793
7794     /* Size the intersection for the worst case: that the intersection ends up
7795      * fragmenting everything to be completely disjoint */
7796     r= _new_invlist(len_a + len_b);
7797
7798     /* Will contain U+0000 iff both components do */
7799     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7800                                      && len_b > 0 && array_b[0] == 0);
7801
7802     /* Go through each list item by item, stopping when exhausted one of
7803      * them */
7804     while (i_a < len_a && i_b < len_b) {
7805         UV cp;      /* The element to potentially add to the intersection's
7806                        array */
7807         bool cp_in_set; /* Is it in the input list's set or not */
7808
7809         /* We need to take one or the other of the two inputs for the
7810          * intersection.  Since we are merging two sorted lists, we take the
7811          * smaller of the next items.  In case of a tie, we take the one that
7812          * is not in its set first (a difference from the union algorithm).  If
7813          * we took one in the set first, it would increment the count, possibly
7814          * to 2 which would cause it to be output as starting a range in the
7815          * intersection, and the next time through we would take that same
7816          * number, and output it again as ending the set.  By doing it the
7817          * opposite of this, there is no possibility that the count will be
7818          * momentarily incremented to 2.  (In a tie and both are in the set or
7819          * both not in the set, it doesn't matter which we take first.) */
7820         if (array_a[i_a] < array_b[i_b]
7821             || (array_a[i_a] == array_b[i_b]
7822                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7823         {
7824             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7825             cp= array_a[i_a++];
7826         }
7827         else {
7828             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7829             cp= array_b[i_b++];
7830         }
7831
7832         /* Here, have chosen which of the two inputs to look at.  Only output
7833          * if the running count changes to/from 2, which marks the
7834          * beginning/end of a range that's in the intersection */
7835         if (cp_in_set) {
7836             count++;
7837             if (count == 2) {
7838                 array_r[i_r++] = cp;
7839             }
7840         }
7841         else {
7842             if (count == 2) {
7843                 array_r[i_r++] = cp;
7844             }
7845             count--;
7846         }
7847     }
7848
7849     /* Here, we are finished going through at least one of the lists, which
7850      * means there is something remaining in at most one.  We check if the list
7851      * that has been exhausted is positioned such that we are in the middle
7852      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7853      * the ones we care about.)  There are four cases:
7854      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7855      *     nothing left in the intersection.
7856      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7857      *     above 2.  What should be output is exactly that which is in the
7858      *     non-exhausted set, as everything it has is also in the intersection
7859      *     set, and everything it doesn't have can't be in the intersection
7860      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7861      *     gets incremented to 2.  Like the previous case, the intersection is
7862      *     everything that remains in the non-exhausted set.
7863      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7864      *     remains 1.  And the intersection has nothing more. */
7865     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7866         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7867     {
7868         count++;
7869     }
7870
7871     /* The final length is what we've output so far plus what else is in the
7872      * intersection.  At most one of the subexpressions below will be non-zero */
7873     len_r = i_r;
7874     if (count >= 2) {
7875         len_r += (len_a - i_a) + (len_b - i_b);
7876     }
7877
7878     /* Set result to final length, which can change the pointer to array_r, so
7879      * re-find it */
7880     if (len_r != _invlist_len(r)) {
7881         invlist_set_len(r, len_r);
7882         invlist_trim(r);
7883         array_r = invlist_array(r);
7884     }
7885
7886     /* Finish outputting any remaining */
7887     if (count >= 2) { /* At most one will have a non-zero copy count */
7888         IV copy_count;
7889         if ((copy_count = len_a - i_a) > 0) {
7890             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7891         }
7892         else if ((copy_count = len_b - i_b) > 0) {
7893             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7894         }
7895     }
7896
7897     /* If we've changed b, restore it */
7898     if (complement_b) {
7899         array_b[0] = 1;
7900     }
7901
7902     /*  We may be removing a reference to one of the inputs */
7903     if (a == *i || b == *i) {
7904         assert(! invlist_is_iterating(*i));
7905         SvREFCNT_dec_NN(*i);
7906     }
7907
7908     *i = r;
7909     return;
7910 }
7911
7912 SV*
7913 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7914 {
7915     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7916      * set.  A pointer to the inversion list is returned.  This may actually be
7917      * a new list, in which case the passed in one has been destroyed.  The
7918      * passed in inversion list can be NULL, in which case a new one is created
7919      * with just the one range in it */
7920
7921     SV* range_invlist;
7922     UV len;
7923
7924     if (invlist == NULL) {
7925         invlist = _new_invlist(2);
7926         len = 0;
7927     }
7928     else {
7929         len = _invlist_len(invlist);
7930     }
7931
7932     /* If comes after the final entry actually in the list, can just append it
7933      * to the end, */
7934     if (len == 0
7935         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
7936             && start >= invlist_array(invlist)[len - 1]))
7937     {
7938         _append_range_to_invlist(invlist, start, end);
7939         return invlist;
7940     }
7941
7942     /* Here, can't just append things, create and return a new inversion list
7943      * which is the union of this range and the existing inversion list */
7944     range_invlist = _new_invlist(2);
7945     _append_range_to_invlist(range_invlist, start, end);
7946
7947     _invlist_union(invlist, range_invlist, &invlist);
7948
7949     /* The temporary can be freed */
7950     SvREFCNT_dec_NN(range_invlist);
7951
7952     return invlist;
7953 }
7954
7955 #endif
7956
7957 PERL_STATIC_INLINE SV*
7958 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7959     return _add_range_to_invlist(invlist, cp, cp);
7960 }
7961
7962 #ifndef PERL_IN_XSUB_RE
7963 void
7964 Perl__invlist_invert(pTHX_ SV* const invlist)
7965 {
7966     /* Complement the input inversion list.  This adds a 0 if the list didn't
7967      * have a zero; removes it otherwise.  As described above, the data
7968      * structure is set up so that this is very efficient */
7969
7970     UV* len_pos = _get_invlist_len_addr(invlist);
7971
7972     PERL_ARGS_ASSERT__INVLIST_INVERT;
7973
7974     assert(! invlist_is_iterating(invlist));
7975
7976     /* The inverse of matching nothing is matching everything */
7977     if (*len_pos == 0) {
7978         _append_range_to_invlist(invlist, 0, UV_MAX);
7979         return;
7980     }
7981
7982     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7983      * zero element was a 0, so it is being removed, so the length decrements
7984      * by 1; and vice-versa.  SvCUR is unaffected */
7985     if (*get_invlist_zero_addr(invlist) ^= 1) {
7986         (*len_pos)--;
7987     }
7988     else {
7989         (*len_pos)++;
7990     }
7991 }
7992
7993 void
7994 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7995 {
7996     /* Complement the input inversion list (which must be a Unicode property,
7997      * all of which don't match above the Unicode maximum code point.)  And
7998      * Perl has chosen to not have the inversion match above that either.  This
7999      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8000      */
8001
8002     UV len;
8003     UV* array;
8004
8005     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8006
8007     _invlist_invert(invlist);
8008
8009     len = _invlist_len(invlist);
8010
8011     if (len != 0) { /* If empty do nothing */
8012         array = invlist_array(invlist);
8013         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8014             /* Add 0x110000.  First, grow if necessary */
8015             len++;
8016             if (invlist_max(invlist) < len) {
8017                 invlist_extend(invlist, len);
8018                 array = invlist_array(invlist);
8019             }
8020             invlist_set_len(invlist, len);
8021             array[len - 1] = PERL_UNICODE_MAX + 1;
8022         }
8023         else {  /* Remove the 0x110000 */
8024             invlist_set_len(invlist, len - 1);
8025         }
8026     }
8027
8028     return;
8029 }
8030 #endif
8031
8032 PERL_STATIC_INLINE SV*
8033 S_invlist_clone(pTHX_ SV* const invlist)
8034 {
8035
8036     /* Return a new inversion list that is a copy of the input one, which is
8037      * unchanged */
8038
8039     /* Need to allocate extra space to accommodate Perl's addition of a
8040      * trailing NUL to SvPV's, since it thinks they are always strings */
8041     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8042     STRLEN length = SvCUR(invlist);
8043
8044     PERL_ARGS_ASSERT_INVLIST_CLONE;
8045
8046     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8047     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8048
8049     return new_invlist;
8050 }
8051
8052 PERL_STATIC_INLINE UV*
8053 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8054 {
8055     /* Return the address of the UV that contains the current iteration
8056      * position */
8057
8058     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8059
8060     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8061 }
8062
8063 PERL_STATIC_INLINE UV*
8064 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8065 {
8066     /* Return the address of the UV that contains the version id. */
8067
8068     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8069
8070     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8071 }
8072
8073 PERL_STATIC_INLINE void
8074 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8075 {
8076     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8077
8078     *get_invlist_iter_addr(invlist) = 0;
8079 }
8080
8081 PERL_STATIC_INLINE void
8082 S_invlist_iterfinish(pTHX_ SV* invlist)
8083 {
8084     /* Terminate iterator for invlist.  This is to catch development errors.
8085      * Any iteration that is interrupted before completed should call this
8086      * function.  Functions that add code points anywhere else but to the end
8087      * of an inversion list assert that they are not in the middle of an
8088      * iteration.  If they were, the addition would make the iteration
8089      * problematical: if the iteration hadn't reached the place where things
8090      * were being added, it would be ok */
8091
8092     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8093
8094     *get_invlist_iter_addr(invlist) = UV_MAX;
8095 }
8096
8097 STATIC bool
8098 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8099 {
8100     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8101      * This call sets in <*start> and <*end>, the next range in <invlist>.
8102      * Returns <TRUE> if successful and the next call will return the next
8103      * range; <FALSE> if was already at the end of the list.  If the latter,
8104      * <*start> and <*end> are unchanged, and the next call to this function
8105      * will start over at the beginning of the list */
8106
8107     UV* pos = get_invlist_iter_addr(invlist);
8108     UV len = _invlist_len(invlist);
8109     UV *array;
8110
8111     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8112
8113     if (*pos >= len) {
8114         *pos = UV_MAX;  /* Force iterinit() to be required next time */
8115         return FALSE;
8116     }
8117
8118     array = invlist_array(invlist);
8119
8120     *start = array[(*pos)++];
8121
8122     if (*pos >= len) {
8123         *end = UV_MAX;
8124     }
8125     else {
8126         *end = array[(*pos)++] - 1;
8127     }
8128
8129     return TRUE;
8130 }
8131
8132 PERL_STATIC_INLINE bool
8133 S_invlist_is_iterating(pTHX_ SV* const invlist)
8134 {
8135     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8136
8137     return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8138 }
8139
8140 PERL_STATIC_INLINE UV
8141 S_invlist_highest(pTHX_ SV* const invlist)
8142 {
8143     /* Returns the highest code point that matches an inversion list.  This API
8144      * has an ambiguity, as it returns 0 under either the highest is actually
8145      * 0, or if the list is empty.  If this distinction matters to you, check
8146      * for emptiness before calling this function */
8147
8148     UV len = _invlist_len(invlist);
8149     UV *array;
8150
8151     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8152
8153     if (len == 0) {
8154         return 0;
8155     }
8156
8157     array = invlist_array(invlist);
8158
8159     /* The last element in the array in the inversion list always starts a
8160      * range that goes to infinity.  That range may be for code points that are
8161      * matched in the inversion list, or it may be for ones that aren't
8162      * matched.  In the latter case, the highest code point in the set is one
8163      * less than the beginning of this range; otherwise it is the final element
8164      * of this range: infinity */
8165     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8166            ? UV_MAX
8167            : array[len - 1] - 1;
8168 }
8169
8170 #ifndef PERL_IN_XSUB_RE
8171 SV *
8172 Perl__invlist_contents(pTHX_ SV* const invlist)
8173 {
8174     /* Get the contents of an inversion list into a string SV so that they can
8175      * be printed out.  It uses the format traditionally done for debug tracing
8176      */
8177
8178     UV start, end;
8179     SV* output = newSVpvs("\n");
8180
8181     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8182
8183     assert(! invlist_is_iterating(invlist));
8184
8185     invlist_iterinit(invlist);
8186     while (invlist_iternext(invlist, &start, &end)) {
8187         if (end == UV_MAX) {
8188             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8189         }
8190         else if (end != start) {
8191             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8192                     start,       end);
8193         }
8194         else {
8195             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8196         }
8197     }
8198
8199     return output;
8200 }
8201 #endif
8202
8203 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8204 void
8205 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8206 {
8207     /* Dumps out the ranges in an inversion list.  The string 'header'
8208      * if present is output on a line before the first range */
8209
8210     UV start, end;
8211
8212     PERL_ARGS_ASSERT__INVLIST_DUMP;
8213
8214     if (header && strlen(header)) {
8215         PerlIO_printf(Perl_debug_log, "%s\n", header);
8216     }
8217     if (invlist_is_iterating(invlist)) {
8218         PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8219         return;
8220     }
8221
8222     invlist_iterinit(invlist);
8223     while (invlist_iternext(invlist, &start, &end)) {
8224         if (end == UV_MAX) {
8225             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8226         }
8227         else if (end != start) {
8228             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8229                                                  start,         end);
8230         }
8231         else {
8232             PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8233         }
8234     }
8235 }
8236 #endif
8237
8238 #if 0
8239 bool
8240 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8241 {
8242     /* Return a boolean as to if the two passed in inversion lists are
8243      * identical.  The final argument, if TRUE, says to take the complement of
8244      * the second inversion list before doing the comparison */
8245
8246     UV* array_a = invlist_array(a);
8247     UV* array_b = invlist_array(b);
8248     UV len_a = _invlist_len(a);
8249     UV len_b = _invlist_len(b);
8250
8251     UV i = 0;               /* current index into the arrays */
8252     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8253
8254     PERL_ARGS_ASSERT__INVLISTEQ;
8255
8256     /* If are to compare 'a' with the complement of b, set it
8257      * up so are looking at b's complement. */
8258     if (complement_b) {
8259
8260         /* The complement of nothing is everything, so <a> would have to have
8261          * just one element, starting at zero (ending at infinity) */
8262         if (len_b == 0) {
8263             return (len_a == 1 && array_a[0] == 0);
8264         }
8265         else if (array_b[0] == 0) {
8266
8267             /* Otherwise, to complement, we invert.  Here, the first element is
8268              * 0, just remove it.  To do this, we just pretend the array starts
8269              * one later, and clear the flag as we don't have to do anything
8270              * else later */
8271
8272             array_b++;
8273             len_b--;
8274             complement_b = FALSE;
8275         }
8276         else {
8277
8278             /* But if the first element is not zero, we unshift a 0 before the
8279              * array.  The data structure reserves a space for that 0 (which
8280              * should be a '1' right now), so physical shifting is unneeded,
8281              * but temporarily change that element to 0.  Before exiting the
8282              * routine, we must restore the element to '1' */
8283             array_b--;
8284             len_b++;
8285             array_b[0] = 0;
8286         }
8287     }
8288
8289     /* Make sure that the lengths are the same, as well as the final element
8290      * before looping through the remainder.  (Thus we test the length, final,
8291      * and first elements right off the bat) */
8292     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8293         retval = FALSE;
8294     }
8295     else for (i = 0; i < len_a - 1; i++) {
8296         if (array_a[i] != array_b[i]) {
8297             retval = FALSE;
8298             break;
8299         }
8300     }
8301
8302     if (complement_b) {
8303         array_b[0] = 1;
8304     }
8305     return retval;
8306 }
8307 #endif
8308
8309 #undef HEADER_LENGTH
8310 #undef INVLIST_INITIAL_LENGTH
8311 #undef TO_INTERNAL_SIZE
8312 #undef FROM_INTERNAL_SIZE
8313 #undef INVLIST_LEN_OFFSET
8314 #undef INVLIST_ZERO_OFFSET
8315 #undef INVLIST_ITER_OFFSET
8316 #undef INVLIST_VERSION_ID
8317 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8318
8319 /* End of inversion list object */
8320
8321 /*
8322  - reg - regular expression, i.e. main body or parenthesized thing
8323  *
8324  * Caller must absorb opening parenthesis.
8325  *
8326  * Combining parenthesis handling with the base level of regular expression
8327  * is a trifle forced, but the need to tie the tails of the branches to what
8328  * follows makes it hard to avoid.
8329  */
8330 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8331 #ifdef DEBUGGING
8332 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8333 #else
8334 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8335 #endif
8336
8337 STATIC regnode *
8338 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8339     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8340 {
8341     dVAR;
8342     regnode *ret;               /* Will be the head of the group. */
8343     regnode *br;
8344     regnode *lastbr;
8345     regnode *ender = NULL;
8346     I32 parno = 0;
8347     I32 flags;
8348     U32 oregflags = RExC_flags;
8349     bool have_branch = 0;
8350     bool is_open = 0;
8351     I32 freeze_paren = 0;
8352     I32 after_freeze = 0;
8353
8354     /* for (?g), (?gc), and (?o) warnings; warning
8355        about (?c) will warn about (?g) -- japhy    */
8356
8357 #define WASTED_O  0x01
8358 #define WASTED_G  0x02
8359 #define WASTED_C  0x04
8360 #define WASTED_GC (0x02|0x04)
8361     I32 wastedflags = 0x00;
8362
8363     char * parse_start = RExC_parse; /* MJD */
8364     char * const oregcomp_parse = RExC_parse;
8365
8366     GET_RE_DEBUG_FLAGS_DECL;
8367
8368     PERL_ARGS_ASSERT_REG;
8369     DEBUG_PARSE("reg ");
8370
8371     *flagp = 0;                         /* Tentatively. */
8372
8373
8374     /* Make an OPEN node, if parenthesized. */
8375     if (paren) {
8376         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8377             char *start_verb = RExC_parse;
8378             STRLEN verb_len = 0;
8379             char *start_arg = NULL;
8380             unsigned char op = 0;
8381             int argok = 1;
8382             int internal_argval = 0; /* internal_argval is only useful if !argok */
8383             while ( *RExC_parse && *RExC_parse != ')' ) {
8384                 if ( *RExC_parse == ':' ) {
8385                     start_arg = RExC_parse + 1;
8386                     break;
8387                 }
8388                 RExC_parse++;
8389             }
8390             ++start_verb;
8391             verb_len = RExC_parse - start_verb;
8392             if ( start_arg ) {
8393                 RExC_parse++;
8394                 while ( *RExC_parse && *RExC_parse != ')' )
8395                     RExC_parse++;
8396                 if ( *RExC_parse != ')' )
8397                     vFAIL("Unterminated verb pattern argument");
8398                 if ( RExC_parse == start_arg )
8399                     start_arg = NULL;
8400             } else {
8401                 if ( *RExC_parse != ')' )
8402                     vFAIL("Unterminated verb pattern");
8403             }
8404
8405             switch ( *start_verb ) {
8406             case 'A':  /* (*ACCEPT) */
8407                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8408                     op = ACCEPT;
8409                     internal_argval = RExC_nestroot;
8410                 }
8411                 break;
8412             case 'C':  /* (*COMMIT) */
8413                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8414                     op = COMMIT;
8415                 break;
8416             case 'F':  /* (*FAIL) */
8417                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8418                     op = OPFAIL;
8419                     argok = 0;
8420                 }
8421                 break;
8422             case ':':  /* (*:NAME) */
8423             case 'M':  /* (*MARK:NAME) */
8424                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8425                     op = MARKPOINT;
8426                     argok = -1;
8427                 }
8428                 break;
8429             case 'P':  /* (*PRUNE) */
8430                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8431                     op = PRUNE;
8432                 break;
8433             case 'S':   /* (*SKIP) */
8434                 if ( memEQs(start_verb,verb_len,"SKIP") )
8435                     op = SKIP;
8436                 break;
8437             case 'T':  /* (*THEN) */
8438                 /* [19:06] <TimToady> :: is then */
8439                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8440                     op = CUTGROUP;
8441                     RExC_seen |= REG_SEEN_CUTGROUP;
8442                 }
8443                 break;
8444             }
8445             if ( ! op ) {
8446                 RExC_parse++;
8447                 vFAIL3("Unknown verb pattern '%.*s'",
8448                     verb_len, start_verb);
8449             }
8450             if ( argok ) {
8451                 if ( start_arg && internal_argval ) {
8452                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8453                         verb_len, start_verb);
8454                 } else if ( argok < 0 && !start_arg ) {
8455                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8456                         verb_len, start_verb);
8457                 } else {
8458                     ret = reganode(pRExC_state, op, internal_argval);
8459                     if ( ! internal_argval && ! SIZE_ONLY ) {
8460                         if (start_arg) {
8461                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8462                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8463                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8464                             ret->flags = 0;
8465                         } else {
8466                             ret->flags = 1;
8467                         }
8468                     }
8469                 }
8470                 if (!internal_argval)
8471                     RExC_seen |= REG_SEEN_VERBARG;
8472             } else if ( start_arg ) {
8473                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8474                         verb_len, start_verb);
8475             } else {
8476                 ret = reg_node(pRExC_state, op);
8477             }
8478             nextchar(pRExC_state);
8479             return ret;
8480         } else
8481         if (*RExC_parse == '?') { /* (?...) */
8482             bool is_logical = 0;
8483             const char * const seqstart = RExC_parse;
8484             bool has_use_defaults = FALSE;
8485
8486             RExC_parse++;
8487             paren = *RExC_parse++;
8488             ret = NULL;                 /* For look-ahead/behind. */
8489             switch (paren) {
8490
8491             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8492                 paren = *RExC_parse++;
8493                 if ( paren == '<')         /* (?P<...>) named capture */
8494                     goto named_capture;
8495                 else if (paren == '>') {   /* (?P>name) named recursion */
8496                     goto named_recursion;
8497                 }
8498                 else if (paren == '=') {   /* (?P=...)  named backref */
8499                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8500                        you change this make sure you change that */
8501                     char* name_start = RExC_parse;
8502                     U32 num = 0;
8503                     SV *sv_dat = reg_scan_name(pRExC_state,
8504                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8505                     if (RExC_parse == name_start || *RExC_parse != ')')
8506                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8507
8508                     if (!SIZE_ONLY) {
8509                         num = add_data( pRExC_state, 1, "S" );
8510                         RExC_rxi->data->data[num]=(void*)sv_dat;
8511                         SvREFCNT_inc_simple_void(sv_dat);
8512                     }
8513                     RExC_sawback = 1;
8514                     ret = reganode(pRExC_state,
8515                                    ((! FOLD)
8516                                      ? NREF
8517                                      : (ASCII_FOLD_RESTRICTED)
8518                                        ? NREFFA
8519                                        : (AT_LEAST_UNI_SEMANTICS)
8520                                          ? NREFFU
8521                                          : (LOC)
8522                                            ? NREFFL
8523                                            : NREFF),
8524                                     num);
8525                     *flagp |= HASWIDTH;
8526
8527                     Set_Node_Offset(ret, parse_start+1);
8528                     Set_Node_Cur_Length(ret); /* MJD */
8529
8530                     nextchar(pRExC_state);
8531                     return ret;
8532                 }
8533                 RExC_parse++;
8534                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8535                 /*NOTREACHED*/
8536             case '<':           /* (?<...) */
8537                 if (*RExC_parse == '!')
8538                     paren = ',';
8539                 else if (*RExC_parse != '=')
8540               named_capture:
8541                 {               /* (?<...>) */
8542                     char *name_start;
8543                     SV *svname;
8544                     paren= '>';
8545             case '\'':          /* (?'...') */
8546                     name_start= RExC_parse;
8547                     svname = reg_scan_name(pRExC_state,
8548                         SIZE_ONLY ?  /* reverse test from the others */
8549                         REG_RSN_RETURN_NAME :
8550                         REG_RSN_RETURN_NULL);
8551                     if (RExC_parse == name_start) {
8552                         RExC_parse++;
8553                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8554                         /*NOTREACHED*/
8555                     }
8556                     if (*RExC_parse != paren)
8557                         vFAIL2("Sequence (?%c... not terminated",
8558                             paren=='>' ? '<' : paren);
8559                     if (SIZE_ONLY) {
8560                         HE *he_str;
8561                         SV *sv_dat = NULL;
8562                         if (!svname) /* shouldn't happen */
8563                             Perl_croak(aTHX_
8564                                 "panic: reg_scan_name returned NULL");
8565                         if (!RExC_paren_names) {
8566                             RExC_paren_names= newHV();
8567                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8568 #ifdef DEBUGGING
8569                             RExC_paren_name_list= newAV();
8570                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8571 #endif
8572                         }
8573                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8574                         if ( he_str )
8575                             sv_dat = HeVAL(he_str);
8576                         if ( ! sv_dat ) {
8577                             /* croak baby croak */
8578                             Perl_croak(aTHX_
8579                                 "panic: paren_name hash element allocation failed");
8580                         } else if ( SvPOK(sv_dat) ) {
8581                             /* (?|...) can mean we have dupes so scan to check
8582                                its already been stored. Maybe a flag indicating
8583                                we are inside such a construct would be useful,
8584                                but the arrays are likely to be quite small, so
8585                                for now we punt -- dmq */
8586                             IV count = SvIV(sv_dat);
8587                             I32 *pv = (I32*)SvPVX(sv_dat);
8588                             IV i;
8589                             for ( i = 0 ; i < count ; i++ ) {
8590                                 if ( pv[i] == RExC_npar ) {
8591                                     count = 0;
8592                                     break;
8593                                 }
8594                             }
8595                             if ( count ) {
8596                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8597                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8598                                 pv[count] = RExC_npar;
8599                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8600                             }
8601                         } else {
8602                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8603                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8604                             SvIOK_on(sv_dat);
8605                             SvIV_set(sv_dat, 1);
8606                         }
8607 #ifdef DEBUGGING
8608                         /* Yes this does cause a memory leak in debugging Perls */
8609                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8610                             SvREFCNT_dec_NN(svname);
8611 #endif
8612
8613                         /*sv_dump(sv_dat);*/
8614                     }
8615                     nextchar(pRExC_state);
8616                     paren = 1;
8617                     goto capturing_parens;
8618                 }
8619                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8620                 RExC_in_lookbehind++;
8621                 RExC_parse++;
8622             case '=':           /* (?=...) */
8623                 RExC_seen_zerolen++;
8624                 break;
8625             case '!':           /* (?!...) */
8626                 RExC_seen_zerolen++;
8627                 if (*RExC_parse == ')') {
8628                     ret=reg_node(pRExC_state, OPFAIL);
8629                     nextchar(pRExC_state);
8630                     return ret;
8631                 }
8632                 break;
8633             case '|':           /* (?|...) */
8634                 /* branch reset, behave like a (?:...) except that
8635                    buffers in alternations share the same numbers */
8636                 paren = ':';
8637                 after_freeze = freeze_paren = RExC_npar;
8638                 break;
8639             case ':':           /* (?:...) */
8640             case '>':           /* (?>...) */
8641                 break;
8642             case '$':           /* (?$...) */
8643             case '@':           /* (?@...) */
8644                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8645                 break;
8646             case '#':           /* (?#...) */
8647                 while (*RExC_parse && *RExC_parse != ')')
8648                     RExC_parse++;
8649                 if (*RExC_parse != ')')
8650                     FAIL("Sequence (?#... not terminated");
8651                 nextchar(pRExC_state);
8652                 *flagp = TRYAGAIN;
8653                 return NULL;
8654             case '0' :           /* (?0) */
8655             case 'R' :           /* (?R) */
8656                 if (*RExC_parse != ')')
8657                     FAIL("Sequence (?R) not terminated");
8658                 ret = reg_node(pRExC_state, GOSTART);
8659                 *flagp |= POSTPONED;
8660                 nextchar(pRExC_state);
8661                 return ret;
8662                 /*notreached*/
8663             { /* named and numeric backreferences */
8664                 I32 num;
8665             case '&':            /* (?&NAME) */
8666                 parse_start = RExC_parse - 1;
8667               named_recursion:
8668                 {
8669                     SV *sv_dat = reg_scan_name(pRExC_state,
8670                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8671                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8672                 }
8673                 goto gen_recurse_regop;
8674                 assert(0); /* NOT REACHED */
8675             case '+':
8676                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8677                     RExC_parse++;
8678                     vFAIL("Illegal pattern");
8679                 }
8680                 goto parse_recursion;
8681                 /* NOT REACHED*/
8682             case '-': /* (?-1) */
8683                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8684                     RExC_parse--; /* rewind to let it be handled later */
8685                     goto parse_flags;
8686                 }
8687                 /*FALLTHROUGH */
8688             case '1': case '2': case '3': case '4': /* (?1) */
8689             case '5': case '6': case '7': case '8': case '9':
8690                 RExC_parse--;
8691               parse_recursion:
8692                 num = atoi(RExC_parse);
8693                 parse_start = RExC_parse - 1; /* MJD */
8694                 if (*RExC_parse == '-')
8695                     RExC_parse++;
8696                 while (isDIGIT(*RExC_parse))
8697                         RExC_parse++;
8698                 if (*RExC_parse!=')')
8699                     vFAIL("Expecting close bracket");
8700
8701               gen_recurse_regop:
8702                 if ( paren == '-' ) {
8703                     /*
8704                     Diagram of capture buffer numbering.
8705                     Top line is the normal capture buffer numbers
8706                     Bottom line is the negative indexing as from
8707                     the X (the (?-2))
8708
8709                     +   1 2    3 4 5 X          6 7
8710                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8711                     -   5 4    3 2 1 X          x x
8712
8713                     */
8714                     num = RExC_npar + num;
8715                     if (num < 1)  {
8716                         RExC_parse++;
8717                         vFAIL("Reference to nonexistent group");
8718                     }
8719                 } else if ( paren == '+' ) {
8720                     num = RExC_npar + num - 1;
8721                 }
8722
8723                 ret = reganode(pRExC_state, GOSUB, num);
8724                 if (!SIZE_ONLY) {
8725                     if (num > (I32)RExC_rx->nparens) {
8726                         RExC_parse++;
8727                         vFAIL("Reference to nonexistent group");
8728                     }
8729                     ARG2L_SET( ret, RExC_recurse_count++);
8730                     RExC_emit++;
8731                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8732                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8733                 } else {
8734                     RExC_size++;
8735                 }
8736                 RExC_seen |= REG_SEEN_RECURSE;
8737                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8738                 Set_Node_Offset(ret, parse_start); /* MJD */
8739
8740                 *flagp |= POSTPONED;
8741                 nextchar(pRExC_state);
8742                 return ret;
8743             } /* named and numeric backreferences */
8744             assert(0); /* NOT REACHED */
8745
8746             case '?':           /* (??...) */
8747                 is_logical = 1;
8748                 if (*RExC_parse != '{') {
8749                     RExC_parse++;
8750                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8751                     /*NOTREACHED*/
8752                 }
8753                 *flagp |= POSTPONED;
8754                 paren = *RExC_parse++;
8755                 /* FALL THROUGH */
8756             case '{':           /* (?{...}) */
8757             {
8758                 U32 n = 0;
8759                 struct reg_code_block *cb;
8760
8761                 RExC_seen_zerolen++;
8762
8763                 if (   !pRExC_state->num_code_blocks
8764                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8765                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8766                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8767                             - RExC_start)
8768                 ) {
8769                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8770                         FAIL("panic: Sequence (?{...}): no code block found\n");
8771                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8772                 }
8773                 /* this is a pre-compiled code block (?{...}) */
8774                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8775                 RExC_parse = RExC_start + cb->end;
8776                 if (!SIZE_ONLY) {
8777                     OP *o = cb->block;
8778                     if (cb->src_regex) {
8779                         n = add_data(pRExC_state, 2, "rl");
8780                         RExC_rxi->data->data[n] =
8781                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8782                         RExC_rxi->data->data[n+1] = (void*)o;
8783                     }
8784                     else {
8785                         n = add_data(pRExC_state, 1,
8786                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8787                         RExC_rxi->data->data[n] = (void*)o;
8788                     }
8789                 }
8790                 pRExC_state->code_index++;
8791                 nextchar(pRExC_state);
8792
8793                 if (is_logical) {
8794                     regnode *eval;
8795                     ret = reg_node(pRExC_state, LOGICAL);
8796                     eval = reganode(pRExC_state, EVAL, n);
8797                     if (!SIZE_ONLY) {
8798                         ret->flags = 2;
8799                         /* for later propagation into (??{}) return value */
8800                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8801                     }
8802                     REGTAIL(pRExC_state, ret, eval);
8803                     /* deal with the length of this later - MJD */
8804                     return ret;
8805                 }
8806                 ret = reganode(pRExC_state, EVAL, n);
8807                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8808                 Set_Node_Offset(ret, parse_start);
8809                 return ret;
8810             }
8811             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8812             {
8813                 int is_define= 0;
8814                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8815                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8816                         || RExC_parse[1] == '<'
8817                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8818                         I32 flag;
8819
8820                         ret = reg_node(pRExC_state, LOGICAL);
8821                         if (!SIZE_ONLY)
8822                             ret->flags = 1;
8823                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8824                         goto insert_if;
8825                     }
8826                 }
8827                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8828                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8829                 {
8830                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8831                     char *name_start= RExC_parse++;
8832                     U32 num = 0;
8833                     SV *sv_dat=reg_scan_name(pRExC_state,
8834                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8835                     if (RExC_parse == name_start || *RExC_parse != ch)
8836                         vFAIL2("Sequence (?(%c... not terminated",
8837                             (ch == '>' ? '<' : ch));
8838                     RExC_parse++;
8839                     if (!SIZE_ONLY) {
8840                         num = add_data( pRExC_state, 1, "S" );
8841                         RExC_rxi->data->data[num]=(void*)sv_dat;
8842                         SvREFCNT_inc_simple_void(sv_dat);
8843                     }
8844                     ret = reganode(pRExC_state,NGROUPP,num);
8845                     goto insert_if_check_paren;
8846                 }
8847                 else if (RExC_parse[0] == 'D' &&
8848                          RExC_parse[1] == 'E' &&
8849                          RExC_parse[2] == 'F' &&
8850                          RExC_parse[3] == 'I' &&
8851                          RExC_parse[4] == 'N' &&
8852                          RExC_parse[5] == 'E')
8853                 {
8854                     ret = reganode(pRExC_state,DEFINEP,0);
8855                     RExC_parse +=6 ;
8856                     is_define = 1;
8857                     goto insert_if_check_paren;
8858                 }
8859                 else if (RExC_parse[0] == 'R') {
8860                     RExC_parse++;
8861                     parno = 0;
8862                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8863                         parno = atoi(RExC_parse++);
8864                         while (isDIGIT(*RExC_parse))
8865                             RExC_parse++;
8866                     } else if (RExC_parse[0] == '&') {
8867                         SV *sv_dat;
8868                         RExC_parse++;
8869                         sv_dat = reg_scan_name(pRExC_state,
8870                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8871                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8872                     }
8873                     ret = reganode(pRExC_state,INSUBP,parno);
8874                     goto insert_if_check_paren;
8875                 }
8876                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8877                     /* (?(1)...) */
8878                     char c;
8879                     parno = atoi(RExC_parse++);
8880
8881                     while (isDIGIT(*RExC_parse))
8882                         RExC_parse++;
8883                     ret = reganode(pRExC_state, GROUPP, parno);
8884
8885                  insert_if_check_paren:
8886                     if ((c = *nextchar(pRExC_state)) != ')')
8887                         vFAIL("Switch condition not recognized");
8888                   insert_if:
8889                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8890                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8891                     if (br == NULL)
8892                         br = reganode(pRExC_state, LONGJMP, 0);
8893                     else
8894                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8895                     c = *nextchar(pRExC_state);
8896                     if (flags&HASWIDTH)
8897                         *flagp |= HASWIDTH;
8898                     if (c == '|') {
8899                         if (is_define)
8900                             vFAIL("(?(DEFINE)....) does not allow branches");
8901                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8902                         regbranch(pRExC_state, &flags, 1,depth+1);
8903                         REGTAIL(pRExC_state, ret, lastbr);
8904                         if (flags&HASWIDTH)
8905                             *flagp |= HASWIDTH;
8906                         c = *nextchar(pRExC_state);
8907                     }
8908                     else
8909                         lastbr = NULL;
8910                     if (c != ')')
8911                         vFAIL("Switch (?(condition)... contains too many branches");
8912                     ender = reg_node(pRExC_state, TAIL);
8913                     REGTAIL(pRExC_state, br, ender);
8914                     if (lastbr) {
8915                         REGTAIL(pRExC_state, lastbr, ender);
8916                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8917                     }
8918                     else
8919                         REGTAIL(pRExC_state, ret, ender);
8920                     RExC_size++; /* XXX WHY do we need this?!!
8921                                     For large programs it seems to be required
8922                                     but I can't figure out why. -- dmq*/
8923                     return ret;
8924                 }
8925                 else {
8926                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8927                 }
8928             }
8929             case '[':           /* (?[ ... ]) */
8930                 return handle_sets(pRExC_state, flagp, depth, oregcomp_parse);
8931             case 0:
8932                 RExC_parse--; /* for vFAIL to print correctly */
8933                 vFAIL("Sequence (? incomplete");
8934                 break;
8935             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8936                                        that follow */
8937                 has_use_defaults = TRUE;
8938                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8939                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8940                                                 ? REGEX_UNICODE_CHARSET
8941                                                 : REGEX_DEPENDS_CHARSET);
8942                 goto parse_flags;
8943             default:
8944                 --RExC_parse;
8945                 parse_flags:      /* (?i) */
8946             {
8947                 U32 posflags = 0, negflags = 0;
8948                 U32 *flagsp = &posflags;
8949                 char has_charset_modifier = '\0';
8950                 regex_charset cs = get_regex_charset(RExC_flags);
8951                 if (cs == REGEX_DEPENDS_CHARSET
8952                     && (RExC_utf8 || RExC_uni_semantics))
8953                 {
8954                     cs = REGEX_UNICODE_CHARSET;
8955                 }
8956
8957                 while (*RExC_parse) {
8958                     /* && strchr("iogcmsx", *RExC_parse) */
8959                     /* (?g), (?gc) and (?o) are useless here
8960                        and must be globally applied -- japhy */
8961                     switch (*RExC_parse) {
8962                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8963                     case LOCALE_PAT_MOD:
8964                         if (has_charset_modifier) {
8965                             goto excess_modifier;
8966                         }
8967                         else if (flagsp == &negflags) {
8968                             goto neg_modifier;
8969                         }
8970                         cs = REGEX_LOCALE_CHARSET;
8971                         has_charset_modifier = LOCALE_PAT_MOD;
8972                         RExC_contains_locale = 1;
8973                         break;
8974                     case UNICODE_PAT_MOD:
8975                         if (has_charset_modifier) {
8976                             goto excess_modifier;
8977                         }
8978                         else if (flagsp == &negflags) {
8979                             goto neg_modifier;
8980                         }
8981                         cs = REGEX_UNICODE_CHARSET;
8982                         has_charset_modifier = UNICODE_PAT_MOD;
8983                         break;
8984                     case ASCII_RESTRICT_PAT_MOD:
8985                         if (flagsp == &negflags) {
8986                             goto neg_modifier;
8987                         }
8988                         if (has_charset_modifier) {
8989                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8990                                 goto excess_modifier;
8991                             }
8992                             /* Doubled modifier implies more restricted */
8993                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8994                         }
8995                         else {
8996                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8997                         }
8998                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8999                         break;
9000                     case DEPENDS_PAT_MOD:
9001                         if (has_use_defaults) {
9002                             goto fail_modifiers;
9003                         }
9004                         else if (flagsp == &negflags) {
9005                             goto neg_modifier;
9006                         }
9007                         else if (has_charset_modifier) {
9008                             goto excess_modifier;
9009                         }
9010
9011                         /* The dual charset means unicode semantics if the
9012                          * pattern (or target, not known until runtime) are
9013                          * utf8, or something in the pattern indicates unicode
9014                          * semantics */
9015                         cs = (RExC_utf8 || RExC_uni_semantics)
9016                              ? REGEX_UNICODE_CHARSET
9017                              : REGEX_DEPENDS_CHARSET;
9018                         has_charset_modifier = DEPENDS_PAT_MOD;
9019                         break;
9020                     excess_modifier:
9021                         RExC_parse++;
9022                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9023                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9024                         }
9025                         else if (has_charset_modifier == *(RExC_parse - 1)) {
9026                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9027                         }
9028                         else {
9029                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9030                         }
9031                         /*NOTREACHED*/
9032                     neg_modifier:
9033                         RExC_parse++;
9034                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9035                         /*NOTREACHED*/
9036                     case ONCE_PAT_MOD: /* 'o' */
9037                     case GLOBAL_PAT_MOD: /* 'g' */
9038                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9039                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9040                             if (! (wastedflags & wflagbit) ) {
9041                                 wastedflags |= wflagbit;
9042                                 vWARN5(
9043                                     RExC_parse + 1,
9044                                     "Useless (%s%c) - %suse /%c modifier",
9045                                     flagsp == &negflags ? "?-" : "?",
9046                                     *RExC_parse,
9047                                     flagsp == &negflags ? "don't " : "",
9048                                     *RExC_parse
9049                                 );
9050                             }
9051                         }
9052                         break;
9053
9054                     case CONTINUE_PAT_MOD: /* 'c' */
9055                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9056                             if (! (wastedflags & WASTED_C) ) {
9057                                 wastedflags |= WASTED_GC;
9058                                 vWARN3(
9059                                     RExC_parse + 1,
9060                                     "Useless (%sc) - %suse /gc modifier",
9061                                     flagsp == &negflags ? "?-" : "?",
9062                                     flagsp == &negflags ? "don't " : ""
9063                                 );
9064                             }
9065                         }
9066                         break;
9067                     case KEEPCOPY_PAT_MOD: /* 'p' */
9068                         if (flagsp == &negflags) {
9069                             if (SIZE_ONLY)
9070                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9071                         } else {
9072                             *flagsp |= RXf_PMf_KEEPCOPY;
9073                         }
9074                         break;
9075                     case '-':
9076                         /* A flag is a default iff it is following a minus, so
9077                          * if there is a minus, it means will be trying to
9078                          * re-specify a default which is an error */
9079                         if (has_use_defaults || flagsp == &negflags) {
9080             fail_modifiers:
9081                             RExC_parse++;
9082                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9083                             /*NOTREACHED*/
9084                         }
9085                         flagsp = &negflags;
9086                         wastedflags = 0;  /* reset so (?g-c) warns twice */
9087                         break;
9088                     case ':':
9089                         paren = ':';
9090                         /*FALLTHROUGH*/
9091                     case ')':
9092                         RExC_flags |= posflags;
9093                         RExC_flags &= ~negflags;
9094                         set_regex_charset(&RExC_flags, cs);
9095                         if (paren != ':') {
9096                             oregflags |= posflags;
9097                             oregflags &= ~negflags;
9098                             set_regex_charset(&oregflags, cs);
9099                         }
9100                         nextchar(pRExC_state);
9101                         if (paren != ':') {
9102                             *flagp = TRYAGAIN;
9103                             return NULL;
9104                         } else {
9105                             ret = NULL;
9106                             goto parse_rest;
9107                         }
9108                         /*NOTREACHED*/
9109                     default:
9110                         RExC_parse++;
9111                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9112                         /*NOTREACHED*/
9113                     }
9114                     ++RExC_parse;
9115                 }
9116             }} /* one for the default block, one for the switch */
9117         }
9118         else {                  /* (...) */
9119           capturing_parens:
9120             parno = RExC_npar;
9121             RExC_npar++;
9122
9123             ret = reganode(pRExC_state, OPEN, parno);
9124             if (!SIZE_ONLY ){
9125                 if (!RExC_nestroot)
9126                     RExC_nestroot = parno;
9127                 if (RExC_seen & REG_SEEN_RECURSE
9128                     && !RExC_open_parens[parno-1])
9129                 {
9130                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9131                         "Setting open paren #%"IVdf" to %d\n",
9132                         (IV)parno, REG_NODE_NUM(ret)));
9133                     RExC_open_parens[parno-1]= ret;
9134                 }
9135             }
9136             Set_Node_Length(ret, 1); /* MJD */
9137             Set_Node_Offset(ret, RExC_parse); /* MJD */
9138             is_open = 1;
9139         }
9140     }
9141     else                        /* ! paren */
9142         ret = NULL;
9143
9144    parse_rest:
9145     /* Pick up the branches, linking them together. */
9146     parse_start = RExC_parse;   /* MJD */
9147     br = regbranch(pRExC_state, &flags, 1,depth+1);
9148
9149     /*     branch_len = (paren != 0); */
9150
9151     if (br == NULL)
9152         return(NULL);
9153     if (*RExC_parse == '|') {
9154         if (!SIZE_ONLY && RExC_extralen) {
9155             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9156         }
9157         else {                  /* MJD */
9158             reginsert(pRExC_state, BRANCH, br, depth+1);
9159             Set_Node_Length(br, paren != 0);
9160             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9161         }
9162         have_branch = 1;
9163         if (SIZE_ONLY)
9164             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9165     }
9166     else if (paren == ':') {
9167         *flagp |= flags&SIMPLE;
9168     }
9169     if (is_open) {                              /* Starts with OPEN. */
9170         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9171     }
9172     else if (paren != '?')              /* Not Conditional */
9173         ret = br;
9174     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9175     lastbr = br;
9176     while (*RExC_parse == '|') {
9177         if (!SIZE_ONLY && RExC_extralen) {
9178             ender = reganode(pRExC_state, LONGJMP,0);
9179             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9180         }
9181         if (SIZE_ONLY)
9182             RExC_extralen += 2;         /* Account for LONGJMP. */
9183         nextchar(pRExC_state);
9184         if (freeze_paren) {
9185             if (RExC_npar > after_freeze)
9186                 after_freeze = RExC_npar;
9187             RExC_npar = freeze_paren;
9188         }
9189         br = regbranch(pRExC_state, &flags, 0, depth+1);
9190
9191         if (br == NULL)
9192             return(NULL);
9193         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9194         lastbr = br;
9195         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9196     }
9197
9198     if (have_branch || paren != ':') {
9199         /* Make a closing node, and hook it on the end. */
9200         switch (paren) {
9201         case ':':
9202             ender = reg_node(pRExC_state, TAIL);
9203             break;
9204         case 1:
9205             ender = reganode(pRExC_state, CLOSE, parno);
9206             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9207                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9208                         "Setting close paren #%"IVdf" to %d\n",
9209                         (IV)parno, REG_NODE_NUM(ender)));
9210                 RExC_close_parens[parno-1]= ender;
9211                 if (RExC_nestroot == parno)
9212                     RExC_nestroot = 0;
9213             }
9214             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9215             Set_Node_Length(ender,1); /* MJD */
9216             break;
9217         case '<':
9218         case ',':
9219         case '=':
9220         case '!':
9221             *flagp &= ~HASWIDTH;
9222             /* FALL THROUGH */
9223         case '>':
9224             ender = reg_node(pRExC_state, SUCCEED);
9225             break;
9226         case 0:
9227             ender = reg_node(pRExC_state, END);
9228             if (!SIZE_ONLY) {
9229                 assert(!RExC_opend); /* there can only be one! */
9230                 RExC_opend = ender;
9231             }
9232             break;
9233         }
9234         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9235             SV * const mysv_val1=sv_newmortal();
9236             SV * const mysv_val2=sv_newmortal();
9237             DEBUG_PARSE_MSG("lsbr");
9238             regprop(RExC_rx, mysv_val1, lastbr);
9239             regprop(RExC_rx, mysv_val2, ender);
9240             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9241                           SvPV_nolen_const(mysv_val1),
9242                           (IV)REG_NODE_NUM(lastbr),
9243                           SvPV_nolen_const(mysv_val2),
9244                           (IV)REG_NODE_NUM(ender),
9245                           (IV)(ender - lastbr)
9246             );
9247         });
9248         REGTAIL(pRExC_state, lastbr, ender);
9249
9250         if (have_branch && !SIZE_ONLY) {
9251             char is_nothing= 1;
9252             if (depth==1)
9253                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9254
9255             /* Hook the tails of the branches to the closing node. */
9256             for (br = ret; br; br = regnext(br)) {
9257                 const U8 op = PL_regkind[OP(br)];
9258                 if (op == BRANCH) {
9259                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9260                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9261                         is_nothing= 0;
9262                 }
9263                 else if (op == BRANCHJ) {
9264                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9265                     /* for now we always disable this optimisation * /
9266                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9267                     */
9268                         is_nothing= 0;
9269                 }
9270             }
9271             if (is_nothing) {
9272                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9273                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9274                     SV * const mysv_val1=sv_newmortal();
9275                     SV * const mysv_val2=sv_newmortal();
9276                     DEBUG_PARSE_MSG("NADA");
9277                     regprop(RExC_rx, mysv_val1, ret);
9278                     regprop(RExC_rx, mysv_val2, ender);
9279                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9280                                   SvPV_nolen_const(mysv_val1),
9281                                   (IV)REG_NODE_NUM(ret),
9282                                   SvPV_nolen_const(mysv_val2),
9283                                   (IV)REG_NODE_NUM(ender),
9284                                   (IV)(ender - ret)
9285                     );
9286                 });
9287                 OP(br)= NOTHING;
9288                 if (OP(ender) == TAIL) {
9289                     NEXT_OFF(br)= 0;
9290                     RExC_emit= br + 1;
9291                 } else {
9292                     regnode *opt;
9293                     for ( opt= br + 1; opt < ender ; opt++ )
9294                         OP(opt)= OPTIMIZED;
9295                     NEXT_OFF(br)= ender - br;
9296                 }
9297             }
9298         }
9299     }
9300
9301     {
9302         const char *p;
9303         static const char parens[] = "=!<,>";
9304
9305         if (paren && (p = strchr(parens, paren))) {
9306             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9307             int flag = (p - parens) > 1;
9308
9309             if (paren == '>')
9310                 node = SUSPEND, flag = 0;
9311             reginsert(pRExC_state, node,ret, depth+1);
9312             Set_Node_Cur_Length(ret);
9313             Set_Node_Offset(ret, parse_start + 1);
9314             ret->flags = flag;
9315             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9316         }
9317     }
9318
9319     /* Check for proper termination. */
9320     if (paren) {
9321         RExC_flags = oregflags;
9322         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9323             RExC_parse = oregcomp_parse;
9324             vFAIL("Unmatched (");
9325         }
9326     }
9327     else if (!paren && RExC_parse < RExC_end) {
9328         if (*RExC_parse == ')') {
9329             RExC_parse++;
9330             vFAIL("Unmatched )");
9331         }
9332         else
9333             FAIL("Junk on end of regexp");      /* "Can't happen". */
9334         assert(0); /* NOTREACHED */
9335     }
9336
9337     if (RExC_in_lookbehind) {
9338         RExC_in_lookbehind--;
9339     }
9340     if (after_freeze > RExC_npar)
9341         RExC_npar = after_freeze;
9342     return(ret);
9343 }
9344
9345 /*
9346  - regbranch - one alternative of an | operator
9347  *
9348  * Implements the concatenation operator.
9349  */
9350 STATIC regnode *
9351 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9352 {
9353     dVAR;
9354     regnode *ret;
9355     regnode *chain = NULL;
9356     regnode *latest;
9357     I32 flags = 0, c = 0;
9358     GET_RE_DEBUG_FLAGS_DECL;
9359
9360     PERL_ARGS_ASSERT_REGBRANCH;
9361
9362     DEBUG_PARSE("brnc");
9363
9364     if (first)
9365         ret = NULL;
9366     else {
9367         if (!SIZE_ONLY && RExC_extralen)
9368             ret = reganode(pRExC_state, BRANCHJ,0);
9369         else {
9370             ret = reg_node(pRExC_state, BRANCH);
9371             Set_Node_Length(ret, 1);
9372         }
9373     }
9374
9375     if (!first && SIZE_ONLY)
9376         RExC_extralen += 1;                     /* BRANCHJ */
9377
9378     *flagp = WORST;                     /* Tentatively. */
9379
9380     RExC_parse--;
9381     nextchar(pRExC_state);
9382     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9383         flags &= ~TRYAGAIN;
9384         latest = regpiece(pRExC_state, &flags,depth+1);
9385         if (latest == NULL) {
9386             if (flags & TRYAGAIN)
9387                 continue;
9388             return(NULL);
9389         }
9390         else if (ret == NULL)
9391             ret = latest;
9392         *flagp |= flags&(HASWIDTH|POSTPONED);
9393         if (chain == NULL)      /* First piece. */
9394             *flagp |= flags&SPSTART;
9395         else {
9396             RExC_naughty++;
9397             REGTAIL(pRExC_state, chain, latest);
9398         }
9399         chain = latest;
9400         c++;
9401     }
9402     if (chain == NULL) {        /* Loop ran zero times. */
9403         chain = reg_node(pRExC_state, NOTHING);
9404         if (ret == NULL)
9405             ret = chain;
9406     }
9407     if (c == 1) {
9408         *flagp |= flags&SIMPLE;
9409     }
9410
9411     return ret;
9412 }
9413
9414 /*
9415  - regpiece - something followed by possible [*+?]
9416  *
9417  * Note that the branching code sequences used for ? and the general cases
9418  * of * and + are somewhat optimized:  they use the same NOTHING node as
9419  * both the endmarker for their branch list and the body of the last branch.
9420  * It might seem that this node could be dispensed with entirely, but the
9421  * endmarker role is not redundant.
9422  */
9423 STATIC regnode *
9424 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9425 {
9426     dVAR;
9427     regnode *ret;
9428     char op;
9429     char *next;
9430     I32 flags;
9431     const char * const origparse = RExC_parse;
9432     I32 min;
9433     I32 max = REG_INFTY;
9434 #ifdef RE_TRACK_PATTERN_OFFSETS
9435     char *parse_start;
9436 #endif
9437     const char *maxpos = NULL;
9438
9439     /* Save the original in case we change the emitted regop to a FAIL. */
9440     regnode * const orig_emit = RExC_emit;
9441
9442     GET_RE_DEBUG_FLAGS_DECL;
9443
9444     PERL_ARGS_ASSERT_REGPIECE;
9445
9446     DEBUG_PARSE("piec");
9447
9448     ret = regatom(pRExC_state, &flags,depth+1);
9449     if (ret == NULL) {
9450         if (flags & TRYAGAIN)
9451             *flagp |= TRYAGAIN;
9452         return(NULL);
9453     }
9454
9455     op = *RExC_parse;
9456
9457     if (op == '{' && regcurly(RExC_parse, FALSE)) {
9458         maxpos = NULL;
9459 #ifdef RE_TRACK_PATTERN_OFFSETS
9460         parse_start = RExC_parse; /* MJD */
9461 #endif
9462         next = RExC_parse + 1;
9463         while (isDIGIT(*next) || *next == ',') {
9464             if (*next == ',') {
9465                 if (maxpos)
9466                     break;
9467                 else
9468                     maxpos = next;
9469             }
9470             next++;
9471         }
9472         if (*next == '}') {             /* got one */
9473             if (!maxpos)
9474                 maxpos = next;
9475             RExC_parse++;
9476             min = atoi(RExC_parse);
9477             if (*maxpos == ',')
9478                 maxpos++;
9479             else
9480                 maxpos = RExC_parse;
9481             max = atoi(maxpos);
9482             if (!max && *maxpos != '0')
9483                 max = REG_INFTY;                /* meaning "infinity" */
9484             else if (max >= REG_INFTY)
9485                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9486             RExC_parse = next;
9487             nextchar(pRExC_state);
9488             if (max < min) {    /* If can't match, warn and optimize to fail
9489                                    unconditionally */
9490                 if (SIZE_ONLY) {
9491                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9492
9493                     /* We can't back off the size because we have to reserve
9494                      * enough space for all the things we are about to throw
9495                      * away, but we can shrink it by the ammount we are about
9496                      * to re-use here */
9497                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9498                 }
9499                 else {
9500                     RExC_emit = orig_emit;
9501                 }
9502                 ret = reg_node(pRExC_state, OPFAIL);
9503                 return ret;
9504             }
9505             else if (max == 0) {    /* replace {0} with a nothing node */
9506                 if (SIZE_ONLY) {
9507                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9508                 }
9509                 else {
9510                     RExC_emit = orig_emit;
9511                 }
9512                 ret = reg_node(pRExC_state, NOTHING);
9513                 return ret;
9514             }
9515
9516         do_curly:
9517             if ((flags&SIMPLE)) {
9518                 RExC_naughty += 2 + RExC_naughty / 2;
9519                 reginsert(pRExC_state, CURLY, ret, depth+1);
9520                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9521                 Set_Node_Cur_Length(ret);
9522             }
9523             else {
9524                 regnode * const w = reg_node(pRExC_state, WHILEM);
9525
9526                 w->flags = 0;
9527                 REGTAIL(pRExC_state, ret, w);
9528                 if (!SIZE_ONLY && RExC_extralen) {
9529                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9530                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9531                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9532                 }
9533                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9534                                 /* MJD hk */
9535                 Set_Node_Offset(ret, parse_start+1);
9536                 Set_Node_Length(ret,
9537                                 op == '{' ? (RExC_parse - parse_start) : 1);
9538
9539                 if (!SIZE_ONLY && RExC_extralen)
9540                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9541                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9542                 if (SIZE_ONLY)
9543                     RExC_whilem_seen++, RExC_extralen += 3;
9544                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9545             }
9546             ret->flags = 0;
9547
9548             if (min > 0)
9549                 *flagp = WORST;
9550             if (max > 0)
9551                 *flagp |= HASWIDTH;
9552             if (!SIZE_ONLY) {
9553                 ARG1_SET(ret, (U16)min);
9554                 ARG2_SET(ret, (U16)max);
9555             }
9556
9557             goto nest_check;
9558         }
9559     }
9560
9561     if (!ISMULT1(op)) {
9562         *flagp = flags;
9563         return(ret);
9564     }
9565
9566 #if 0                           /* Now runtime fix should be reliable. */
9567
9568     /* if this is reinstated, don't forget to put this back into perldiag:
9569
9570             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9571
9572            (F) The part of the regexp subject to either the * or + quantifier
9573            could match an empty string. The {#} shows in the regular
9574            expression about where the problem was discovered.
9575
9576     */
9577
9578     if (!(flags&HASWIDTH) && op != '?')
9579       vFAIL("Regexp *+ operand could be empty");
9580 #endif
9581
9582 #ifdef RE_TRACK_PATTERN_OFFSETS
9583     parse_start = RExC_parse;
9584 #endif
9585     nextchar(pRExC_state);
9586
9587     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9588
9589     if (op == '*' && (flags&SIMPLE)) {
9590         reginsert(pRExC_state, STAR, ret, depth+1);
9591         ret->flags = 0;
9592         RExC_naughty += 4;
9593     }
9594     else if (op == '*') {
9595         min = 0;
9596         goto do_curly;
9597     }
9598     else if (op == '+' && (flags&SIMPLE)) {
9599         reginsert(pRExC_state, PLUS, ret, depth+1);
9600         ret->flags = 0;
9601         RExC_naughty += 3;
9602     }
9603     else if (op == '+') {
9604         min = 1;
9605         goto do_curly;
9606     }
9607     else if (op == '?') {
9608         min = 0; max = 1;
9609         goto do_curly;
9610     }
9611   nest_check:
9612     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9613         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9614         ckWARN3reg(RExC_parse,
9615                    "%.*s matches null string many times",
9616                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9617                    origparse);
9618         (void)ReREFCNT_inc(RExC_rx_sv);
9619     }
9620
9621     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9622         nextchar(pRExC_state);
9623         reginsert(pRExC_state, MINMOD, ret, depth+1);
9624         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9625     }
9626 #ifndef REG_ALLOW_MINMOD_SUSPEND
9627     else
9628 #endif
9629     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9630         regnode *ender;
9631         nextchar(pRExC_state);
9632         ender = reg_node(pRExC_state, SUCCEED);
9633         REGTAIL(pRExC_state, ret, ender);
9634         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9635         ret->flags = 0;
9636         ender = reg_node(pRExC_state, TAIL);
9637         REGTAIL(pRExC_state, ret, ender);
9638         /*ret= ender;*/
9639     }
9640
9641     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9642         RExC_parse++;
9643         vFAIL("Nested quantifiers");
9644     }
9645
9646     return(ret);
9647 }
9648
9649 STATIC bool
9650 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9651         const bool strict   /* Apply stricter parsing rules? */
9652     )
9653 {
9654
9655  /* This is expected to be called by a parser routine that has recognized '\N'
9656    and needs to handle the rest. RExC_parse is expected to point at the first
9657    char following the N at the time of the call.  On successful return,
9658    RExC_parse has been updated to point to just after the sequence identified
9659    by this routine, and <*flagp> has been updated.
9660
9661    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9662    character class.
9663
9664    \N may begin either a named sequence, or if outside a character class, mean
9665    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9666    attempted to decide which, and in the case of a named sequence, converted it
9667    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9668    where c1... are the characters in the sequence.  For single-quoted regexes,
9669    the tokenizer passes the \N sequence through unchanged; this code will not
9670    attempt to determine this nor expand those, instead raising a syntax error.
9671    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9672    or there is no '}', it signals that this \N occurrence means to match a
9673    non-newline.
9674
9675    Only the \N{U+...} form should occur in a character class, for the same
9676    reason that '.' inside a character class means to just match a period: it
9677    just doesn't make sense.
9678
9679    The function raises an error (via vFAIL), and doesn't return for various
9680    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9681    success; it returns FALSE otherwise.
9682
9683    If <valuep> is non-null, it means the caller can accept an input sequence
9684    consisting of a just a single code point; <*valuep> is set to that value
9685    if the input is such.
9686
9687    If <node_p> is non-null it signifies that the caller can accept any other
9688    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9689    is set as follows:
9690     1) \N means not-a-NL: points to a newly created REG_ANY node;
9691     2) \N{}:              points to a new NOTHING node;
9692     3) otherwise:         points to a new EXACT node containing the resolved
9693                           string.
9694    Note that FALSE is returned for single code point sequences if <valuep> is
9695    null.
9696  */
9697
9698     char * endbrace;    /* '}' following the name */
9699     char* p;
9700     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9701                            stream */
9702     bool has_multiple_chars; /* true if the input stream contains a sequence of
9703                                 more than one character */
9704
9705     GET_RE_DEBUG_FLAGS_DECL;
9706
9707     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9708
9709     GET_RE_DEBUG_FLAGS;
9710
9711     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9712
9713     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9714      * modifier.  The other meaning does not */
9715     p = (RExC_flags & RXf_PMf_EXTENDED)
9716         ? regwhite( pRExC_state, RExC_parse )
9717         : RExC_parse;
9718
9719     /* Disambiguate between \N meaning a named character versus \N meaning
9720      * [^\n].  The former is assumed when it can't be the latter. */
9721     if (*p != '{' || regcurly(p, FALSE)) {
9722         RExC_parse = p;
9723         if (! node_p) {
9724             /* no bare \N in a charclass */
9725             if (in_char_class) {
9726                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9727             }
9728             return FALSE;
9729         }
9730         nextchar(pRExC_state);
9731         *node_p = reg_node(pRExC_state, REG_ANY);
9732         *flagp |= HASWIDTH|SIMPLE;
9733         RExC_naughty++;
9734         RExC_parse--;
9735         Set_Node_Length(*node_p, 1); /* MJD */
9736         return TRUE;
9737     }
9738
9739     /* Here, we have decided it should be a named character or sequence */
9740
9741     /* The test above made sure that the next real character is a '{', but
9742      * under the /x modifier, it could be separated by space (or a comment and
9743      * \n) and this is not allowed (for consistency with \x{...} and the
9744      * tokenizer handling of \N{NAME}). */
9745     if (*RExC_parse != '{') {
9746         vFAIL("Missing braces on \\N{}");
9747     }
9748
9749     RExC_parse++;       /* Skip past the '{' */
9750
9751     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9752         || ! (endbrace == RExC_parse            /* nothing between the {} */
9753               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9754                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9755     {
9756         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9757         vFAIL("\\N{NAME} must be resolved by the lexer");
9758     }
9759
9760     if (endbrace == RExC_parse) {   /* empty: \N{} */
9761         bool ret = TRUE;
9762         if (node_p) {
9763             *node_p = reg_node(pRExC_state,NOTHING);
9764         }
9765         else if (in_char_class) {
9766             if (SIZE_ONLY && in_char_class) {
9767                 if (strict) {
9768                     RExC_parse++;   /* Position after the "}" */
9769                     vFAIL("Zero length \\N{}");
9770                 }
9771                 else {
9772                     ckWARNreg(RExC_parse,
9773                               "Ignoring zero length \\N{} in character class");
9774                 }
9775             }
9776             ret = FALSE;
9777         }
9778         else {
9779             return FALSE;
9780         }
9781         nextchar(pRExC_state);
9782         return ret;
9783     }
9784
9785     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9786     RExC_parse += 2;    /* Skip past the 'U+' */
9787
9788     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9789
9790     /* Code points are separated by dots.  If none, there is only one code
9791      * point, and is terminated by the brace */
9792     has_multiple_chars = (endchar < endbrace);
9793
9794     if (valuep && (! has_multiple_chars || in_char_class)) {
9795         /* We only pay attention to the first char of
9796         multichar strings being returned in char classes. I kinda wonder
9797         if this makes sense as it does change the behaviour
9798         from earlier versions, OTOH that behaviour was broken
9799         as well. XXX Solution is to recharacterize as
9800         [rest-of-class]|multi1|multi2... */
9801
9802         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9803         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9804             | PERL_SCAN_DISALLOW_PREFIX
9805             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9806
9807         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9808
9809         /* The tokenizer should have guaranteed validity, but it's possible to
9810          * bypass it by using single quoting, so check */
9811         if (length_of_hex == 0
9812             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9813         {
9814             RExC_parse += length_of_hex;        /* Includes all the valid */
9815             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9816                             ? UTF8SKIP(RExC_parse)
9817                             : 1;
9818             /* Guard against malformed utf8 */
9819             if (RExC_parse >= endchar) {
9820                 RExC_parse = endchar;
9821             }
9822             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9823         }
9824
9825         if (in_char_class && has_multiple_chars) {
9826             if (strict) {
9827                 RExC_parse = endbrace;
9828                 vFAIL("\\N{} in character class restricted to one character");
9829             }
9830             else {
9831                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9832             }
9833         }
9834
9835         RExC_parse = endbrace + 1;
9836     }
9837     else if (! node_p || ! has_multiple_chars) {
9838
9839         /* Here, the input is legal, but not according to the caller's
9840          * options.  We fail without advancing the parse, so that the
9841          * caller can try again */
9842         RExC_parse = p;
9843         return FALSE;
9844     }
9845     else {
9846
9847         /* What is done here is to convert this to a sub-pattern of the form
9848          * (?:\x{char1}\x{char2}...)
9849          * and then call reg recursively.  That way, it retains its atomicness,
9850          * while not having to worry about special handling that some code
9851          * points may have.  toke.c has converted the original Unicode values
9852          * to native, so that we can just pass on the hex values unchanged.  We
9853          * do have to set a flag to keep recoding from happening in the
9854          * recursion */
9855
9856         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9857         STRLEN len;
9858         char *orig_end = RExC_end;
9859         I32 flags;
9860
9861         while (RExC_parse < endbrace) {
9862
9863             /* Convert to notation the rest of the code understands */
9864             sv_catpv(substitute_parse, "\\x{");
9865             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9866             sv_catpv(substitute_parse, "}");
9867
9868             /* Point to the beginning of the next character in the sequence. */
9869             RExC_parse = endchar + 1;
9870             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9871         }
9872         sv_catpv(substitute_parse, ")");
9873
9874         RExC_parse = SvPV(substitute_parse, len);
9875
9876         /* Don't allow empty number */
9877         if (len < 8) {
9878             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9879         }
9880         RExC_end = RExC_parse + len;
9881
9882         /* The values are Unicode, and therefore not subject to recoding */
9883         RExC_override_recoding = 1;
9884
9885         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9886         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9887
9888         RExC_parse = endbrace;
9889         RExC_end = orig_end;
9890         RExC_override_recoding = 0;
9891
9892         nextchar(pRExC_state);
9893     }
9894
9895     return TRUE;
9896 }
9897
9898
9899 /*
9900  * reg_recode
9901  *
9902  * It returns the code point in utf8 for the value in *encp.
9903  *    value: a code value in the source encoding
9904  *    encp:  a pointer to an Encode object
9905  *
9906  * If the result from Encode is not a single character,
9907  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9908  */
9909 STATIC UV
9910 S_reg_recode(pTHX_ const char value, SV **encp)
9911 {
9912     STRLEN numlen = 1;
9913     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9914     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9915     const STRLEN newlen = SvCUR(sv);
9916     UV uv = UNICODE_REPLACEMENT;
9917
9918     PERL_ARGS_ASSERT_REG_RECODE;
9919
9920     if (newlen)
9921         uv = SvUTF8(sv)
9922              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9923              : *(U8*)s;
9924
9925     if (!newlen || numlen != newlen) {
9926         uv = UNICODE_REPLACEMENT;
9927         *encp = NULL;
9928     }
9929     return uv;
9930 }
9931
9932 PERL_STATIC_INLINE U8
9933 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9934 {
9935     U8 op;
9936
9937     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9938
9939     if (! FOLD) {
9940         return EXACT;
9941     }
9942
9943     op = get_regex_charset(RExC_flags);
9944     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9945         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9946                  been, so there is no hole */
9947     }
9948
9949     return op + EXACTF;
9950 }
9951
9952 PERL_STATIC_INLINE void
9953 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9954 {
9955     /* This knows the details about sizing an EXACTish node, setting flags for
9956      * it (by setting <*flagp>, and potentially populating it with a single
9957      * character.
9958      *
9959      * If <len> (the length in bytes) is non-zero, this function assumes that
9960      * the node has already been populated, and just does the sizing.  In this
9961      * case <code_point> should be the final code point that has already been
9962      * placed into the node.  This value will be ignored except that under some
9963      * circumstances <*flagp> is set based on it.
9964      *
9965      * If <len> is zero, the function assumes that the node is to contain only
9966      * the single character given by <code_point> and calculates what <len>
9967      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9968      * additionally will populate the node's STRING with <code_point>, if <len>
9969      * is 0.  In both cases <*flagp> is appropriately set
9970      *
9971      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9972      * folded (the latter only when the rules indicate it can match 'ss') */
9973
9974     bool len_passed_in = cBOOL(len != 0);
9975     U8 character[UTF8_MAXBYTES_CASE+1];
9976
9977     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9978
9979     if (! len_passed_in) {
9980         if (UTF) {
9981             if (FOLD) {
9982                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9983             }
9984             else {
9985                 uvchr_to_utf8( character, code_point);
9986                 len = UTF8SKIP(character);
9987             }
9988         }
9989         else if (! FOLD
9990                  || code_point != LATIN_SMALL_LETTER_SHARP_S
9991                  || ASCII_FOLD_RESTRICTED
9992                  || ! AT_LEAST_UNI_SEMANTICS)
9993         {
9994             *character = (U8) code_point;
9995             len = 1;
9996         }
9997         else {
9998             *character = 's';
9999             *(character + 1) = 's';
10000             len = 2;
10001         }
10002     }
10003
10004     if (SIZE_ONLY) {
10005         RExC_size += STR_SZ(len);
10006     }
10007     else {
10008         RExC_emit += STR_SZ(len);
10009         STR_LEN(node) = len;
10010         if (! len_passed_in) {
10011             Copy((char *) character, STRING(node), len, char);
10012         }
10013     }
10014
10015     *flagp |= HASWIDTH;
10016
10017     /* A single character node is SIMPLE, except for the special-cased SHARP S
10018      * under /di. */
10019     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10020         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10021             || ! FOLD || ! DEPENDS_SEMANTICS))
10022     {
10023         *flagp |= SIMPLE;
10024     }
10025 }
10026
10027 /*
10028  - regatom - the lowest level
10029
10030    Try to identify anything special at the start of the pattern. If there
10031    is, then handle it as required. This may involve generating a single regop,
10032    such as for an assertion; or it may involve recursing, such as to
10033    handle a () structure.
10034
10035    If the string doesn't start with something special then we gobble up
10036    as much literal text as we can.
10037
10038    Once we have been able to handle whatever type of thing started the
10039    sequence, we return.
10040
10041    Note: we have to be careful with escapes, as they can be both literal
10042    and special, and in the case of \10 and friends, context determines which.
10043
10044    A summary of the code structure is:
10045
10046    switch (first_byte) {
10047         cases for each special:
10048             handle this special;
10049             break;
10050         case '\\':
10051             switch (2nd byte) {
10052                 cases for each unambiguous special:
10053                     handle this special;
10054                     break;
10055                 cases for each ambigous special/literal:
10056                     disambiguate;
10057                     if (special)  handle here
10058                     else goto defchar;
10059                 default: // unambiguously literal:
10060                     goto defchar;
10061             }
10062         default:  // is a literal char
10063             // FALL THROUGH
10064         defchar:
10065             create EXACTish node for literal;
10066             while (more input and node isn't full) {
10067                 switch (input_byte) {
10068                    cases for each special;
10069                        make sure parse pointer is set so that the next call to
10070                            regatom will see this special first
10071                        goto loopdone; // EXACTish node terminated by prev. char
10072                    default:
10073                        append char to EXACTISH node;
10074                 }
10075                 get next input byte;
10076             }
10077         loopdone:
10078    }
10079    return the generated node;
10080
10081    Specifically there are two separate switches for handling
10082    escape sequences, with the one for handling literal escapes requiring
10083    a dummy entry for all of the special escapes that are actually handled
10084    by the other.
10085 */
10086
10087 STATIC regnode *
10088 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10089 {
10090     dVAR;
10091     regnode *ret = NULL;
10092     I32 flags = 0;
10093     char *parse_start = RExC_parse;
10094     U8 op;
10095     int invert = 0;
10096
10097     GET_RE_DEBUG_FLAGS_DECL;
10098
10099     *flagp = WORST;             /* Tentatively. */
10100
10101     DEBUG_PARSE("atom");
10102
10103     PERL_ARGS_ASSERT_REGATOM;
10104
10105 tryagain:
10106     switch ((U8)*RExC_parse) {
10107     case '^':
10108         RExC_seen_zerolen++;
10109         nextchar(pRExC_state);
10110         if (RExC_flags & RXf_PMf_MULTILINE)
10111             ret = reg_node(pRExC_state, MBOL);
10112         else if (RExC_flags & RXf_PMf_SINGLELINE)
10113             ret = reg_node(pRExC_state, SBOL);
10114         else
10115             ret = reg_node(pRExC_state, BOL);
10116         Set_Node_Length(ret, 1); /* MJD */
10117         break;
10118     case '$':
10119         nextchar(pRExC_state);
10120         if (*RExC_parse)
10121             RExC_seen_zerolen++;
10122         if (RExC_flags & RXf_PMf_MULTILINE)
10123             ret = reg_node(pRExC_state, MEOL);
10124         else if (RExC_flags & RXf_PMf_SINGLELINE)
10125             ret = reg_node(pRExC_state, SEOL);
10126         else
10127             ret = reg_node(pRExC_state, EOL);
10128         Set_Node_Length(ret, 1); /* MJD */
10129         break;
10130     case '.':
10131         nextchar(pRExC_state);
10132         if (RExC_flags & RXf_PMf_SINGLELINE)
10133             ret = reg_node(pRExC_state, SANY);
10134         else
10135             ret = reg_node(pRExC_state, REG_ANY);
10136         *flagp |= HASWIDTH|SIMPLE;
10137         RExC_naughty++;
10138         Set_Node_Length(ret, 1); /* MJD */
10139         break;
10140     case '[':
10141     {
10142         char * const oregcomp_parse = ++RExC_parse;
10143         ret = regclass(pRExC_state, flagp,depth+1,
10144                        FALSE, /* means parse the whole char class */
10145                        TRUE, /* allow multi-char folds */
10146                        FALSE, /* don't silence non-portable warnings. */
10147                        NULL);
10148         if (*RExC_parse != ']') {
10149             RExC_parse = oregcomp_parse;
10150             vFAIL("Unmatched [");
10151         }
10152         nextchar(pRExC_state);
10153         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10154         break;
10155     }
10156     case '(':
10157         nextchar(pRExC_state);
10158         ret = reg(pRExC_state, 1, &flags,depth+1);
10159         if (ret == NULL) {
10160                 if (flags & TRYAGAIN) {
10161                     if (RExC_parse == RExC_end) {
10162                          /* Make parent create an empty node if needed. */
10163                         *flagp |= TRYAGAIN;
10164                         return(NULL);
10165                     }
10166                     goto tryagain;
10167                 }
10168                 return(NULL);
10169         }
10170         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10171         break;
10172     case '|':
10173     case ')':
10174         if (flags & TRYAGAIN) {
10175             *flagp |= TRYAGAIN;
10176             return NULL;
10177         }
10178         vFAIL("Internal urp");
10179                                 /* Supposed to be caught earlier. */
10180         break;
10181     case '{':
10182         if (!regcurly(RExC_parse, FALSE)) {
10183             RExC_parse++;
10184             goto defchar;
10185         }
10186         /* FALL THROUGH */
10187     case '?':
10188     case '+':
10189     case '*':
10190         RExC_parse++;
10191         vFAIL("Quantifier follows nothing");
10192         break;
10193     case '\\':
10194         /* Special Escapes
10195
10196            This switch handles escape sequences that resolve to some kind
10197            of special regop and not to literal text. Escape sequnces that
10198            resolve to literal text are handled below in the switch marked
10199            "Literal Escapes".
10200
10201            Every entry in this switch *must* have a corresponding entry
10202            in the literal escape switch. However, the opposite is not
10203            required, as the default for this switch is to jump to the
10204            literal text handling code.
10205         */
10206         switch ((U8)*++RExC_parse) {
10207             U8 arg;
10208         /* Special Escapes */
10209         case 'A':
10210             RExC_seen_zerolen++;
10211             ret = reg_node(pRExC_state, SBOL);
10212             *flagp |= SIMPLE;
10213             goto finish_meta_pat;
10214         case 'G':
10215             ret = reg_node(pRExC_state, GPOS);
10216             RExC_seen |= REG_SEEN_GPOS;
10217             *flagp |= SIMPLE;
10218             goto finish_meta_pat;
10219         case 'K':
10220             RExC_seen_zerolen++;
10221             ret = reg_node(pRExC_state, KEEPS);
10222             *flagp |= SIMPLE;
10223             /* XXX:dmq : disabling in-place substitution seems to
10224              * be necessary here to avoid cases of memory corruption, as
10225              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10226              */
10227             RExC_seen |= REG_SEEN_LOOKBEHIND;
10228             goto finish_meta_pat;
10229         case 'Z':
10230             ret = reg_node(pRExC_state, SEOL);
10231             *flagp |= SIMPLE;
10232             RExC_seen_zerolen++;                /* Do not optimize RE away */
10233             goto finish_meta_pat;
10234         case 'z':
10235             ret = reg_node(pRExC_state, EOS);
10236             *flagp |= SIMPLE;
10237             RExC_seen_zerolen++;                /* Do not optimize RE away */
10238             goto finish_meta_pat;
10239         case 'C':
10240             ret = reg_node(pRExC_state, CANY);
10241             RExC_seen |= REG_SEEN_CANY;
10242             *flagp |= HASWIDTH|SIMPLE;
10243             goto finish_meta_pat;
10244         case 'X':
10245             ret = reg_node(pRExC_state, CLUMP);
10246             *flagp |= HASWIDTH;
10247             goto finish_meta_pat;
10248
10249         case 'W':
10250             invert = 1;
10251             /* FALLTHROUGH */
10252         case 'w':
10253             arg = ANYOF_WORDCHAR;
10254             goto join_posix;
10255
10256         case 'b':
10257             RExC_seen_zerolen++;
10258             RExC_seen |= REG_SEEN_LOOKBEHIND;
10259             op = BOUND + get_regex_charset(RExC_flags);
10260             if (op > BOUNDA) {  /* /aa is same as /a */
10261                 op = BOUNDA;
10262             }
10263             ret = reg_node(pRExC_state, op);
10264             FLAGS(ret) = get_regex_charset(RExC_flags);
10265             *flagp |= SIMPLE;
10266             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10267                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10268             }
10269             goto finish_meta_pat;
10270         case 'B':
10271             RExC_seen_zerolen++;
10272             RExC_seen |= REG_SEEN_LOOKBEHIND;
10273             op = NBOUND + get_regex_charset(RExC_flags);
10274             if (op > NBOUNDA) { /* /aa is same as /a */
10275                 op = NBOUNDA;
10276             }
10277             ret = reg_node(pRExC_state, op);
10278             FLAGS(ret) = get_regex_charset(RExC_flags);
10279             *flagp |= SIMPLE;
10280             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10281                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10282             }
10283             goto finish_meta_pat;
10284
10285         case 'D':
10286             invert = 1;
10287             /* FALLTHROUGH */
10288         case 'd':
10289             arg = ANYOF_DIGIT;
10290             goto join_posix;
10291
10292         case 'R':
10293             ret = reg_node(pRExC_state, LNBREAK);
10294             *flagp |= HASWIDTH|SIMPLE;
10295             goto finish_meta_pat;
10296
10297         case 'H':
10298             invert = 1;
10299             /* FALLTHROUGH */
10300         case 'h':
10301             arg = ANYOF_BLANK;
10302             op = POSIXU;
10303             goto join_posix_op_known;
10304
10305         case 'V':
10306             invert = 1;
10307             /* FALLTHROUGH */
10308         case 'v':
10309             arg = ANYOF_VERTWS;
10310             op = POSIXU;
10311             goto join_posix_op_known;
10312
10313         case 'S':
10314             invert = 1;
10315             /* FALLTHROUGH */
10316         case 's':
10317             arg = ANYOF_SPACE;
10318
10319         join_posix:
10320
10321             op = POSIXD + get_regex_charset(RExC_flags);
10322             if (op > POSIXA) {  /* /aa is same as /a */
10323                 op = POSIXA;
10324             }
10325
10326         join_posix_op_known:
10327
10328             if (invert) {
10329                 op += NPOSIXD - POSIXD;
10330             }
10331
10332             ret = reg_node(pRExC_state, op);
10333             if (! SIZE_ONLY) {
10334                 FLAGS(ret) = namedclass_to_classnum(arg);
10335             }
10336
10337             *flagp |= HASWIDTH|SIMPLE;
10338             /* FALL THROUGH */
10339
10340          finish_meta_pat:
10341             nextchar(pRExC_state);
10342             Set_Node_Length(ret, 2); /* MJD */
10343             break;
10344         case 'p':
10345         case 'P':
10346             {
10347 #ifdef DEBUGGING
10348                 char* parse_start = RExC_parse - 2;
10349 #endif
10350
10351                 RExC_parse--;
10352
10353                 ret = regclass(pRExC_state, flagp,depth+1,
10354                                TRUE, /* means just parse this element */
10355                                FALSE, /* don't allow multi-char folds */
10356                                FALSE, /* don't silence non-portable warnings.
10357                                          It would be a bug if these returned
10358                                          non-portables */
10359                                NULL);
10360
10361                 RExC_parse--;
10362
10363                 Set_Node_Offset(ret, parse_start + 2);
10364                 Set_Node_Cur_Length(ret);
10365                 nextchar(pRExC_state);
10366             }
10367             break;
10368         case 'N':
10369             /* Handle \N and \N{NAME} with multiple code points here and not
10370              * below because it can be multicharacter. join_exact() will join
10371              * them up later on.  Also this makes sure that things like
10372              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10373              * The options to the grok function call causes it to fail if the
10374              * sequence is just a single code point.  We then go treat it as
10375              * just another character in the current EXACT node, and hence it
10376              * gets uniform treatment with all the other characters.  The
10377              * special treatment for quantifiers is not needed for such single
10378              * character sequences */
10379             ++RExC_parse;
10380             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10381                                 FALSE /* not strict */ )) {
10382                 RExC_parse--;
10383                 goto defchar;
10384             }
10385             break;
10386         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10387         parse_named_seq:
10388         {
10389             char ch= RExC_parse[1];
10390             if (ch != '<' && ch != '\'' && ch != '{') {
10391                 RExC_parse++;
10392                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10393             } else {
10394                 /* this pretty much dupes the code for (?P=...) in reg(), if
10395                    you change this make sure you change that */
10396                 char* name_start = (RExC_parse += 2);
10397                 U32 num = 0;
10398                 SV *sv_dat = reg_scan_name(pRExC_state,
10399                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10400                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10401                 if (RExC_parse == name_start || *RExC_parse != ch)
10402                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10403
10404                 if (!SIZE_ONLY) {
10405                     num = add_data( pRExC_state, 1, "S" );
10406                     RExC_rxi->data->data[num]=(void*)sv_dat;
10407                     SvREFCNT_inc_simple_void(sv_dat);
10408                 }
10409
10410                 RExC_sawback = 1;
10411                 ret = reganode(pRExC_state,
10412                                ((! FOLD)
10413                                  ? NREF
10414                                  : (ASCII_FOLD_RESTRICTED)
10415                                    ? NREFFA
10416                                    : (AT_LEAST_UNI_SEMANTICS)
10417                                      ? NREFFU
10418                                      : (LOC)
10419                                        ? NREFFL
10420                                        : NREFF),
10421                                 num);
10422                 *flagp |= HASWIDTH;
10423
10424                 /* override incorrect value set in reganode MJD */
10425                 Set_Node_Offset(ret, parse_start+1);
10426                 Set_Node_Cur_Length(ret); /* MJD */
10427                 nextchar(pRExC_state);
10428
10429             }
10430             break;
10431         }
10432         case 'g':
10433         case '1': case '2': case '3': case '4':
10434         case '5': case '6': case '7': case '8': case '9':
10435             {
10436                 I32 num;
10437                 bool isg = *RExC_parse == 'g';
10438                 bool isrel = 0;
10439                 bool hasbrace = 0;
10440                 if (isg) {
10441                     RExC_parse++;
10442                     if (*RExC_parse == '{') {
10443                         RExC_parse++;
10444                         hasbrace = 1;
10445                     }
10446                     if (*RExC_parse == '-') {
10447                         RExC_parse++;
10448                         isrel = 1;
10449                     }
10450                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10451                         if (isrel) RExC_parse--;
10452                         RExC_parse -= 2;
10453                         goto parse_named_seq;
10454                 }   }
10455                 num = atoi(RExC_parse);
10456                 if (isg && num == 0)
10457                     vFAIL("Reference to invalid group 0");
10458                 if (isrel) {
10459                     num = RExC_npar - num;
10460                     if (num < 1)
10461                         vFAIL("Reference to nonexistent or unclosed group");
10462                 }
10463                 if (!isg && num > 9 && num >= RExC_npar)
10464                     /* Probably a character specified in octal, e.g. \35 */
10465                     goto defchar;
10466                 else {
10467                     char * const parse_start = RExC_parse - 1; /* MJD */
10468                     while (isDIGIT(*RExC_parse))
10469                         RExC_parse++;
10470                     if (parse_start == RExC_parse - 1)
10471                         vFAIL("Unterminated \\g... pattern");
10472                     if (hasbrace) {
10473                         if (*RExC_parse != '}')
10474                             vFAIL("Unterminated \\g{...} pattern");
10475                         RExC_parse++;
10476                     }
10477                     if (!SIZE_ONLY) {
10478                         if (num > (I32)RExC_rx->nparens)
10479                             vFAIL("Reference to nonexistent group");
10480                     }
10481                     RExC_sawback = 1;
10482                     ret = reganode(pRExC_state,
10483                                    ((! FOLD)
10484                                      ? REF
10485                                      : (ASCII_FOLD_RESTRICTED)
10486                                        ? REFFA
10487                                        : (AT_LEAST_UNI_SEMANTICS)
10488                                          ? REFFU
10489                                          : (LOC)
10490                                            ? REFFL
10491                                            : REFF),
10492                                     num);
10493                     *flagp |= HASWIDTH;
10494
10495                     /* override incorrect value set in reganode MJD */
10496                     Set_Node_Offset(ret, parse_start+1);
10497                     Set_Node_Cur_Length(ret); /* MJD */
10498                     RExC_parse--;
10499                     nextchar(pRExC_state);
10500                 }
10501             }
10502             break;
10503         case '\0':
10504             if (RExC_parse >= RExC_end)
10505                 FAIL("Trailing \\");
10506             /* FALL THROUGH */
10507         default:
10508             /* Do not generate "unrecognized" warnings here, we fall
10509                back into the quick-grab loop below */
10510             parse_start--;
10511             goto defchar;
10512         }
10513         break;
10514
10515     case '#':
10516         if (RExC_flags & RXf_PMf_EXTENDED) {
10517             if ( reg_skipcomment( pRExC_state ) )
10518                 goto tryagain;
10519         }
10520         /* FALL THROUGH */
10521
10522     default:
10523
10524             parse_start = RExC_parse - 1;
10525
10526             RExC_parse++;
10527
10528         defchar: {
10529             STRLEN len = 0;
10530             UV ender;
10531             char *p;
10532             char *s;
10533 #define MAX_NODE_STRING_SIZE 127
10534             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10535             char *s0;
10536             U8 upper_parse = MAX_NODE_STRING_SIZE;
10537             STRLEN foldlen;
10538             U8 node_type;
10539             bool next_is_quantifier;
10540             char * oldp = NULL;
10541
10542             /* If a folding node contains only code points that don't
10543              * participate in folds, it can be changed into an EXACT node,
10544              * which allows the optimizer more things to look for */
10545             bool maybe_exact;
10546
10547             ender = 0;
10548             node_type = compute_EXACTish(pRExC_state);
10549             ret = reg_node(pRExC_state, node_type);
10550
10551             /* In pass1, folded, we use a temporary buffer instead of the
10552              * actual node, as the node doesn't exist yet */
10553             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10554
10555             s0 = s;
10556
10557         reparse:
10558
10559             /* We do the EXACTFish to EXACT node only if folding, and not if in
10560              * locale, as whether a character folds or not isn't known until
10561              * runtime */
10562             maybe_exact = FOLD && ! LOC;
10563
10564             /* XXX The node can hold up to 255 bytes, yet this only goes to
10565              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10566              * 255 allows us to not have to worry about overflow due to
10567              * converting to utf8 and fold expansion, but that value is
10568              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10569              * split up by this limit into a single one using the real max of
10570              * 255.  Even at 127, this breaks under rare circumstances.  If
10571              * folding, we do not want to split a node at a character that is a
10572              * non-final in a multi-char fold, as an input string could just
10573              * happen to want to match across the node boundary.  The join
10574              * would solve that problem if the join actually happens.  But a
10575              * series of more than two nodes in a row each of 127 would cause
10576              * the first join to succeed to get to 254, but then there wouldn't
10577              * be room for the next one, which could at be one of those split
10578              * multi-char folds.  I don't know of any fool-proof solution.  One
10579              * could back off to end with only a code point that isn't such a
10580              * non-final, but it is possible for there not to be any in the
10581              * entire node. */
10582             for (p = RExC_parse - 1;
10583                  len < upper_parse && p < RExC_end;
10584                  len++)
10585             {
10586                 oldp = p;
10587
10588                 if (RExC_flags & RXf_PMf_EXTENDED)
10589                     p = regwhite( pRExC_state, p );
10590                 switch ((U8)*p) {
10591                 case '^':
10592                 case '$':
10593                 case '.':
10594                 case '[':
10595                 case '(':
10596                 case ')':
10597                 case '|':
10598                     goto loopdone;
10599                 case '\\':
10600                     /* Literal Escapes Switch
10601
10602                        This switch is meant to handle escape sequences that
10603                        resolve to a literal character.
10604
10605                        Every escape sequence that represents something
10606                        else, like an assertion or a char class, is handled
10607                        in the switch marked 'Special Escapes' above in this
10608                        routine, but also has an entry here as anything that
10609                        isn't explicitly mentioned here will be treated as
10610                        an unescaped equivalent literal.
10611                     */
10612
10613                     switch ((U8)*++p) {
10614                     /* These are all the special escapes. */
10615                     case 'A':             /* Start assertion */
10616                     case 'b': case 'B':   /* Word-boundary assertion*/
10617                     case 'C':             /* Single char !DANGEROUS! */
10618                     case 'd': case 'D':   /* digit class */
10619                     case 'g': case 'G':   /* generic-backref, pos assertion */
10620                     case 'h': case 'H':   /* HORIZWS */
10621                     case 'k': case 'K':   /* named backref, keep marker */
10622                     case 'p': case 'P':   /* Unicode property */
10623                               case 'R':   /* LNBREAK */
10624                     case 's': case 'S':   /* space class */
10625                     case 'v': case 'V':   /* VERTWS */
10626                     case 'w': case 'W':   /* word class */
10627                     case 'X':             /* eXtended Unicode "combining character sequence" */
10628                     case 'z': case 'Z':   /* End of line/string assertion */
10629                         --p;
10630                         goto loopdone;
10631
10632                     /* Anything after here is an escape that resolves to a
10633                        literal. (Except digits, which may or may not)
10634                      */
10635                     case 'n':
10636                         ender = '\n';
10637                         p++;
10638                         break;
10639                     case 'N': /* Handle a single-code point named character. */
10640                         /* The options cause it to fail if a multiple code
10641                          * point sequence.  Handle those in the switch() above
10642                          * */
10643                         RExC_parse = p + 1;
10644                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10645                                             flagp, depth, FALSE,
10646                                             FALSE /* not strict */ ))
10647                         {
10648                             RExC_parse = p = oldp;
10649                             goto loopdone;
10650                         }
10651                         p = RExC_parse;
10652                         if (ender > 0xff) {
10653                             REQUIRE_UTF8;
10654                         }
10655                         break;
10656                     case 'r':
10657                         ender = '\r';
10658                         p++;
10659                         break;
10660                     case 't':
10661                         ender = '\t';
10662                         p++;
10663                         break;
10664                     case 'f':
10665                         ender = '\f';
10666                         p++;
10667                         break;
10668                     case 'e':
10669                           ender = ASCII_TO_NATIVE('\033');
10670                         p++;
10671                         break;
10672                     case 'a':
10673                           ender = ASCII_TO_NATIVE('\007');
10674                         p++;
10675                         break;
10676                     case 'o':
10677                         {
10678                             UV result;
10679                             const char* error_msg;
10680
10681                             bool valid = grok_bslash_o(&p,
10682                                                        &result,
10683                                                        &error_msg,
10684                                                        TRUE, /* out warnings */
10685                                                        FALSE, /* not strict */
10686                                                        TRUE, /* Output warnings
10687                                                                 for non-
10688                                                                 portables */
10689                                                        UTF);
10690                             if (! valid) {
10691                                 RExC_parse = p; /* going to die anyway; point
10692                                                    to exact spot of failure */
10693                                 vFAIL(error_msg);
10694                             }
10695                             ender = result;
10696                             if (PL_encoding && ender < 0x100) {
10697                                 goto recode_encoding;
10698                             }
10699                             if (ender > 0xff) {
10700                                 REQUIRE_UTF8;
10701                             }
10702                             break;
10703                         }
10704                     case 'x':
10705                         {
10706                             UV result = UV_MAX; /* initialize to erroneous
10707                                                    value */
10708                             const char* error_msg;
10709
10710                             bool valid = grok_bslash_x(&p,
10711                                                        &result,
10712                                                        &error_msg,
10713                                                        TRUE, /* out warnings */
10714                                                        FALSE, /* not strict */
10715                                                        TRUE, /* Output warnings
10716                                                                 for non-
10717                                                                 portables */
10718                                                        UTF);
10719                             if (! valid) {
10720                                 RExC_parse = p; /* going to die anyway; point
10721                                                    to exact spot of failure */
10722                                 vFAIL(error_msg);
10723                             }
10724                             ender = result;
10725
10726                             if (PL_encoding && ender < 0x100) {
10727                                 goto recode_encoding;
10728                             }
10729                             if (ender > 0xff) {
10730                                 REQUIRE_UTF8;
10731                             }
10732                             break;
10733                         }
10734                     case 'c':
10735                         p++;
10736                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10737                         break;
10738                     case '0': case '1': case '2': case '3':case '4':
10739                     case '5': case '6': case '7':
10740                         if (*p == '0' ||
10741                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10742                         {
10743                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10744                             STRLEN numlen = 3;
10745                             ender = grok_oct(p, &numlen, &flags, NULL);
10746                             if (ender > 0xff) {
10747                                 REQUIRE_UTF8;
10748                             }
10749                             p += numlen;
10750                             if (SIZE_ONLY   /* like \08, \178 */
10751                                 && numlen < 3
10752                                 && p < RExC_end
10753                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10754                             {
10755                                 reg_warn_non_literal_string(
10756                                          p + 1,
10757                                          form_short_octal_warning(p, numlen));
10758                             }
10759                         }
10760                         else {  /* Not to be treated as an octal constant, go
10761                                    find backref */
10762                             --p;
10763                             goto loopdone;
10764                         }
10765                         if (PL_encoding && ender < 0x100)
10766                             goto recode_encoding;
10767                         break;
10768                     recode_encoding:
10769                         if (! RExC_override_recoding) {
10770                             SV* enc = PL_encoding;
10771                             ender = reg_recode((const char)(U8)ender, &enc);
10772                             if (!enc && SIZE_ONLY)
10773                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10774                             REQUIRE_UTF8;
10775                         }
10776                         break;
10777                     case '\0':
10778                         if (p >= RExC_end)
10779                             FAIL("Trailing \\");
10780                         /* FALL THROUGH */
10781                     default:
10782                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10783                             /* Include any { following the alpha to emphasize
10784                              * that it could be part of an escape at some point
10785                              * in the future */
10786                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10787                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10788                         }
10789                         goto normal_default;
10790                     } /* End of switch on '\' */
10791                     break;
10792                 default:    /* A literal character */
10793
10794                     if (! SIZE_ONLY
10795                         && RExC_flags & RXf_PMf_EXTENDED
10796                         && ckWARN(WARN_DEPRECATED)
10797                         && is_PATWS_non_low(p, UTF))
10798                     {
10799                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
10800                                 "Escape literal pattern white space under /x");
10801                     }
10802
10803                   normal_default:
10804                     if (UTF8_IS_START(*p) && UTF) {
10805                         STRLEN numlen;
10806                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10807                                                &numlen, UTF8_ALLOW_DEFAULT);
10808                         p += numlen;
10809                     }
10810                     else
10811                         ender = (U8) *p++;
10812                     break;
10813                 } /* End of switch on the literal */
10814
10815                 /* Here, have looked at the literal character and <ender>
10816                  * contains its ordinal, <p> points to the character after it
10817                  */
10818
10819                 if ( RExC_flags & RXf_PMf_EXTENDED)
10820                     p = regwhite( pRExC_state, p );
10821
10822                 /* If the next thing is a quantifier, it applies to this
10823                  * character only, which means that this character has to be in
10824                  * its own node and can't just be appended to the string in an
10825                  * existing node, so if there are already other characters in
10826                  * the node, close the node with just them, and set up to do
10827                  * this character again next time through, when it will be the
10828                  * only thing in its new node */
10829                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10830                 {
10831                     p = oldp;
10832                     goto loopdone;
10833                 }
10834
10835                 if (FOLD) {
10836                     if (UTF
10837                             /* See comments for join_exact() as to why we fold
10838                              * this non-UTF at compile time */
10839                         || (node_type == EXACTFU
10840                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10841                     {
10842
10843
10844                         /* Prime the casefolded buffer.  Locale rules, which
10845                          * apply only to code points < 256, aren't known until
10846                          * execution, so for them, just output the original
10847                          * character using utf8.  If we start to fold non-UTF
10848                          * patterns, be sure to update join_exact() */
10849                         if (LOC && ender < 256) {
10850                             if (UNI_IS_INVARIANT(ender)) {
10851                                 *s = (U8) ender;
10852                                 foldlen = 1;
10853                             } else {
10854                                 *s = UTF8_TWO_BYTE_HI(ender);
10855                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10856                                 foldlen = 2;
10857                             }
10858                         }
10859                         else {
10860                             UV folded = _to_uni_fold_flags(
10861                                            ender,
10862                                            (U8 *) s,
10863                                            &foldlen,
10864                                            FOLD_FLAGS_FULL
10865                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10866                                                     : (ASCII_FOLD_RESTRICTED)
10867                                                       ? FOLD_FLAGS_NOMIX_ASCII
10868                                                       : 0)
10869                                             );
10870
10871                             /* If this node only contains non-folding code
10872                              * points so far, see if this new one is also
10873                              * non-folding */
10874                             if (maybe_exact) {
10875                                 if (folded != ender) {
10876                                     maybe_exact = FALSE;
10877                                 }
10878                                 else {
10879                                     /* Here the fold is the original; we have
10880                                      * to check further to see if anything
10881                                      * folds to it */
10882                                     if (! PL_utf8_foldable) {
10883                                         SV* swash = swash_init("utf8",
10884                                                            "_Perl_Any_Folds",
10885                                                            &PL_sv_undef, 1, 0);
10886                                         PL_utf8_foldable =
10887                                                     _get_swash_invlist(swash);
10888                                         SvREFCNT_dec_NN(swash);
10889                                     }
10890                                     if (_invlist_contains_cp(PL_utf8_foldable,
10891                                                              ender))
10892                                     {
10893                                         maybe_exact = FALSE;
10894                                     }
10895                                 }
10896                             }
10897                             ender = folded;
10898                         }
10899                         s += foldlen;
10900
10901                         /* The loop increments <len> each time, as all but this
10902                          * path (and the one just below for UTF) through it add
10903                          * a single byte to the EXACTish node.  But this one
10904                          * has changed len to be the correct final value, so
10905                          * subtract one to cancel out the increment that
10906                          * follows */
10907                         len += foldlen - 1;
10908                     }
10909                     else {
10910                         *(s++) = (char) ender;
10911                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10912                     }
10913                 }
10914                 else if (UTF) {
10915                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10916                     if (unilen > 0) {
10917                        s   += unilen;
10918                        len += unilen;
10919                     }
10920
10921                     /* See comment just above for - 1 */
10922                     len--;
10923                 }
10924                 else {
10925                     REGC((char)ender, s++);
10926                 }
10927
10928                 if (next_is_quantifier) {
10929
10930                     /* Here, the next input is a quantifier, and to get here,
10931                      * the current character is the only one in the node.
10932                      * Also, here <len> doesn't include the final byte for this
10933                      * character */
10934                     len++;
10935                     goto loopdone;
10936                 }
10937
10938             } /* End of loop through literal characters */
10939
10940             /* Here we have either exhausted the input or ran out of room in
10941              * the node.  (If we encountered a character that can't be in the
10942              * node, transfer is made directly to <loopdone>, and so we
10943              * wouldn't have fallen off the end of the loop.)  In the latter
10944              * case, we artificially have to split the node into two, because
10945              * we just don't have enough space to hold everything.  This
10946              * creates a problem if the final character participates in a
10947              * multi-character fold in the non-final position, as a match that
10948              * should have occurred won't, due to the way nodes are matched,
10949              * and our artificial boundary.  So back off until we find a non-
10950              * problematic character -- one that isn't at the beginning or
10951              * middle of such a fold.  (Either it doesn't participate in any
10952              * folds, or appears only in the final position of all the folds it
10953              * does participate in.)  A better solution with far fewer false
10954              * positives, and that would fill the nodes more completely, would
10955              * be to actually have available all the multi-character folds to
10956              * test against, and to back-off only far enough to be sure that
10957              * this node isn't ending with a partial one.  <upper_parse> is set
10958              * further below (if we need to reparse the node) to include just
10959              * up through that final non-problematic character that this code
10960              * identifies, so when it is set to less than the full node, we can
10961              * skip the rest of this */
10962             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10963
10964                 const STRLEN full_len = len;
10965
10966                 assert(len >= MAX_NODE_STRING_SIZE);
10967
10968                 /* Here, <s> points to the final byte of the final character.
10969                  * Look backwards through the string until find a non-
10970                  * problematic character */
10971
10972                 if (! UTF) {
10973
10974                     /* These two have no multi-char folds to non-UTF characters
10975                      */
10976                     if (ASCII_FOLD_RESTRICTED || LOC) {
10977                         goto loopdone;
10978                     }
10979
10980                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10981                     len = s - s0 + 1;
10982                 }
10983                 else {
10984                     if (!  PL_NonL1NonFinalFold) {
10985                         PL_NonL1NonFinalFold = _new_invlist_C_array(
10986                                         NonL1_Perl_Non_Final_Folds_invlist);
10987                     }
10988
10989                     /* Point to the first byte of the final character */
10990                     s = (char *) utf8_hop((U8 *) s, -1);
10991
10992                     while (s >= s0) {   /* Search backwards until find
10993                                            non-problematic char */
10994                         if (UTF8_IS_INVARIANT(*s)) {
10995
10996                             /* There are no ascii characters that participate
10997                              * in multi-char folds under /aa.  In EBCDIC, the
10998                              * non-ascii invariants are all control characters,
10999                              * so don't ever participate in any folds. */
11000                             if (ASCII_FOLD_RESTRICTED
11001                                 || ! IS_NON_FINAL_FOLD(*s))
11002                             {
11003                                 break;
11004                             }
11005                         }
11006                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11007
11008                             /* No Latin1 characters participate in multi-char
11009                              * folds under /l */
11010                             if (LOC
11011                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11012                                                                 *s, *(s+1))))
11013                             {
11014                                 break;
11015                             }
11016                         }
11017                         else if (! _invlist_contains_cp(
11018                                         PL_NonL1NonFinalFold,
11019                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11020                         {
11021                             break;
11022                         }
11023
11024                         /* Here, the current character is problematic in that
11025                          * it does occur in the non-final position of some
11026                          * fold, so try the character before it, but have to
11027                          * special case the very first byte in the string, so
11028                          * we don't read outside the string */
11029                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11030                     } /* End of loop backwards through the string */
11031
11032                     /* If there were only problematic characters in the string,
11033                      * <s> will point to before s0, in which case the length
11034                      * should be 0, otherwise include the length of the
11035                      * non-problematic character just found */
11036                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11037                 }
11038
11039                 /* Here, have found the final character, if any, that is
11040                  * non-problematic as far as ending the node without splitting
11041                  * it across a potential multi-char fold.  <len> contains the
11042                  * number of bytes in the node up-to and including that
11043                  * character, or is 0 if there is no such character, meaning
11044                  * the whole node contains only problematic characters.  In
11045                  * this case, give up and just take the node as-is.  We can't
11046                  * do any better */
11047                 if (len == 0) {
11048                     len = full_len;
11049                 } else {
11050
11051                     /* Here, the node does contain some characters that aren't
11052                      * problematic.  If one such is the final character in the
11053                      * node, we are done */
11054                     if (len == full_len) {
11055                         goto loopdone;
11056                     }
11057                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11058
11059                         /* If the final character is problematic, but the
11060                          * penultimate is not, back-off that last character to
11061                          * later start a new node with it */
11062                         p = oldp;
11063                         goto loopdone;
11064                     }
11065
11066                     /* Here, the final non-problematic character is earlier
11067                      * in the input than the penultimate character.  What we do
11068                      * is reparse from the beginning, going up only as far as
11069                      * this final ok one, thus guaranteeing that the node ends
11070                      * in an acceptable character.  The reason we reparse is
11071                      * that we know how far in the character is, but we don't
11072                      * know how to correlate its position with the input parse.
11073                      * An alternate implementation would be to build that
11074                      * correlation as we go along during the original parse,
11075                      * but that would entail extra work for every node, whereas
11076                      * this code gets executed only when the string is too
11077                      * large for the node, and the final two characters are
11078                      * problematic, an infrequent occurrence.  Yet another
11079                      * possible strategy would be to save the tail of the
11080                      * string, and the next time regatom is called, initialize
11081                      * with that.  The problem with this is that unless you
11082                      * back off one more character, you won't be guaranteed
11083                      * regatom will get called again, unless regbranch,
11084                      * regpiece ... are also changed.  If you do back off that
11085                      * extra character, so that there is input guaranteed to
11086                      * force calling regatom, you can't handle the case where
11087                      * just the first character in the node is acceptable.  I
11088                      * (khw) decided to try this method which doesn't have that
11089                      * pitfall; if performance issues are found, we can do a
11090                      * combination of the current approach plus that one */
11091                     upper_parse = len;
11092                     len = 0;
11093                     s = s0;
11094                     goto reparse;
11095                 }
11096             }   /* End of verifying node ends with an appropriate char */
11097
11098         loopdone:   /* Jumped to when encounters something that shouldn't be in
11099                        the node */
11100
11101             /* If 'maybe_exact' is still set here, means there are no
11102              * code points in the node that participate in folds */
11103             if (FOLD && maybe_exact) {
11104                 OP(ret) = EXACT;
11105             }
11106
11107             /* I (khw) don't know if you can get here with zero length, but the
11108              * old code handled this situation by creating a zero-length EXACT
11109              * node.  Might as well be NOTHING instead */
11110             if (len == 0) {
11111                 OP(ret) = NOTHING;
11112             }
11113             else{
11114                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11115             }
11116
11117             RExC_parse = p - 1;
11118             Set_Node_Cur_Length(ret); /* MJD */
11119             nextchar(pRExC_state);
11120             {
11121                 /* len is STRLEN which is unsigned, need to copy to signed */
11122                 IV iv = len;
11123                 if (iv < 0)
11124                     vFAIL("Internal disaster");
11125             }
11126
11127         } /* End of label 'defchar:' */
11128         break;
11129     } /* End of giant switch on input character */
11130
11131     return(ret);
11132 }
11133
11134 STATIC char *
11135 S_regwhite( RExC_state_t *pRExC_state, char *p )
11136 {
11137     const char *e = RExC_end;
11138
11139     PERL_ARGS_ASSERT_REGWHITE;
11140
11141     while (p < e) {
11142         if (isSPACE(*p))
11143             ++p;
11144         else if (*p == '#') {
11145             bool ended = 0;
11146             do {
11147                 if (*p++ == '\n') {
11148                     ended = 1;
11149                     break;
11150                 }
11151             } while (p < e);
11152             if (!ended)
11153                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11154         }
11155         else
11156             break;
11157     }
11158     return p;
11159 }
11160
11161 STATIC char *
11162 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11163 {
11164     /* Returns the next non-pattern-white space, non-comment character (the
11165      * latter only if 'recognize_comment is true) in the string p, which is
11166      * ended by RExC_end.  If there is no line break ending a comment,
11167      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11168     const char *e = RExC_end;
11169
11170     PERL_ARGS_ASSERT_REGPATWS;
11171
11172     while (p < e) {
11173         STRLEN len;
11174         if ((len = is_PATWS_safe(p, e, UTF))) {
11175             p += len;
11176         }
11177         else if (recognize_comment && *p == '#') {
11178             bool ended = 0;
11179             do {
11180                 p++;
11181                 if (is_LNBREAK_safe(p, e, UTF)) {
11182                     ended = 1;
11183                     break;
11184                 }
11185             } while (p < e);
11186             if (!ended)
11187                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11188         }
11189         else
11190             break;
11191     }
11192     return p;
11193 }
11194
11195 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11196    Character classes ([:foo:]) can also be negated ([:^foo:]).
11197    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11198    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11199    but trigger failures because they are currently unimplemented. */
11200
11201 #define POSIXCC_DONE(c)   ((c) == ':')
11202 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11203 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11204
11205 PERL_STATIC_INLINE I32
11206 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me,
11207                     const bool strict)
11208 {
11209     dVAR;
11210     I32 namedclass = OOB_NAMEDCLASS;
11211
11212     PERL_ARGS_ASSERT_REGPPOSIXCC;
11213
11214     if (value == '[' && RExC_parse + 1 < RExC_end &&
11215         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11216         POSIXCC(UCHARAT(RExC_parse)))
11217     {
11218         const char c = UCHARAT(RExC_parse);
11219         char* const s = RExC_parse++;
11220
11221         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11222             RExC_parse++;
11223         if (RExC_parse == RExC_end) {
11224             if (strict) {
11225
11226                 /* Try to give a better location for the error (than the end of
11227                  * the string) by looking for the matching ']' */
11228                 RExC_parse = s;
11229                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11230                     RExC_parse++;
11231                 }
11232                 vFAIL2("Unmatched '%c' in POSIX class", c);
11233             }
11234             /* Grandfather lone [:, [=, [. */
11235             RExC_parse = s;
11236         }
11237         else {
11238             const char* const t = RExC_parse++; /* skip over the c */
11239             assert(*t == c);
11240
11241             if (UCHARAT(RExC_parse) == ']') {
11242                 const char *posixcc = s + 1;
11243                 RExC_parse++; /* skip over the ending ] */
11244
11245                 if (*s == ':') {
11246                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11247                     const I32 skip = t - posixcc;
11248
11249                     /* Initially switch on the length of the name.  */
11250                     switch (skip) {
11251                     case 4:
11252                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11253                                                           this is the Perl \w
11254                                                         */
11255                             namedclass = ANYOF_WORDCHAR;
11256                         break;
11257                     case 5:
11258                         /* Names all of length 5.  */
11259                         /* alnum alpha ascii blank cntrl digit graph lower
11260                            print punct space upper  */
11261                         /* Offset 4 gives the best switch position.  */
11262                         switch (posixcc[4]) {
11263                         case 'a':
11264                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11265                                 namedclass = ANYOF_ALPHA;
11266                             break;
11267                         case 'e':
11268                             if (memEQ(posixcc, "spac", 4)) /* space */
11269                                 namedclass = ANYOF_PSXSPC;
11270                             break;
11271                         case 'h':
11272                             if (memEQ(posixcc, "grap", 4)) /* graph */
11273                                 namedclass = ANYOF_GRAPH;
11274                             break;
11275                         case 'i':
11276                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11277                                 namedclass = ANYOF_ASCII;
11278                             break;
11279                         case 'k':
11280                             if (memEQ(posixcc, "blan", 4)) /* blank */
11281                                 namedclass = ANYOF_BLANK;
11282                             break;
11283                         case 'l':
11284                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11285                                 namedclass = ANYOF_CNTRL;
11286                             break;
11287                         case 'm':
11288                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11289                                 namedclass = ANYOF_ALPHANUMERIC;
11290                             break;
11291                         case 'r':
11292                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11293                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11294                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11295                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11296                             break;
11297                         case 't':
11298                             if (memEQ(posixcc, "digi", 4)) /* digit */
11299                                 namedclass = ANYOF_DIGIT;
11300                             else if (memEQ(posixcc, "prin", 4)) /* print */
11301                                 namedclass = ANYOF_PRINT;
11302                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11303                                 namedclass = ANYOF_PUNCT;
11304                             break;
11305                         }
11306                         break;
11307                     case 6:
11308                         if (memEQ(posixcc, "xdigit", 6))
11309                             namedclass = ANYOF_XDIGIT;
11310                         break;
11311                     }
11312
11313                     if (namedclass == OOB_NAMEDCLASS)
11314                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11315                                       t - s - 1, s + 1);
11316
11317                     /* The #defines are structured so each complement is +1 to
11318                      * the normal one */
11319                     if (complement) {
11320                         namedclass++;
11321                     }
11322                     assert (posixcc[skip] == ':');
11323                     assert (posixcc[skip+1] == ']');
11324                 } else if (!SIZE_ONLY) {
11325                     /* [[=foo=]] and [[.foo.]] are still future. */
11326
11327                     /* adjust RExC_parse so the warning shows after
11328                        the class closes */
11329                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11330                         RExC_parse++;
11331                     SvREFCNT_dec(free_me);
11332                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11333                 }
11334             } else {
11335                 /* Maternal grandfather:
11336                  * "[:" ending in ":" but not in ":]" */
11337                 if (strict) {
11338                     vFAIL("Unmatched '[' in POSIX class");
11339                 }
11340
11341                 /* Grandfather lone [:, [=, [. */
11342                 RExC_parse = s;
11343             }
11344         }
11345     }
11346
11347     return namedclass;
11348 }
11349
11350 STATIC bool
11351 S_could_it_be_POSIX(pTHX_ RExC_state_t *pRExC_state)
11352 {
11353     /* This applies some heuristics at the current parse position (which should
11354      * be at a '[') to see if what follows might be intended to be a [:posix:]
11355      * class.  It returns true if it really is a posix class, of course, but it
11356      * also can return true if it thinks that what was intended was a posix
11357      * class that didn't quite make it.
11358      *
11359      * It will return true for
11360      *      [:alphanumerics:
11361      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11362      *                         ')' indicating the end of the (?[
11363      *      [:any garbage including %^&$ punctuation:]
11364      *
11365      * This is designed to be called only from S_handle_sets; it could be
11366      * easily adapted to be called from the spot at the beginning of regclass()
11367      * that checks to see in a normal bracketed class if the surrounding []
11368      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11369      * change long-standing behavior, so I (khw) didn't do that */
11370     char* p = RExC_parse + 1;
11371     char first_char = *p;
11372
11373     PERL_ARGS_ASSERT_COULD_IT_BE_POSIX;
11374
11375     assert(*(p - 1) == '[');
11376
11377     if (! POSIXCC(first_char)) {
11378         return FALSE;
11379     }
11380
11381     p++;
11382     while (p < RExC_end && isWORDCHAR(*p)) p++;
11383
11384     if (p >= RExC_end) {
11385         return FALSE;
11386     }
11387
11388     if (p - RExC_parse > 2    /* Got at least 1 word character */
11389         && (*p == first_char
11390             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11391     {
11392         return TRUE;
11393     }
11394
11395     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11396
11397     return (p
11398             && p - RExC_parse > 2 /* [:] evaluates to colon;
11399                                       [::] is a bad posix class. */
11400             && first_char == *(p - 1));
11401 }
11402
11403 STATIC regnode *
11404 S_handle_sets(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11405                    char * const oregcomp_parse)
11406 {
11407     /* Handle the (?[...]) construct to do set operations */
11408
11409     U8 curchar;
11410     UV start, end;      /* End points of code point ranges */
11411     SV* result_string;
11412     char *save_end, *save_parse;
11413     SV* final;
11414     STRLEN len;
11415     regnode* node;
11416     AV* stack;
11417     const bool save_fold = FOLD;
11418
11419     GET_RE_DEBUG_FLAGS_DECL;
11420
11421     PERL_ARGS_ASSERT_HANDLE_SETS;
11422
11423     if (LOC) {
11424         vFAIL("(?[...]) not valid in locale");
11425     }
11426     RExC_uni_semantics = 1;
11427
11428     /* This will return only an ANYOF regnode, or (unlikely) something smaller
11429      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11430      * call regclass to handle '[]' so as to not have to reinvent its parsing
11431      * rules here (throwing away the size it computes each time).  And, we exit
11432      * upon an unescaped ']' that isn't one ending a regclass.  To do both
11433      * these things, we need to realize that something preceded by a backslash
11434      * is escaped, so we have to keep track of backslashes */
11435     if (SIZE_ONLY) {
11436
11437         Perl_ck_warner_d(aTHX_
11438             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11439             "The regex_sets feature is experimental" REPORT_LOCATION,
11440             (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11441
11442         while (RExC_parse < RExC_end) {
11443             SV* current = NULL;
11444             RExC_parse = regpatws(pRExC_state, RExC_parse,
11445                                 TRUE); /* means recognize comments */
11446             switch (*RExC_parse) {
11447                 default:
11448                     break;
11449                 case '\\':
11450                     /* Skip the next byte.  This would have to change to skip
11451                      * the next character if we were to recognize and handle
11452                      * specific non-ASCIIs */
11453                     RExC_parse++;
11454                     break;
11455                 case '[':
11456                 {
11457                     /* If this looks like it is a [:posix:] class, leave the
11458                      * parse pointer at the '[' to fool regclass() into
11459                      * thinking it is part of a '[[:posix]]'.  That function
11460                      * will use strict checking to force a syntax error if it
11461                      * doesn't work out to a legitimate class */
11462                     bool is_posix_class = could_it_be_POSIX(pRExC_state);
11463                     if (! is_posix_class) {
11464                         RExC_parse++;
11465                     }
11466
11467                     (void) regclass(pRExC_state, flagp,depth+1,
11468                                     is_posix_class, /* parse the whole char
11469                                                        class only if not a
11470                                                        posix class */
11471                                     FALSE, /* don't allow multi-char folds */
11472                                     TRUE, /* silence non-portable warnings. */
11473                                     &current);
11474                     /* function call leaves parse pointing to the ']', except
11475                      * if we faked it */
11476                     if (is_posix_class) {
11477                         RExC_parse--;
11478                     }
11479
11480                     SvREFCNT_dec(current);   /* In case it returned something */
11481                     break;
11482                 }
11483
11484                 case ']':
11485                     RExC_parse++;
11486                     if (RExC_parse < RExC_end
11487                         && *RExC_parse == ')')
11488                     {
11489                         node = reganode(pRExC_state, ANYOF, 0);
11490                         RExC_size += ANYOF_SKIP;
11491                         nextchar(pRExC_state);
11492                         Set_Node_Length(node,
11493                                 RExC_parse - oregcomp_parse + 1); /* MJD */
11494                         return node;
11495                     }
11496                     goto no_close;
11497             }
11498             RExC_parse++;
11499         }
11500
11501         no_close:
11502         FAIL("Syntax error in (?[...])");
11503     }
11504
11505     /* Pass 2 only after this.  Everything in this construct is a
11506      * metacharacter.  Operands begin with either a '\' (for an escape
11507      * sequence), or a '[' for a bracketed character class.  Any other
11508      * character should be an operator, or parenthesis for grouping.  Both
11509      * types of operands are handled by calling regclass() to parse them.  It
11510      * is called with a parameter to indicate to return the computed inversion
11511      * list.  The parsing here is implemented via a stack.  Each entry on the
11512      * stack is a single character representing one of the operators, or the
11513      * '('; or else a pointer to an operand inversion list. */
11514
11515 #define IS_OPERAND(a)  (! SvIOK(a))
11516
11517     /* The stack starts empty.  It is a syntax error if the first thing parsed
11518      * is a binary operator; everything else is pushed on the stack.  When an
11519      * operand is parsed, the top of the stack is examined.  If it is a binary
11520      * operator, the item before it should be an operand, and both are replaced
11521      * by the result of doing that operation on the new operand and the one on
11522      * the stack.   Thus a sequence of binary operands is reduced to a single
11523      * one before the next one is parsed.
11524      *
11525      * A unary operator may immediately follow a binary in the input, for
11526      * example
11527      *      [a] + ! [b]
11528      * When an operand is parsed and the top of the stack is a unary operator,
11529      * the operation is performed, and then the stack is rechecked to see if
11530      * this new operand is part of a binary operation; if so, it is handled as
11531      * above.
11532      *
11533      * A '(' is simply pushed on the stack; it is valid only if the stack is
11534      * empty, or the top element of the stack is an operator (for which the
11535      * parenthesized expression will become an operand).  By the time the
11536      * corresponding ')' is parsed everything in between should have been
11537      * parsed and evaluated to a single operand (or else is a syntax error),
11538      * and is handled as a regular operand */
11539
11540     stack = newAV();
11541
11542     while (RExC_parse < RExC_end) {
11543         I32 top_index = av_top(stack);
11544         SV** top_ptr;
11545         SV* current = NULL;
11546
11547         /* Skip white space */
11548         RExC_parse = regpatws(pRExC_state, RExC_parse,
11549                                 TRUE); /* means recognize comments */
11550         if (RExC_parse >= RExC_end
11551             || (curchar = UCHARAT(RExC_parse)) == ']')
11552         {   /* Exit loop at the end */
11553             break;
11554         }
11555
11556         switch (curchar) {
11557
11558             default:
11559                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11560                 vFAIL("Unexpected character");
11561
11562             case '\\':
11563                 (void) regclass(pRExC_state, flagp,depth+1,
11564                                 TRUE, /* means parse just the next thing */
11565                                 FALSE, /* don't allow multi-char folds */
11566                                 FALSE, /* don't silence non-portable warnings.
11567                                         */
11568                                 &current);
11569                 /* regclass() will return with parsing just the \ sequence,
11570                  * leaving the parse pointer at the next thing to parse */
11571                 RExC_parse--;
11572                 goto handle_operand;
11573
11574             case '[':   /* Is a bracketed character class */
11575             {
11576                 bool is_posix_class = could_it_be_POSIX(pRExC_state);
11577
11578                 if (! is_posix_class) {
11579                     RExC_parse++;
11580                 }
11581
11582                 (void) regclass(pRExC_state, flagp,depth+1,
11583                                 is_posix_class, /* parse the whole char class
11584                                                    only if not a posix class */
11585                                 FALSE, /* don't allow multi-char folds */
11586                                 FALSE, /* don't silence non-portable warnings.
11587                                         */
11588                                 &current);
11589                 /* function call leaves parse pointing to the ']', except if we
11590                  * faked it */
11591                 if (is_posix_class) {
11592                     RExC_parse--;
11593                 }
11594
11595                 goto handle_operand;
11596             }
11597
11598             case '&':
11599             case '|':
11600             case '+':
11601             case '-':
11602             case '^':
11603                 if (top_index < 0
11604                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11605                     || ! IS_OPERAND(*top_ptr))
11606                 {
11607                     RExC_parse++;
11608                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11609                 }
11610                 av_push(stack, newSVuv(curchar));
11611                 break;
11612
11613             case '!':
11614                 av_push(stack, newSVuv(curchar));
11615                 break;
11616
11617             case '(':
11618                 if (top_index >= 0) {
11619                     top_ptr = av_fetch(stack, top_index, FALSE);
11620                     assert(top_ptr);
11621                     if (IS_OPERAND(*top_ptr)) {
11622                         RExC_parse++;
11623                         vFAIL("Unexpected '(' with no preceding operator");
11624                     }
11625                 }
11626                 av_push(stack, newSVuv(curchar));
11627                 break;
11628
11629             case ')':
11630             {
11631                 SV* lparen;
11632                 if (top_index < 1
11633                     || ! (current = av_pop(stack))
11634                     || ! IS_OPERAND(current)
11635                     || ! (lparen = av_pop(stack))
11636                     || IS_OPERAND(lparen)
11637                     || SvUV(lparen) != '(')
11638                 {
11639                     RExC_parse++;
11640                     vFAIL("Unexpected ')'");
11641                 }
11642                 top_index -= 2;
11643                 SvREFCNT_dec_NN(lparen);
11644
11645                 /* FALL THROUGH */
11646             }
11647
11648               handle_operand:
11649
11650                 /* Here, we have an operand to process, in 'current' */
11651
11652                 if (top_index < 0) {    /* Just push if stack is empty */
11653                     av_push(stack, current);
11654                 }
11655                 else {
11656                     SV* top = av_pop(stack);
11657                     char current_operator;
11658
11659                     if (IS_OPERAND(top)) {
11660                         vFAIL("Operand with no preceding operator");
11661                     }
11662                     current_operator = (char) SvUV(top);
11663                     switch (current_operator) {
11664                         case '(':   /* Push the '(' back on followed by the new
11665                                        operand */
11666                             av_push(stack, top);
11667                             av_push(stack, current);
11668                             SvREFCNT_inc(top);  /* Counters the '_dec' done
11669                                                    just after the 'break', so
11670                                                    it doesn't get wrongly freed
11671                                                  */
11672                             break;
11673
11674                         case '!':
11675                             _invlist_invert(current);
11676
11677                             /* Unlike binary operators, the top of the stack,
11678                              * now that this unary one has been popped off, may
11679                              * legally be an operator, and we now have operand
11680                              * for it. */
11681                             top_index--;
11682                             SvREFCNT_dec_NN(top);
11683                             goto handle_operand;
11684
11685                         case '&':
11686                             _invlist_intersection(av_pop(stack),
11687                                                    current,
11688                                                    &current);
11689                             av_push(stack, current);
11690                             break;
11691
11692                         case '|':
11693                         case '+':
11694                             _invlist_union(av_pop(stack), current, &current);
11695                             av_push(stack, current);
11696                             break;
11697
11698                         case '-':
11699                             _invlist_subtract(av_pop(stack), current, &current);
11700                             av_push(stack, current);
11701                             break;
11702
11703                         case '^':   /* The union minus the intersection */
11704                         {
11705                             SV* i = NULL;
11706                             SV* u = NULL;
11707                             SV* element;
11708
11709                             element = av_pop(stack);
11710                             _invlist_union(element, current, &u);
11711                             _invlist_intersection(element, current, &i);
11712                             _invlist_subtract(u, i, &current);
11713                             av_push(stack, current);
11714                             SvREFCNT_dec_NN(i);
11715                             SvREFCNT_dec_NN(u);
11716                             SvREFCNT_dec_NN(element);
11717                             break;
11718                         }
11719
11720                         default:
11721                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11722                 }
11723                 SvREFCNT_dec_NN(top);
11724             }
11725         }
11726
11727         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11728     }
11729
11730     if (av_top(stack) < 0   /* Was empty */
11731         || ((final = av_pop(stack)) == NULL)
11732         || ! IS_OPERAND(final)
11733         || av_top(stack) >= 0)  /* More left on stack */
11734     {
11735         vFAIL("Incomplete expression within '(?[ ])'");
11736     }
11737
11738     invlist_iterinit(final);
11739
11740     /* Here, 'final' is the resultant inversion list of evaluating the
11741      * expression.  Feed it to regclass() to generate the real resultant node.
11742      * regclass() is expecting a string of ranges and individual code points */
11743     result_string = newSVpvs("");
11744     while (invlist_iternext(final, &start, &end)) {
11745         if (start == end) {
11746             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
11747         }
11748         else {
11749             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
11750                                                      start,          end);
11751         }
11752     }
11753
11754     save_parse = RExC_parse;
11755     RExC_parse = SvPV(result_string, len);
11756     save_end = RExC_end;
11757     RExC_end = RExC_parse + len;
11758
11759     /* We turn off folding around the call, as the class we have constructed
11760      * already has all folding taken into consideration, and we don't want
11761      * regclass() to add to that */
11762     RExC_flags &= ~RXf_PMf_FOLD;
11763     node = regclass(pRExC_state, flagp,depth+1,
11764                     FALSE, /* means parse the whole char class */
11765                     FALSE, /* don't allow multi-char folds */
11766                     TRUE, /* silence non-portable warnings.  The above may very
11767                              well have generated non-portable code points, but
11768                              they're valid on this machine */
11769                     NULL);
11770     if (save_fold) {
11771         RExC_flags |= RXf_PMf_FOLD;
11772     }
11773     RExC_parse = save_parse + 1;
11774     RExC_end = save_end;
11775     SvREFCNT_dec_NN(final);
11776     SvREFCNT_dec_NN(result_string);
11777     SvREFCNT_dec_NN(stack);
11778
11779     nextchar(pRExC_state);
11780     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
11781     return node;
11782 }
11783 #undef IS_OPERAND
11784
11785 /* The names of properties whose definitions are not known at compile time are
11786  * stored in this SV, after a constant heading.  So if the length has been
11787  * changed since initialization, then there is a run-time definition. */
11788 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11789
11790 STATIC regnode *
11791 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11792                  const bool stop_at_1,  /* Just parse the next thing, don't
11793                                            look for a full character class */
11794                  bool allow_multi_folds,
11795                  const bool silence_non_portable,   /* Don't output warnings
11796                                                        about too large
11797                                                        characters */
11798                  SV** ret_invlist)  /* Return an inversion list, not a node */
11799 {
11800     /* parse a bracketed class specification.  Most of these will produce an
11801      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
11802      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
11803      * under /i with multi-character folds: it will be rewritten following the
11804      * paradigm of this example, where the <multi-fold>s are characters which
11805      * fold to multiple character sequences:
11806      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11807      * gets effectively rewritten as:
11808      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11809      * reg() gets called (recursively) on the rewritten version, and this
11810      * function will return what it constructs.  (Actually the <multi-fold>s
11811      * aren't physically removed from the [abcdefghi], it's just that they are
11812      * ignored in the recursion by means of a flag:
11813      * <RExC_in_multi_char_class>.)
11814      *
11815      * ANYOF nodes contain a bit map for the first 256 characters, with the
11816      * corresponding bit set if that character is in the list.  For characters
11817      * above 255, a range list or swash is used.  There are extra bits for \w,
11818      * etc. in locale ANYOFs, as what these match is not determinable at
11819      * compile time */
11820
11821     dVAR;
11822     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11823     IV range = 0;
11824     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11825     regnode *ret;
11826     STRLEN numlen;
11827     IV namedclass = OOB_NAMEDCLASS;
11828     char *rangebegin = NULL;
11829     bool need_class = 0;
11830     SV *listsv = NULL;
11831     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11832                                       than just initialized.  */
11833     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11834     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11835                                extended beyond the Latin1 range */
11836     UV element_count = 0;   /* Number of distinct elements in the class.
11837                                Optimizations may be possible if this is tiny */
11838     AV * multi_char_matches = NULL; /* Code points that fold to more than one
11839                                        character; used under /i */
11840     UV n;
11841     char * stop_ptr = RExC_end;    /* where to stop parsing */
11842     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
11843                                                    space? */
11844     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
11845
11846     /* Unicode properties are stored in a swash; this holds the current one
11847      * being parsed.  If this swash is the only above-latin1 component of the
11848      * character class, an optimization is to pass it directly on to the
11849      * execution engine.  Otherwise, it is set to NULL to indicate that there
11850      * are other things in the class that have to be dealt with at execution
11851      * time */
11852     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11853
11854     /* Set if a component of this character class is user-defined; just passed
11855      * on to the engine */
11856     bool has_user_defined_property = FALSE;
11857
11858     /* inversion list of code points this node matches only when the target
11859      * string is in UTF-8.  (Because is under /d) */
11860     SV* depends_list = NULL;
11861
11862     /* inversion list of code points this node matches.  For much of the
11863      * function, it includes only those that match regardless of the utf8ness
11864      * of the target string */
11865     SV* cp_list = NULL;
11866
11867 #ifdef EBCDIC
11868     /* In a range, counts how many 0-2 of the ends of it came from literals,
11869      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11870     UV literal_endpoint = 0;
11871 #endif
11872     bool invert = FALSE;    /* Is this class to be complemented */
11873
11874     /* Is there any thing like \W or [:^digit:] that matches above the legal
11875      * Unicode range? */
11876     bool runtime_posix_matches_above_Unicode = FALSE;
11877
11878     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11879         case we need to change the emitted regop to an EXACT. */
11880     const char * orig_parse = RExC_parse;
11881     const I32 orig_size = RExC_size;
11882     GET_RE_DEBUG_FLAGS_DECL;
11883
11884     PERL_ARGS_ASSERT_REGCLASS;
11885 #ifndef DEBUGGING
11886     PERL_UNUSED_ARG(depth);
11887 #endif
11888
11889     DEBUG_PARSE("clas");
11890
11891     /* Assume we are going to generate an ANYOF node. */
11892     ret = reganode(pRExC_state, ANYOF, 0);
11893
11894     if (SIZE_ONLY) {
11895         RExC_size += ANYOF_SKIP;
11896         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11897     }
11898     else {
11899         ANYOF_FLAGS(ret) = 0;
11900
11901         RExC_emit += ANYOF_SKIP;
11902         if (LOC) {
11903             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11904         }
11905         listsv = newSVpvs("# comment\n");
11906         initial_listsv_len = SvCUR(listsv);
11907     }
11908
11909     if (skip_white) {
11910         RExC_parse = regpatws(pRExC_state, RExC_parse,
11911                               FALSE /* means don't recognize comments */);
11912     }
11913
11914     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11915         RExC_parse++;
11916         invert = TRUE;
11917         allow_multi_folds = FALSE;
11918         RExC_naughty++;
11919         if (skip_white) {
11920             RExC_parse = regpatws(pRExC_state, RExC_parse,
11921                                   FALSE /* means don't recognize comments */);
11922         }
11923     }
11924
11925     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
11926     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
11927         const char *s = RExC_parse;
11928         const char  c = *s++;
11929
11930         while (isWORDCHAR(*s))
11931             s++;
11932         if (*s && c == *s && s[1] == ']') {
11933             SAVEFREESV(RExC_rx_sv);
11934             SAVEFREESV(listsv);
11935             ckWARN3reg(s+2,
11936                        "POSIX syntax [%c %c] belongs inside character classes",
11937                        c, c);
11938             (void)ReREFCNT_inc(RExC_rx_sv);
11939             SvREFCNT_inc_simple_void_NN(listsv);
11940         }
11941     }
11942
11943     /* If the caller wants us to just parse a single element, accomplish this
11944      * by faking the loop ending condition */
11945     if (stop_at_1 && RExC_end > RExC_parse) {
11946         stop_ptr = RExC_parse + 1;
11947     }
11948
11949     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
11950     if (UCHARAT(RExC_parse) == ']')
11951         goto charclassloop;
11952
11953 parseit:
11954     while (1) {
11955         if  (RExC_parse >= stop_ptr) {
11956             break;
11957         }
11958
11959         if (skip_white) {
11960             RExC_parse = regpatws(pRExC_state, RExC_parse,
11961                                   FALSE /* means don't recognize comments */);
11962         }
11963
11964         if  (UCHARAT(RExC_parse) == ']') {
11965             break;
11966         }
11967
11968     charclassloop:
11969
11970         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11971         save_value = value;
11972         save_prevvalue = prevvalue;
11973
11974         if (!range) {
11975             rangebegin = RExC_parse;
11976             element_count++;
11977         }
11978         if (UTF) {
11979             value = utf8n_to_uvchr((U8*)RExC_parse,
11980                                    RExC_end - RExC_parse,
11981                                    &numlen, UTF8_ALLOW_DEFAULT);
11982             RExC_parse += numlen;
11983         }
11984         else
11985             value = UCHARAT(RExC_parse++);
11986
11987         if (value == '['
11988             && RExC_parse < RExC_end
11989             && POSIXCC(UCHARAT(RExC_parse)))
11990         {
11991             namedclass = regpposixcc(pRExC_state, value, listsv, strict);
11992         }
11993         else if (value == '\\') {
11994             if (UTF) {
11995                 value = utf8n_to_uvchr((U8*)RExC_parse,
11996                                    RExC_end - RExC_parse,
11997                                    &numlen, UTF8_ALLOW_DEFAULT);
11998                 RExC_parse += numlen;
11999             }
12000             else
12001                 value = UCHARAT(RExC_parse++);
12002
12003             /* Some compilers cannot handle switching on 64-bit integer
12004              * values, therefore value cannot be an UV.  Yes, this will
12005              * be a problem later if we want switch on Unicode.
12006              * A similar issue a little bit later when switching on
12007              * namedclass. --jhi */
12008
12009             /* If the \ is escaping white space when white space is being
12010              * skipped, it means that that white space is wanted literally, and
12011              * is already in 'value'.  Otherwise, need to translate the escape
12012              * into what it signifies. */
12013             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12014
12015             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
12016             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
12017             case 's':   namedclass = ANYOF_SPACE;       break;
12018             case 'S':   namedclass = ANYOF_NSPACE;      break;
12019             case 'd':   namedclass = ANYOF_DIGIT;       break;
12020             case 'D':   namedclass = ANYOF_NDIGIT;      break;
12021             case 'v':   namedclass = ANYOF_VERTWS;      break;
12022             case 'V':   namedclass = ANYOF_NVERTWS;     break;
12023             case 'h':   namedclass = ANYOF_HORIZWS;     break;
12024             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
12025             case 'N':  /* Handle \N{NAME} in class */
12026                 {
12027                     /* We only pay attention to the first char of
12028                     multichar strings being returned. I kinda wonder
12029                     if this makes sense as it does change the behaviour
12030                     from earlier versions, OTOH that behaviour was broken
12031                     as well. */
12032                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12033                                       TRUE, /* => charclass */
12034                                       strict))
12035                     {
12036                         goto parseit;
12037                     }
12038                 }
12039                 break;
12040             case 'p':
12041             case 'P':
12042                 {
12043                 char *e;
12044
12045                 /* We will handle any undefined properties ourselves */
12046                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12047
12048                 if (RExC_parse >= RExC_end)
12049                     vFAIL2("Empty \\%c{}", (U8)value);
12050                 if (*RExC_parse == '{') {
12051                     const U8 c = (U8)value;
12052                     e = strchr(RExC_parse++, '}');
12053                     if (!e)
12054                         vFAIL2("Missing right brace on \\%c{}", c);
12055                     while (isSPACE(UCHARAT(RExC_parse)))
12056                         RExC_parse++;
12057                     if (e == RExC_parse)
12058                         vFAIL2("Empty \\%c{}", c);
12059                     n = e - RExC_parse;
12060                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12061                         n--;
12062                 }
12063                 else {
12064                     e = RExC_parse;
12065                     n = 1;
12066                 }
12067                 if (!SIZE_ONLY) {
12068                     SV* invlist;
12069                     char* name;
12070
12071                     if (UCHARAT(RExC_parse) == '^') {
12072                          RExC_parse++;
12073                          n--;
12074                          /* toggle.  (The rhs xor gets the single bit that
12075                           * differs between P and p; the other xor inverts just
12076                           * that bit) */
12077                          value ^= 'P' ^ 'p';
12078
12079                          while (isSPACE(UCHARAT(RExC_parse))) {
12080                               RExC_parse++;
12081                               n--;
12082                          }
12083                     }
12084                     /* Try to get the definition of the property into
12085                      * <invlist>.  If /i is in effect, the effective property
12086                      * will have its name be <__NAME_i>.  The design is
12087                      * discussed in commit
12088                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12089                     Newx(name, n + sizeof("_i__\n"), char);
12090
12091                     sprintf(name, "%s%.*s%s\n",
12092                                     (FOLD) ? "__" : "",
12093                                     (int)n,
12094                                     RExC_parse,
12095                                     (FOLD) ? "_i" : ""
12096                     );
12097
12098                     /* Look up the property name, and get its swash and
12099                      * inversion list, if the property is found  */
12100                     if (swash) {
12101                         SvREFCNT_dec_NN(swash);
12102                     }
12103                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
12104                                              1, /* binary */
12105                                              0, /* not tr/// */
12106                                              NULL, /* No inversion list */
12107                                              &swash_init_flags
12108                                             );
12109                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12110                         if (swash) {
12111                             SvREFCNT_dec_NN(swash);
12112                             swash = NULL;
12113                         }
12114
12115                         /* Here didn't find it.  It could be a user-defined
12116                          * property that will be available at run-time.  If we
12117                          * accept only compile-time properties, is an error;
12118                          * otherwise add it to the list for run-time look up */
12119                         if (ret_invlist) {
12120                             RExC_parse = e + 1;
12121                             vFAIL3("Property '%.*s' is unknown", (int) n, name);
12122                         }
12123                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12124                                         (value == 'p' ? '+' : '!'),
12125                                         name);
12126                         has_user_defined_property = TRUE;
12127
12128                         /* We don't know yet, so have to assume that the
12129                          * property could match something in the Latin1 range,
12130                          * hence something that isn't utf8.  Note that this
12131                          * would cause things in <depends_list> to match
12132                          * inappropriately, except that any \p{}, including
12133                          * this one forces Unicode semantics, which means there
12134                          * is <no depends_list> */
12135                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12136                     }
12137                     else {
12138
12139                         /* Here, did get the swash and its inversion list.  If
12140                          * the swash is from a user-defined property, then this
12141                          * whole character class should be regarded as such */
12142                         has_user_defined_property =
12143                                     (swash_init_flags
12144                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12145
12146                         /* Invert if asking for the complement */
12147                         if (value == 'P') {
12148                             _invlist_union_complement_2nd(properties,
12149                                                           invlist,
12150                                                           &properties);
12151
12152                             /* The swash can't be used as-is, because we've
12153                              * inverted things; delay removing it to here after
12154                              * have copied its invlist above */
12155                             SvREFCNT_dec_NN(swash);
12156                             swash = NULL;
12157                         }
12158                         else {
12159                             _invlist_union(properties, invlist, &properties);
12160                         }
12161                     }
12162                     Safefree(name);
12163                 }
12164                 RExC_parse = e + 1;
12165                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12166                                                 named */
12167
12168                 /* \p means they want Unicode semantics */
12169                 RExC_uni_semantics = 1;
12170                 }
12171                 break;
12172             case 'n':   value = '\n';                   break;
12173             case 'r':   value = '\r';                   break;
12174             case 't':   value = '\t';                   break;
12175             case 'f':   value = '\f';                   break;
12176             case 'b':   value = '\b';                   break;
12177             case 'e':   value = ASCII_TO_NATIVE('\033');break;
12178             case 'a':   value = ASCII_TO_NATIVE('\007');break;
12179             case 'o':
12180                 RExC_parse--;   /* function expects to be pointed at the 'o' */
12181                 {
12182                     const char* error_msg;
12183                     bool valid = grok_bslash_o(&RExC_parse,
12184                                                &value,
12185                                                &error_msg,
12186                                                SIZE_ONLY,   /* warnings in pass
12187                                                                1 only */
12188                                                strict,
12189                                                silence_non_portable,
12190                                                UTF);
12191                     if (! valid) {
12192                         vFAIL(error_msg);
12193                     }
12194                 }
12195                 if (PL_encoding && value < 0x100) {
12196                     goto recode_encoding;
12197                 }
12198                 break;
12199             case 'x':
12200                 RExC_parse--;   /* function expects to be pointed at the 'x' */
12201                 {
12202                     const char* error_msg;
12203                     bool valid = grok_bslash_x(&RExC_parse,
12204                                                &value,
12205                                                &error_msg,
12206                                                TRUE, /* Output warnings */
12207                                                strict,
12208                                                silence_non_portable,
12209                                                UTF);
12210                     if (! valid) {
12211                         vFAIL(error_msg);
12212                     }
12213                 }
12214                 if (PL_encoding && value < 0x100)
12215                     goto recode_encoding;
12216                 break;
12217             case 'c':
12218                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12219                 break;
12220             case '0': case '1': case '2': case '3': case '4':
12221             case '5': case '6': case '7':
12222                 {
12223                     /* Take 1-3 octal digits */
12224                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12225                     numlen = (strict) ? 4 : 3;
12226                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12227                     RExC_parse += numlen;
12228                     if (numlen != 3) {
12229                         SAVEFREESV(listsv); /* In case warnings are fatalized */
12230                         if (strict) {
12231                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12232                             vFAIL("Need exactly 3 octal digits");
12233                         }
12234                         else if (! SIZE_ONLY /* like \08, \178 */
12235                                  && numlen < 3
12236                                  && RExC_parse < RExC_end
12237                                  && isDIGIT(*RExC_parse)
12238                                  && ckWARN(WARN_REGEXP))
12239                         {
12240                             SAVEFREESV(RExC_rx_sv);
12241                             reg_warn_non_literal_string(
12242                                  RExC_parse + 1,
12243                                  form_short_octal_warning(RExC_parse, numlen));
12244                             (void)ReREFCNT_inc(RExC_rx_sv);
12245                         }
12246                         SvREFCNT_inc_simple_void_NN(listsv);
12247                     }
12248                     if (PL_encoding && value < 0x100)
12249                         goto recode_encoding;
12250                     break;
12251                 }
12252             recode_encoding:
12253                 if (! RExC_override_recoding) {
12254                     SV* enc = PL_encoding;
12255                     value = reg_recode((const char)(U8)value, &enc);
12256                     if (!enc) {
12257                         if (strict) {
12258                             vFAIL("Invalid escape in the specified encoding");
12259                         }
12260                         else if (SIZE_ONLY) {
12261                             ckWARNreg(RExC_parse,
12262                                   "Invalid escape in the specified encoding");
12263                         }
12264                     }
12265                     break;
12266                 }
12267             default:
12268                 /* Allow \_ to not give an error */
12269                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12270                     SAVEFREESV(listsv);
12271                     if (strict) {
12272                         vFAIL2("Unrecognized escape \\%c in character class",
12273                                (int)value);
12274                     }
12275                     else {
12276                         SAVEFREESV(RExC_rx_sv);
12277                         ckWARN2reg(RExC_parse,
12278                             "Unrecognized escape \\%c in character class passed through",
12279                             (int)value);
12280                         (void)ReREFCNT_inc(RExC_rx_sv);
12281                     }
12282                     SvREFCNT_inc_simple_void_NN(listsv);
12283                 }
12284                 break;
12285             }   /* End of switch on char following backslash */
12286         } /* end of handling backslash escape sequences */
12287 #ifdef EBCDIC
12288         else
12289             literal_endpoint++;
12290 #endif
12291
12292         /* Here, we have the current token in 'value' */
12293
12294         /* What matches in a locale is not known until runtime.  This includes
12295          * what the Posix classes (like \w, [:space:]) match.  Room must be
12296          * reserved (one time per class) to store such classes, either if Perl
12297          * is compiled so that locale nodes always should have this space, or
12298          * if there is such class info to be stored.  The space will contain a
12299          * bit for each named class that is to be matched against.  This isn't
12300          * needed for \p{} and pseudo-classes, as they are not affected by
12301          * locale, and hence are dealt with separately */
12302         if (LOC
12303             && ! need_class
12304             && (ANYOF_LOCALE == ANYOF_CLASS
12305                 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12306         {
12307             need_class = 1;
12308             if (SIZE_ONLY) {
12309                 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12310             }
12311             else {
12312                 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12313                 ANYOF_CLASS_ZERO(ret);
12314             }
12315             ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12316         }
12317
12318         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12319
12320             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12321              * literal, as is the character that began the false range, i.e.
12322              * the 'a' in the examples */
12323             if (range) {
12324                 if (!SIZE_ONLY) {
12325                     const int w = (RExC_parse >= rangebegin)
12326                                   ? RExC_parse - rangebegin
12327                                   : 0;
12328                     SAVEFREESV(listsv); /* in case of fatal warnings */
12329                     if (strict) {
12330                         vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12331                     }
12332                     else {
12333                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12334                         ckWARN4reg(RExC_parse,
12335                                 "False [] range \"%*.*s\"",
12336                                 w, w, rangebegin);
12337                         (void)ReREFCNT_inc(RExC_rx_sv);
12338                         cp_list = add_cp_to_invlist(cp_list, '-');
12339                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
12340                     }
12341                     SvREFCNT_inc_simple_void_NN(listsv);
12342                 }
12343
12344                 range = 0; /* this was not a true range */
12345                 element_count += 2; /* So counts for three values */
12346             }
12347
12348             if (! SIZE_ONLY) {
12349                 U8 classnum = namedclass_to_classnum(namedclass);
12350                 if (namedclass >= ANYOF_MAX) {  /* If a special class */
12351                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12352
12353                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12354                          * /l make a difference in what these match.  There
12355                          * would be problems if these characters had folds
12356                          * other than themselves, as cp_list is subject to
12357                          * folding. */
12358                         if (classnum != _CC_VERTSPACE) {
12359                             assert(   namedclass == ANYOF_HORIZWS
12360                                    || namedclass == ANYOF_NHORIZWS);
12361
12362                             /* It turns out that \h is just a synonym for
12363                              * XPosixBlank */
12364                             classnum = _CC_BLANK;
12365                         }
12366
12367                         _invlist_union_maybe_complement_2nd(
12368                                 cp_list,
12369                                 PL_XPosix_ptrs[classnum],
12370                                 cBOOL(namedclass % 2), /* Complement if odd
12371                                                           (NHORIZWS, NVERTWS)
12372                                                         */
12373                                 &cp_list);
12374                     }
12375                 }
12376                 else if (classnum == _CC_ASCII) {
12377 #ifdef HAS_ISASCII
12378                     if (LOC) {
12379                         ANYOF_CLASS_SET(ret, namedclass);
12380                     }
12381                     else
12382 #endif  /* Not isascii(); just use the hard-coded definition for it */
12383                         _invlist_union_maybe_complement_2nd(
12384                                 posixes,
12385                                 PL_ASCII,
12386                                 cBOOL(namedclass % 2), /* Complement if odd
12387                                                           (NASCII) */
12388                                 &posixes);
12389                 }
12390                 else {  /* Garden variety class */
12391
12392                     /* The ascii range inversion list */
12393                     SV* ascii_source = PL_Posix_ptrs[classnum];
12394
12395                     /* The full Latin1 range inversion list */
12396                     SV* l1_source = PL_L1Posix_ptrs[classnum];
12397
12398                     /* This code is structured into two major clauses.  The
12399                      * first is for classes whose complete definitions may not
12400                      * already be known.  It not, the Latin1 definition
12401                      * (guaranteed to already known) is used plus code is
12402                      * generated to load the rest at run-time (only if needed).
12403                      * If the complete definition is known, it drops down to
12404                      * the second clause, where the complete definition is
12405                      * known */
12406
12407                     if (classnum < _FIRST_NON_SWASH_CC) {
12408
12409                         /* Here, the class has a swash, which may or not
12410                          * already be loaded */
12411
12412                         /* The name of the property to use to match the full
12413                          * eXtended Unicode range swash for this character
12414                          * class */
12415                         const char *Xname = swash_property_names[classnum];
12416
12417                         /* If returning the inversion list, we can't defer
12418                          * getting this until runtime */
12419                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12420                             PL_utf8_swash_ptrs[classnum] =
12421                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
12422                                              1, /* binary */
12423                                              0, /* not tr/// */
12424                                              NULL, /* No inversion list */
12425                                              NULL  /* No flags */
12426                                             );
12427                             assert(PL_utf8_swash_ptrs[classnum]);
12428                         }
12429                         if ( !  PL_utf8_swash_ptrs[classnum]) {
12430                             if (namedclass % 2 == 0) { /* A non-complemented
12431                                                           class */
12432                                 /* If not /a matching, there are code points we
12433                                  * don't know at compile time.  Arrange for the
12434                                  * unknown matches to be loaded at run-time, if
12435                                  * needed */
12436                                 if (! AT_LEAST_ASCII_RESTRICTED) {
12437                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12438                                                                  Xname);
12439                                 }
12440                                 if (LOC) {  /* Under locale, set run-time
12441                                                lookup */
12442                                     ANYOF_CLASS_SET(ret, namedclass);
12443                                 }
12444                                 else {
12445                                     /* Add the current class's code points to
12446                                      * the running total */
12447                                     _invlist_union(posixes,
12448                                                    (AT_LEAST_ASCII_RESTRICTED)
12449                                                         ? ascii_source
12450                                                         : l1_source,
12451                                                    &posixes);
12452                                 }
12453                             }
12454                             else {  /* A complemented class */
12455                                 if (AT_LEAST_ASCII_RESTRICTED) {
12456                                     /* Under /a should match everything above
12457                                      * ASCII, plus the complement of the set's
12458                                      * ASCII matches */
12459                                     _invlist_union_complement_2nd(posixes,
12460                                                                   ascii_source,
12461                                                                   &posixes);
12462                                 }
12463                                 else {
12464                                     /* Arrange for the unknown matches to be
12465                                      * loaded at run-time, if needed */
12466                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12467                                                                  Xname);
12468                                     runtime_posix_matches_above_Unicode = TRUE;
12469                                     if (LOC) {
12470                                         ANYOF_CLASS_SET(ret, namedclass);
12471                                     }
12472                                     else {
12473
12474                                         /* We want to match everything in
12475                                          * Latin1, except those things that
12476                                          * l1_source matches */
12477                                         SV* scratch_list = NULL;
12478                                         _invlist_subtract(PL_Latin1, l1_source,
12479                                                           &scratch_list);
12480
12481                                         /* Add the list from this class to the
12482                                          * running total */
12483                                         if (! posixes) {
12484                                             posixes = scratch_list;
12485                                         }
12486                                         else {
12487                                             _invlist_union(posixes,
12488                                                            scratch_list,
12489                                                            &posixes);
12490                                             SvREFCNT_dec_NN(scratch_list);
12491                                         }
12492                                         if (DEPENDS_SEMANTICS) {
12493                                             ANYOF_FLAGS(ret)
12494                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
12495                                         }
12496                                     }
12497                                 }
12498                             }
12499                             goto namedclass_done;
12500                         }
12501
12502                         /* Here, there is a swash loaded for the class.  If no
12503                          * inversion list for it yet, get it */
12504                         if (! PL_XPosix_ptrs[classnum]) {
12505                             PL_XPosix_ptrs[classnum]
12506                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12507                         }
12508                     }
12509
12510                     /* Here there is an inversion list already loaded for the
12511                      * entire class */
12512
12513                     if (namedclass % 2 == 0) {  /* A non-complemented class,
12514                                                    like ANYOF_PUNCT */
12515                         if (! LOC) {
12516                             /* For non-locale, just add it to any existing list
12517                              * */
12518                             _invlist_union(posixes,
12519                                            (AT_LEAST_ASCII_RESTRICTED)
12520                                                ? ascii_source
12521                                                : PL_XPosix_ptrs[classnum],
12522                                            &posixes);
12523                         }
12524                         else {  /* Locale */
12525                             SV* scratch_list = NULL;
12526
12527                             /* For above Latin1 code points, we use the full
12528                              * Unicode range */
12529                             _invlist_intersection(PL_AboveLatin1,
12530                                                   PL_XPosix_ptrs[classnum],
12531                                                   &scratch_list);
12532                             /* And set the output to it, adding instead if
12533                              * there already is an output.  Checking if
12534                              * 'posixes' is NULL first saves an extra clone.
12535                              * Its reference count will be decremented at the
12536                              * next union, etc, or if this is the only
12537                              * instance, at the end of the routine */
12538                             if (! posixes) {
12539                                 posixes = scratch_list;
12540                             }
12541                             else {
12542                                 _invlist_union(posixes, scratch_list, &posixes);
12543                                 SvREFCNT_dec_NN(scratch_list);
12544                             }
12545
12546 #ifndef HAS_ISBLANK
12547                             if (namedclass != ANYOF_BLANK) {
12548 #endif
12549                                 /* Set this class in the node for runtime
12550                                  * matching */
12551                                 ANYOF_CLASS_SET(ret, namedclass);
12552 #ifndef HAS_ISBLANK
12553                             }
12554                             else {
12555                                 /* No isblank(), use the hard-coded ASCII-range
12556                                  * blanks, adding them to the running total. */
12557
12558                                 _invlist_union(posixes, ascii_source, &posixes);
12559                             }
12560 #endif
12561                         }
12562                     }
12563                     else {  /* A complemented class, like ANYOF_NPUNCT */
12564                         if (! LOC) {
12565                             _invlist_union_complement_2nd(
12566                                                 posixes,
12567                                                 (AT_LEAST_ASCII_RESTRICTED)
12568                                                     ? ascii_source
12569                                                     : PL_XPosix_ptrs[classnum],
12570                                                 &posixes);
12571                             /* Under /d, everything in the upper half of the
12572                              * Latin1 range matches this complement */
12573                             if (DEPENDS_SEMANTICS) {
12574                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12575                             }
12576                         }
12577                         else {  /* Locale */
12578                             SV* scratch_list = NULL;
12579                             _invlist_subtract(PL_AboveLatin1,
12580                                               PL_XPosix_ptrs[classnum],
12581                                               &scratch_list);
12582                             if (! posixes) {
12583                                 posixes = scratch_list;
12584                             }
12585                             else {
12586                                 _invlist_union(posixes, scratch_list, &posixes);
12587                                 SvREFCNT_dec_NN(scratch_list);
12588                             }
12589 #ifndef HAS_ISBLANK
12590                             if (namedclass != ANYOF_NBLANK) {
12591 #endif
12592                                 ANYOF_CLASS_SET(ret, namedclass);
12593 #ifndef HAS_ISBLANK
12594                             }
12595                             else {
12596                                 /* Get the list of all code points in Latin1
12597                                  * that are not ASCII blanks, and add them to
12598                                  * the running total */
12599                                 _invlist_subtract(PL_Latin1, ascii_source,
12600                                                   &scratch_list);
12601                                 _invlist_union(posixes, scratch_list, &posixes);
12602                                 SvREFCNT_dec_NN(scratch_list);
12603                             }
12604 #endif
12605                         }
12606                     }
12607                 }
12608               namedclass_done:
12609                 continue;   /* Go get next character */
12610             }
12611         } /* end of namedclass \blah */
12612
12613         /* Here, we have a single value.  If 'range' is set, it is the ending
12614          * of a range--check its validity.  Later, we will handle each
12615          * individual code point in the range.  If 'range' isn't set, this
12616          * could be the beginning of a range, so check for that by looking
12617          * ahead to see if the next real character to be processed is the range
12618          * indicator--the minus sign */
12619
12620         if (skip_white) {
12621             RExC_parse = regpatws(pRExC_state, RExC_parse,
12622                                 FALSE /* means don't recognize comments */);
12623         }
12624
12625         if (range) {
12626             if (prevvalue > value) /* b-a */ {
12627                 const int w = RExC_parse - rangebegin;
12628                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12629                 range = 0; /* not a valid range */
12630             }
12631         }
12632         else {
12633             prevvalue = value; /* save the beginning of the potential range */
12634             if (! stop_at_1     /* Can't be a range if parsing just one thing */
12635                 && *RExC_parse == '-')
12636             {
12637                 char* next_char_ptr = RExC_parse + 1;
12638                 if (skip_white) {   /* Get the next real char after the '-' */
12639                     next_char_ptr = regpatws(pRExC_state,
12640                                              RExC_parse + 1,
12641                                              FALSE); /* means don't recognize
12642                                                         comments */
12643                 }
12644
12645                 /* If the '-' is at the end of the class (just before the ']',
12646                  * it is a literal minus; otherwise it is a range */
12647                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12648                     RExC_parse = next_char_ptr;
12649
12650                     /* a bad range like \w-, [:word:]- ? */
12651                     if (namedclass > OOB_NAMEDCLASS) {
12652                         if (strict || ckWARN(WARN_REGEXP)) {
12653                             const int w =
12654                                 RExC_parse >= rangebegin ?
12655                                 RExC_parse - rangebegin : 0;
12656                             if (strict) {
12657                                 vFAIL4("False [] range \"%*.*s\"",
12658                                     w, w, rangebegin);
12659                             }
12660                             else {
12661                                 vWARN4(RExC_parse,
12662                                     "False [] range \"%*.*s\"",
12663                                     w, w, rangebegin);
12664                             }
12665                         }
12666                         if (!SIZE_ONLY) {
12667                             cp_list = add_cp_to_invlist(cp_list, '-');
12668                         }
12669                         element_count++;
12670                     } else
12671                         range = 1;      /* yeah, it's a range! */
12672                     continue;   /* but do it the next time */
12673                 }
12674             }
12675         }
12676
12677         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12678          * if not */
12679
12680         /* non-Latin1 code point implies unicode semantics.  Must be set in
12681          * pass1 so is there for the whole of pass 2 */
12682         if (value > 255) {
12683             RExC_uni_semantics = 1;
12684         }
12685
12686         /* Ready to process either the single value, or the completed range.
12687          * For single-valued non-inverted ranges, we consider the possibility
12688          * of multi-char folds.  (We made a conscious decision to not do this
12689          * for the other cases because it can often lead to non-intuitive
12690          * results.  For example, you have the peculiar case that:
12691          *  "s s" =~ /^[^\xDF]+$/i => Y
12692          *  "ss"  =~ /^[^\xDF]+$/i => N
12693          *
12694          * See [perl #89750] */
12695         if (FOLD && allow_multi_folds && value == prevvalue) {
12696             if (value == LATIN_SMALL_LETTER_SHARP_S
12697                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12698                                                         value)))
12699             {
12700                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12701
12702                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12703                 STRLEN foldlen;
12704
12705                 UV folded = _to_uni_fold_flags(
12706                                 value,
12707                                 foldbuf,
12708                                 &foldlen,
12709                                 FOLD_FLAGS_FULL
12710                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12711                                             : (ASCII_FOLD_RESTRICTED)
12712                                               ? FOLD_FLAGS_NOMIX_ASCII
12713                                               : 0)
12714                                 );
12715
12716                 /* Here, <folded> should be the first character of the
12717                  * multi-char fold of <value>, with <foldbuf> containing the
12718                  * whole thing.  But, if this fold is not allowed (because of
12719                  * the flags), <fold> will be the same as <value>, and should
12720                  * be processed like any other character, so skip the special
12721                  * handling */
12722                 if (folded != value) {
12723
12724                     /* Skip if we are recursed, currently parsing the class
12725                      * again.  Otherwise add this character to the list of
12726                      * multi-char folds. */
12727                     if (! RExC_in_multi_char_class) {
12728                         AV** this_array_ptr;
12729                         AV* this_array;
12730                         STRLEN cp_count = utf8_length(foldbuf,
12731                                                       foldbuf + foldlen);
12732                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12733
12734                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12735
12736
12737                         if (! multi_char_matches) {
12738                             multi_char_matches = newAV();
12739                         }
12740
12741                         /* <multi_char_matches> is actually an array of arrays.
12742                          * There will be one or two top-level elements: [2],
12743                          * and/or [3].  The [2] element is an array, each
12744                          * element thereof is a character which folds to two
12745                          * characters; likewise for [3].  (Unicode guarantees a
12746                          * maximum of 3 characters in any fold.)  When we
12747                          * rewrite the character class below, we will do so
12748                          * such that the longest folds are written first, so
12749                          * that it prefers the longest matching strings first.
12750                          * This is done even if it turns out that any
12751                          * quantifier is non-greedy, out of programmer
12752                          * laziness.  Tom Christiansen has agreed that this is
12753                          * ok.  This makes the test for the ligature 'ffi' come
12754                          * before the test for 'ff' */
12755                         if (av_exists(multi_char_matches, cp_count)) {
12756                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12757                                                              cp_count, FALSE);
12758                             this_array = *this_array_ptr;
12759                         }
12760                         else {
12761                             this_array = newAV();
12762                             av_store(multi_char_matches, cp_count,
12763                                      (SV*) this_array);
12764                         }
12765                         av_push(this_array, multi_fold);
12766                     }
12767
12768                     /* This element should not be processed further in this
12769                      * class */
12770                     element_count--;
12771                     value = save_value;
12772                     prevvalue = save_prevvalue;
12773                     continue;
12774                 }
12775             }
12776         }
12777
12778         /* Deal with this element of the class */
12779         if (! SIZE_ONLY) {
12780 #ifndef EBCDIC
12781             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12782 #else
12783             UV* this_range = _new_invlist(1);
12784             _append_range_to_invlist(this_range, prevvalue, value);
12785
12786             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12787              * If this range was specified using something like 'i-j', we want
12788              * to include only the 'i' and the 'j', and not anything in
12789              * between, so exclude non-ASCII, non-alphabetics from it.
12790              * However, if the range was specified with something like
12791              * [\x89-\x91] or [\x89-j], all code points within it should be
12792              * included.  literal_endpoint==2 means both ends of the range used
12793              * a literal character, not \x{foo} */
12794             if (literal_endpoint == 2
12795                 && (prevvalue >= 'a' && value <= 'z')
12796                     || (prevvalue >= 'A' && value <= 'Z'))
12797             {
12798                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12799                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12800             }
12801             _invlist_union(cp_list, this_range, &cp_list);
12802             literal_endpoint = 0;
12803 #endif
12804         }
12805
12806         range = 0; /* this range (if it was one) is done now */
12807     } /* End of loop through all the text within the brackets */
12808
12809     /* If anything in the class expands to more than one character, we have to
12810      * deal with them by building up a substitute parse string, and recursively
12811      * calling reg() on it, instead of proceeding */
12812     if (multi_char_matches) {
12813         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12814         I32 cp_count;
12815         STRLEN len;
12816         char *save_end = RExC_end;
12817         char *save_parse = RExC_parse;
12818         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
12819                                        a "|" */
12820         I32 reg_flags;
12821
12822         assert(! invert);
12823 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
12824            because too confusing */
12825         if (invert) {
12826             sv_catpv(substitute_parse, "(?:");
12827         }
12828 #endif
12829
12830         /* Look at the longest folds first */
12831         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12832
12833             if (av_exists(multi_char_matches, cp_count)) {
12834                 AV** this_array_ptr;
12835                 SV* this_sequence;
12836
12837                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12838                                                  cp_count, FALSE);
12839                 while ((this_sequence = av_pop(*this_array_ptr)) !=
12840                                                                 &PL_sv_undef)
12841                 {
12842                     if (! first_time) {
12843                         sv_catpv(substitute_parse, "|");
12844                     }
12845                     first_time = FALSE;
12846
12847                     sv_catpv(substitute_parse, SvPVX(this_sequence));
12848                 }
12849             }
12850         }
12851
12852         /* If the character class contains anything else besides these
12853          * multi-character folds, have to include it in recursive parsing */
12854         if (element_count) {
12855             sv_catpv(substitute_parse, "|[");
12856             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12857             sv_catpv(substitute_parse, "]");
12858         }
12859
12860         sv_catpv(substitute_parse, ")");
12861 #if 0
12862         if (invert) {
12863             /* This is a way to get the parse to skip forward a whole named
12864              * sequence instead of matching the 2nd character when it fails the
12865              * first */
12866             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12867         }
12868 #endif
12869
12870         RExC_parse = SvPV(substitute_parse, len);
12871         RExC_end = RExC_parse + len;
12872         RExC_in_multi_char_class = 1;
12873         RExC_emit = (regnode *)orig_emit;
12874
12875         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
12876
12877         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12878
12879         RExC_parse = save_parse;
12880         RExC_end = save_end;
12881         RExC_in_multi_char_class = 0;
12882         SvREFCNT_dec_NN(multi_char_matches);
12883         SvREFCNT_dec_NN(listsv);
12884         return ret;
12885     }
12886
12887     /* If the character class contains only a single element, it may be
12888      * optimizable into another node type which is smaller and runs faster.
12889      * Check if this is the case for this class */
12890     if (element_count == 1 && ! ret_invlist) {
12891         U8 op = END;
12892         U8 arg = 0;
12893
12894         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12895                                               [:digit:] or \p{foo} */
12896
12897             /* All named classes are mapped into POSIXish nodes, with its FLAG
12898              * argument giving which class it is */
12899             switch ((I32)namedclass) {
12900                 case ANYOF_UNIPROP:
12901                     break;
12902
12903                 /* These don't depend on the charset modifiers.  They always
12904                  * match under /u rules */
12905                 case ANYOF_NHORIZWS:
12906                 case ANYOF_HORIZWS:
12907                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
12908                     /* FALLTHROUGH */
12909
12910                 case ANYOF_NVERTWS:
12911                 case ANYOF_VERTWS:
12912                     op = POSIXU;
12913                     goto join_posix;
12914
12915                 /* The actual POSIXish node for all the rest depends on the
12916                  * charset modifier.  The ones in the first set depend only on
12917                  * ASCII or, if available on this platform, locale */
12918                 case ANYOF_ASCII:
12919                 case ANYOF_NASCII:
12920 #ifdef HAS_ISASCII
12921                     op = (LOC) ? POSIXL : POSIXA;
12922 #else
12923                     op = POSIXA;
12924 #endif
12925                     goto join_posix;
12926
12927                 case ANYOF_NCASED:
12928                 case ANYOF_LOWER:
12929                 case ANYOF_NLOWER:
12930                 case ANYOF_UPPER:
12931                 case ANYOF_NUPPER:
12932                     /* under /a could be alpha */
12933                     if (FOLD) {
12934                         if (ASCII_RESTRICTED) {
12935                             namedclass = ANYOF_ALPHA + (namedclass % 2);
12936                         }
12937                         else if (! LOC) {
12938                             break;
12939                         }
12940                     }
12941                     /* FALLTHROUGH */
12942
12943                 /* The rest have more possibilities depending on the charset.
12944                  * We take advantage of the enum ordering of the charset
12945                  * modifiers to get the exact node type, */
12946                 default:
12947                     op = POSIXD + get_regex_charset(RExC_flags);
12948                     if (op > POSIXA) { /* /aa is same as /a */
12949                         op = POSIXA;
12950                     }
12951 #ifndef HAS_ISBLANK
12952                     if (op == POSIXL
12953                         && (namedclass == ANYOF_BLANK
12954                             || namedclass == ANYOF_NBLANK))
12955                     {
12956                         op = POSIXA;
12957                     }
12958 #endif
12959
12960                 join_posix:
12961                     /* The odd numbered ones are the complements of the
12962                      * next-lower even number one */
12963                     if (namedclass % 2 == 1) {
12964                         invert = ! invert;
12965                         namedclass--;
12966                     }
12967                     arg = namedclass_to_classnum(namedclass);
12968                     break;
12969             }
12970         }
12971         else if (value == prevvalue) {
12972
12973             /* Here, the class consists of just a single code point */
12974
12975             if (invert) {
12976                 if (! LOC && value == '\n') {
12977                     op = REG_ANY; /* Optimize [^\n] */
12978                     *flagp |= HASWIDTH|SIMPLE;
12979                     RExC_naughty++;
12980                 }
12981             }
12982             else if (value < 256 || UTF) {
12983
12984                 /* Optimize a single value into an EXACTish node, but not if it
12985                  * would require converting the pattern to UTF-8. */
12986                 op = compute_EXACTish(pRExC_state);
12987             }
12988         } /* Otherwise is a range */
12989         else if (! LOC) {   /* locale could vary these */
12990             if (prevvalue == '0') {
12991                 if (value == '9') {
12992                     arg = _CC_DIGIT;
12993                     op = POSIXA;
12994                 }
12995             }
12996         }
12997
12998         /* Here, we have changed <op> away from its initial value iff we found
12999          * an optimization */
13000         if (op != END) {
13001
13002             /* Throw away this ANYOF regnode, and emit the calculated one,
13003              * which should correspond to the beginning, not current, state of
13004              * the parse */
13005             const char * cur_parse = RExC_parse;
13006             RExC_parse = (char *)orig_parse;
13007             if ( SIZE_ONLY) {
13008                 if (! LOC) {
13009
13010                     /* To get locale nodes to not use the full ANYOF size would
13011                      * require moving the code above that writes the portions
13012                      * of it that aren't in other nodes to after this point.
13013                      * e.g.  ANYOF_CLASS_SET */
13014                     RExC_size = orig_size;
13015                 }
13016             }
13017             else {
13018                 RExC_emit = (regnode *)orig_emit;
13019                 if (PL_regkind[op] == POSIXD) {
13020                     if (invert) {
13021                         op += NPOSIXD - POSIXD;
13022                     }
13023                 }
13024             }
13025
13026             ret = reg_node(pRExC_state, op);
13027
13028             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13029                 if (! SIZE_ONLY) {
13030                     FLAGS(ret) = arg;
13031                 }
13032                 *flagp |= HASWIDTH|SIMPLE;
13033             }
13034             else if (PL_regkind[op] == EXACT) {
13035                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13036             }
13037
13038             RExC_parse = (char *) cur_parse;
13039
13040             SvREFCNT_dec(posixes);
13041             SvREFCNT_dec_NN(listsv);
13042             SvREFCNT_dec(cp_list);
13043             return ret;
13044         }
13045     }
13046
13047     if (SIZE_ONLY)
13048         return ret;
13049     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13050
13051     /* If folding, we calculate all characters that could fold to or from the
13052      * ones already on the list */
13053     if (FOLD && cp_list) {
13054         UV start, end;  /* End points of code point ranges */
13055
13056         SV* fold_intersection = NULL;
13057
13058         /* If the highest code point is within Latin1, we can use the
13059          * compiled-in Alphas list, and not have to go out to disk.  This
13060          * yields two false positives, the masculine and feminine ordinal
13061          * indicators, which are weeded out below using the
13062          * IS_IN_SOME_FOLD_L1() macro */
13063         if (invlist_highest(cp_list) < 256) {
13064             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13065                                                            &fold_intersection);
13066         }
13067         else {
13068
13069             /* Here, there are non-Latin1 code points, so we will have to go
13070              * fetch the list of all the characters that participate in folds
13071              */
13072             if (! PL_utf8_foldable) {
13073                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13074                                        &PL_sv_undef, 1, 0);
13075                 PL_utf8_foldable = _get_swash_invlist(swash);
13076                 SvREFCNT_dec_NN(swash);
13077             }
13078
13079             /* This is a hash that for a particular fold gives all characters
13080              * that are involved in it */
13081             if (! PL_utf8_foldclosures) {
13082
13083                 /* If we were unable to find any folds, then we likely won't be
13084                  * able to find the closures.  So just create an empty list.
13085                  * Folding will effectively be restricted to the non-Unicode
13086                  * rules hard-coded into Perl.  (This case happens legitimately
13087                  * during compilation of Perl itself before the Unicode tables
13088                  * are generated) */
13089                 if (_invlist_len(PL_utf8_foldable) == 0) {
13090                     PL_utf8_foldclosures = newHV();
13091                 }
13092                 else {
13093                     /* If the folds haven't been read in, call a fold function
13094                      * to force that */
13095                     if (! PL_utf8_tofold) {
13096                         U8 dummy[UTF8_MAXBYTES+1];
13097
13098                         /* This string is just a short named one above \xff */
13099                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13100                         assert(PL_utf8_tofold); /* Verify that worked */
13101                     }
13102                     PL_utf8_foldclosures =
13103                                     _swash_inversion_hash(PL_utf8_tofold);
13104                 }
13105             }
13106
13107             /* Only the characters in this class that participate in folds need
13108              * be checked.  Get the intersection of this class and all the
13109              * possible characters that are foldable.  This can quickly narrow
13110              * down a large class */
13111             _invlist_intersection(PL_utf8_foldable, cp_list,
13112                                   &fold_intersection);
13113         }
13114
13115         /* Now look at the foldable characters in this class individually */
13116         invlist_iterinit(fold_intersection);
13117         while (invlist_iternext(fold_intersection, &start, &end)) {
13118             UV j;
13119
13120             /* Locale folding for Latin1 characters is deferred until runtime */
13121             if (LOC && start < 256) {
13122                 start = 256;
13123             }
13124
13125             /* Look at every character in the range */
13126             for (j = start; j <= end; j++) {
13127
13128                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13129                 STRLEN foldlen;
13130                 SV** listp;
13131
13132                 if (j < 256) {
13133
13134                     /* We have the latin1 folding rules hard-coded here so that
13135                      * an innocent-looking character class, like /[ks]/i won't
13136                      * have to go out to disk to find the possible matches.
13137                      * XXX It would be better to generate these via regen, in
13138                      * case a new version of the Unicode standard adds new
13139                      * mappings, though that is not really likely, and may be
13140                      * caught by the default: case of the switch below. */
13141
13142                     if (IS_IN_SOME_FOLD_L1(j)) {
13143
13144                         /* ASCII is always matched; non-ASCII is matched only
13145                          * under Unicode rules */
13146                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13147                             cp_list =
13148                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13149                         }
13150                         else {
13151                             depends_list =
13152                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13153                         }
13154                     }
13155
13156                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13157                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13158                     {
13159                         /* Certain Latin1 characters have matches outside
13160                          * Latin1.  To get here, <j> is one of those
13161                          * characters.   None of these matches is valid for
13162                          * ASCII characters under /aa, which is why the 'if'
13163                          * just above excludes those.  These matches only
13164                          * happen when the target string is utf8.  The code
13165                          * below adds the single fold closures for <j> to the
13166                          * inversion list. */
13167                         switch (j) {
13168                             case 'k':
13169                             case 'K':
13170                                 cp_list =
13171                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
13172                                 break;
13173                             case 's':
13174                             case 'S':
13175                                 cp_list = add_cp_to_invlist(cp_list,
13176                                                     LATIN_SMALL_LETTER_LONG_S);
13177                                 break;
13178                             case MICRO_SIGN:
13179                                 cp_list = add_cp_to_invlist(cp_list,
13180                                                     GREEK_CAPITAL_LETTER_MU);
13181                                 cp_list = add_cp_to_invlist(cp_list,
13182                                                     GREEK_SMALL_LETTER_MU);
13183                                 break;
13184                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13185                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13186                                 cp_list =
13187                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13188                                 break;
13189                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13190                                 cp_list = add_cp_to_invlist(cp_list,
13191                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13192                                 break;
13193                             case LATIN_SMALL_LETTER_SHARP_S:
13194                                 cp_list = add_cp_to_invlist(cp_list,
13195                                                 LATIN_CAPITAL_LETTER_SHARP_S);
13196                                 break;
13197                             case 'F': case 'f':
13198                             case 'I': case 'i':
13199                             case 'L': case 'l':
13200                             case 'T': case 't':
13201                             case 'A': case 'a':
13202                             case 'H': case 'h':
13203                             case 'J': case 'j':
13204                             case 'N': case 'n':
13205                             case 'W': case 'w':
13206                             case 'Y': case 'y':
13207                                 /* These all are targets of multi-character
13208                                  * folds from code points that require UTF8 to
13209                                  * express, so they can't match unless the
13210                                  * target string is in UTF-8, so no action here
13211                                  * is necessary, as regexec.c properly handles
13212                                  * the general case for UTF-8 matching and
13213                                  * multi-char folds */
13214                                 break;
13215                             default:
13216                                 /* Use deprecated warning to increase the
13217                                  * chances of this being output */
13218                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13219                                 break;
13220                         }
13221                     }
13222                     continue;
13223                 }
13224
13225                 /* Here is an above Latin1 character.  We don't have the rules
13226                  * hard-coded for it.  First, get its fold.  This is the simple
13227                  * fold, as the multi-character folds have been handled earlier
13228                  * and separated out */
13229                 _to_uni_fold_flags(j, foldbuf, &foldlen,
13230                                                ((LOC)
13231                                                ? FOLD_FLAGS_LOCALE
13232                                                : (ASCII_FOLD_RESTRICTED)
13233                                                   ? FOLD_FLAGS_NOMIX_ASCII
13234                                                   : 0));
13235
13236                 /* Single character fold of above Latin1.  Add everything in
13237                  * its fold closure to the list that this node should match.
13238                  * The fold closures data structure is a hash with the keys
13239                  * being the UTF-8 of every character that is folded to, like
13240                  * 'k', and the values each an array of all code points that
13241                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13242                  * Multi-character folds are not included */
13243                 if ((listp = hv_fetch(PL_utf8_foldclosures,
13244                                       (char *) foldbuf, foldlen, FALSE)))
13245                 {
13246                     AV* list = (AV*) *listp;
13247                     IV k;
13248                     for (k = 0; k <= av_len(list); k++) {
13249                         SV** c_p = av_fetch(list, k, FALSE);
13250                         UV c;
13251                         if (c_p == NULL) {
13252                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13253                         }
13254                         c = SvUV(*c_p);
13255
13256                         /* /aa doesn't allow folds between ASCII and non-; /l
13257                          * doesn't allow them between above and below 256 */
13258                         if ((ASCII_FOLD_RESTRICTED
13259                                   && (isASCII(c) != isASCII(j)))
13260                             || (LOC && ((c < 256) != (j < 256))))
13261                         {
13262                             continue;
13263                         }
13264
13265                         /* Folds involving non-ascii Latin1 characters
13266                          * under /d are added to a separate list */
13267                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13268                         {
13269                             cp_list = add_cp_to_invlist(cp_list, c);
13270                         }
13271                         else {
13272                           depends_list = add_cp_to_invlist(depends_list, c);
13273                         }
13274                     }
13275                 }
13276             }
13277         }
13278         SvREFCNT_dec_NN(fold_intersection);
13279     }
13280
13281     /* And combine the result (if any) with any inversion list from posix
13282      * classes.  The lists are kept separate up to now because we don't want to
13283      * fold the classes (folding of those is automatically handled by the swash
13284      * fetching code) */
13285     if (posixes) {
13286         if (! DEPENDS_SEMANTICS) {
13287             if (cp_list) {
13288                 _invlist_union(cp_list, posixes, &cp_list);
13289                 SvREFCNT_dec_NN(posixes);
13290             }
13291             else {
13292                 cp_list = posixes;
13293             }
13294         }
13295         else {
13296             /* Under /d, we put into a separate list the Latin1 things that
13297              * match only when the target string is utf8 */
13298             SV* nonascii_but_latin1_properties = NULL;
13299             _invlist_intersection(posixes, PL_Latin1,
13300                                   &nonascii_but_latin1_properties);
13301             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13302                               &nonascii_but_latin1_properties);
13303             _invlist_subtract(posixes, nonascii_but_latin1_properties,
13304                               &posixes);
13305             if (cp_list) {
13306                 _invlist_union(cp_list, posixes, &cp_list);
13307                 SvREFCNT_dec_NN(posixes);
13308             }
13309             else {
13310                 cp_list = posixes;
13311             }
13312
13313             if (depends_list) {
13314                 _invlist_union(depends_list, nonascii_but_latin1_properties,
13315                                &depends_list);
13316                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13317             }
13318             else {
13319                 depends_list = nonascii_but_latin1_properties;
13320             }
13321         }
13322     }
13323
13324     /* And combine the result (if any) with any inversion list from properties.
13325      * The lists are kept separate up to now so that we can distinguish the two
13326      * in regards to matching above-Unicode.  A run-time warning is generated
13327      * if a Unicode property is matched against a non-Unicode code point. But,
13328      * we allow user-defined properties to match anything, without any warning,
13329      * and we also suppress the warning if there is a portion of the character
13330      * class that isn't a Unicode property, and which matches above Unicode, \W
13331      * or [\x{110000}] for example.
13332      * (Note that in this case, unlike the Posix one above, there is no
13333      * <depends_list>, because having a Unicode property forces Unicode
13334      * semantics */
13335     if (properties) {
13336         bool warn_super = ! has_user_defined_property;
13337         if (cp_list) {
13338
13339             /* If it matters to the final outcome, see if a non-property
13340              * component of the class matches above Unicode.  If so, the
13341              * warning gets suppressed.  This is true even if just a single
13342              * such code point is specified, as though not strictly correct if
13343              * another such code point is matched against, the fact that they
13344              * are using above-Unicode code points indicates they should know
13345              * the issues involved */
13346             if (warn_super) {
13347                 bool non_prop_matches_above_Unicode =
13348                             runtime_posix_matches_above_Unicode
13349                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13350                 if (invert) {
13351                     non_prop_matches_above_Unicode =
13352                                             !  non_prop_matches_above_Unicode;
13353                 }
13354                 warn_super = ! non_prop_matches_above_Unicode;
13355             }
13356
13357             _invlist_union(properties, cp_list, &cp_list);
13358             SvREFCNT_dec_NN(properties);
13359         }
13360         else {
13361             cp_list = properties;
13362         }
13363
13364         if (warn_super) {
13365             OP(ret) = ANYOF_WARN_SUPER;
13366         }
13367     }
13368
13369     /* Here, we have calculated what code points should be in the character
13370      * class.
13371      *
13372      * Now we can see about various optimizations.  Fold calculation (which we
13373      * did above) needs to take place before inversion.  Otherwise /[^k]/i
13374      * would invert to include K, which under /i would match k, which it
13375      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13376      * folded until runtime */
13377
13378     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13379      * at compile time.  Besides not inverting folded locale now, we can't
13380      * invert if there are things such as \w, which aren't known until runtime
13381      * */
13382     if (invert
13383         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13384         && ! depends_list
13385         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13386     {
13387         _invlist_invert(cp_list);
13388
13389         /* Any swash can't be used as-is, because we've inverted things */
13390         if (swash) {
13391             SvREFCNT_dec_NN(swash);
13392             swash = NULL;
13393         }
13394
13395         /* Clear the invert flag since have just done it here */
13396         invert = FALSE;
13397     }
13398
13399     if (ret_invlist) {
13400         *ret_invlist = cp_list;
13401
13402         /* Discard the generated node */
13403         if (SIZE_ONLY) {
13404             RExC_size = orig_size;
13405         }
13406         else {
13407             RExC_emit = orig_emit;
13408         }
13409         return END;
13410     }
13411
13412     /* If we didn't do folding, it's because some information isn't available
13413      * until runtime; set the run-time fold flag for these.  (We don't have to
13414      * worry about properties folding, as that is taken care of by the swash
13415      * fetching) */
13416     if (FOLD && LOC)
13417     {
13418        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13419     }
13420
13421     /* Some character classes are equivalent to other nodes.  Such nodes take
13422      * up less room and generally fewer operations to execute than ANYOF nodes.
13423      * Above, we checked for and optimized into some such equivalents for
13424      * certain common classes that are easy to test.  Getting to this point in
13425      * the code means that the class didn't get optimized there.  Since this
13426      * code is only executed in Pass 2, it is too late to save space--it has
13427      * been allocated in Pass 1, and currently isn't given back.  But turning
13428      * things into an EXACTish node can allow the optimizer to join it to any
13429      * adjacent such nodes.  And if the class is equivalent to things like /./,
13430      * expensive run-time swashes can be avoided.  Now that we have more
13431      * complete information, we can find things necessarily missed by the
13432      * earlier code.  I (khw) am not sure how much to look for here.  It would
13433      * be easy, but perhaps too slow, to check any candidates against all the
13434      * node types they could possibly match using _invlistEQ(). */
13435
13436     if (cp_list
13437         && ! invert
13438         && ! depends_list
13439         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13440         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13441     {
13442         UV start, end;
13443         U8 op = END;  /* The optimzation node-type */
13444         const char * cur_parse= RExC_parse;
13445
13446         invlist_iterinit(cp_list);
13447         if (! invlist_iternext(cp_list, &start, &end)) {
13448
13449             /* Here, the list is empty.  This happens, for example, when a
13450              * Unicode property is the only thing in the character class, and
13451              * it doesn't match anything.  (perluniprops.pod notes such
13452              * properties) */
13453             op = OPFAIL;
13454             *flagp |= HASWIDTH|SIMPLE;
13455         }
13456         else if (start == end) {    /* The range is a single code point */
13457             if (! invlist_iternext(cp_list, &start, &end)
13458
13459                     /* Don't do this optimization if it would require changing
13460                      * the pattern to UTF-8 */
13461                 && (start < 256 || UTF))
13462             {
13463                 /* Here, the list contains a single code point.  Can optimize
13464                  * into an EXACT node */
13465
13466                 value = start;
13467
13468                 if (! FOLD) {
13469                     op = EXACT;
13470                 }
13471                 else if (LOC) {
13472
13473                     /* A locale node under folding with one code point can be
13474                      * an EXACTFL, as its fold won't be calculated until
13475                      * runtime */
13476                     op = EXACTFL;
13477                 }
13478                 else {
13479
13480                     /* Here, we are generally folding, but there is only one
13481                      * code point to match.  If we have to, we use an EXACT
13482                      * node, but it would be better for joining with adjacent
13483                      * nodes in the optimization pass if we used the same
13484                      * EXACTFish node that any such are likely to be.  We can
13485                      * do this iff the code point doesn't participate in any
13486                      * folds.  For example, an EXACTF of a colon is the same as
13487                      * an EXACT one, since nothing folds to or from a colon. */
13488                     if (value < 256) {
13489                         if (IS_IN_SOME_FOLD_L1(value)) {
13490                             op = EXACT;
13491                         }
13492                     }
13493                     else {
13494                         if (! PL_utf8_foldable) {
13495                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13496                                                 &PL_sv_undef, 1, 0);
13497                             PL_utf8_foldable = _get_swash_invlist(swash);
13498                             SvREFCNT_dec_NN(swash);
13499                         }
13500                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13501                             op = EXACT;
13502                         }
13503                     }
13504
13505                     /* If we haven't found the node type, above, it means we
13506                      * can use the prevailing one */
13507                     if (op == END) {
13508                         op = compute_EXACTish(pRExC_state);
13509                     }
13510                 }
13511             }
13512         }
13513         else if (start == 0) {
13514             if (end == UV_MAX) {
13515                 op = SANY;
13516                 *flagp |= HASWIDTH|SIMPLE;
13517                 RExC_naughty++;
13518             }
13519             else if (end == '\n' - 1
13520                     && invlist_iternext(cp_list, &start, &end)
13521                     && start == '\n' + 1 && end == UV_MAX)
13522             {
13523                 op = REG_ANY;
13524                 *flagp |= HASWIDTH|SIMPLE;
13525                 RExC_naughty++;
13526             }
13527         }
13528         invlist_iterfinish(cp_list);
13529
13530         if (op != END) {
13531             RExC_parse = (char *)orig_parse;
13532             RExC_emit = (regnode *)orig_emit;
13533
13534             ret = reg_node(pRExC_state, op);
13535
13536             RExC_parse = (char *)cur_parse;
13537
13538             if (PL_regkind[op] == EXACT) {
13539                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13540             }
13541
13542             SvREFCNT_dec_NN(cp_list);
13543             SvREFCNT_dec_NN(listsv);
13544             return ret;
13545         }
13546     }
13547
13548     /* Here, <cp_list> contains all the code points we can determine at
13549      * compile time that match under all conditions.  Go through it, and
13550      * for things that belong in the bitmap, put them there, and delete from
13551      * <cp_list>.  While we are at it, see if everything above 255 is in the
13552      * list, and if so, set a flag to speed up execution */
13553     ANYOF_BITMAP_ZERO(ret);
13554     if (cp_list) {
13555
13556         /* This gets set if we actually need to modify things */
13557         bool change_invlist = FALSE;
13558
13559         UV start, end;
13560
13561         /* Start looking through <cp_list> */
13562         invlist_iterinit(cp_list);
13563         while (invlist_iternext(cp_list, &start, &end)) {
13564             UV high;
13565             int i;
13566
13567             if (end == UV_MAX && start <= 256) {
13568                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13569             }
13570
13571             /* Quit if are above what we should change */
13572             if (start > 255) {
13573                 break;
13574             }
13575
13576             change_invlist = TRUE;
13577
13578             /* Set all the bits in the range, up to the max that we are doing */
13579             high = (end < 255) ? end : 255;
13580             for (i = start; i <= (int) high; i++) {
13581                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13582                     ANYOF_BITMAP_SET(ret, i);
13583                     prevvalue = value;
13584                     value = i;
13585                 }
13586             }
13587         }
13588         invlist_iterfinish(cp_list);
13589
13590         /* Done with loop; remove any code points that are in the bitmap from
13591          * <cp_list> */
13592         if (change_invlist) {
13593             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13594         }
13595
13596         /* If have completely emptied it, remove it completely */
13597         if (_invlist_len(cp_list) == 0) {
13598             SvREFCNT_dec_NN(cp_list);
13599             cp_list = NULL;
13600         }
13601     }
13602
13603     if (invert) {
13604         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13605     }
13606
13607     /* Here, the bitmap has been populated with all the Latin1 code points that
13608      * always match.  Can now add to the overall list those that match only
13609      * when the target string is UTF-8 (<depends_list>). */
13610     if (depends_list) {
13611         if (cp_list) {
13612             _invlist_union(cp_list, depends_list, &cp_list);
13613             SvREFCNT_dec_NN(depends_list);
13614         }
13615         else {
13616             cp_list = depends_list;
13617         }
13618     }
13619
13620     /* If there is a swash and more than one element, we can't use the swash in
13621      * the optimization below. */
13622     if (swash && element_count > 1) {
13623         SvREFCNT_dec_NN(swash);
13624         swash = NULL;
13625     }
13626
13627     if (! cp_list
13628         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13629     {
13630         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13631         SvREFCNT_dec_NN(listsv);
13632     }
13633     else {
13634         /* av[0] stores the character class description in its textual form:
13635          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13636          *       appropriate swash, and is also useful for dumping the regnode.
13637          * av[1] if NULL, is a placeholder to later contain the swash computed
13638          *       from av[0].  But if no further computation need be done, the
13639          *       swash is stored there now.
13640          * av[2] stores the cp_list inversion list for use in addition or
13641          *       instead of av[0]; used only if av[1] is NULL
13642          * av[3] is set if any component of the class is from a user-defined
13643          *       property; used only if av[1] is NULL */
13644         AV * const av = newAV();
13645         SV *rv;
13646
13647         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13648                         ? listsv
13649                         : (SvREFCNT_dec_NN(listsv), &PL_sv_undef));
13650         if (swash) {
13651             av_store(av, 1, swash);
13652             SvREFCNT_dec_NN(cp_list);
13653         }
13654         else {
13655             av_store(av, 1, NULL);
13656             if (cp_list) {
13657                 av_store(av, 2, cp_list);
13658                 av_store(av, 3, newSVuv(has_user_defined_property));
13659             }
13660         }
13661
13662         rv = newRV_noinc(MUTABLE_SV(av));
13663         n = add_data(pRExC_state, 1, "s");
13664         RExC_rxi->data->data[n] = (void*)rv;
13665         ARG_SET(ret, n);
13666     }
13667
13668     *flagp |= HASWIDTH|SIMPLE;
13669     return ret;
13670 }
13671 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13672
13673
13674 /* reg_skipcomment()
13675
13676    Absorbs an /x style # comments from the input stream.
13677    Returns true if there is more text remaining in the stream.
13678    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13679    terminates the pattern without including a newline.
13680
13681    Note its the callers responsibility to ensure that we are
13682    actually in /x mode
13683
13684 */
13685
13686 STATIC bool
13687 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13688 {
13689     bool ended = 0;
13690
13691     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13692
13693     while (RExC_parse < RExC_end)
13694         if (*RExC_parse++ == '\n') {
13695             ended = 1;
13696             break;
13697         }
13698     if (!ended) {
13699         /* we ran off the end of the pattern without ending
13700            the comment, so we have to add an \n when wrapping */
13701         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13702         return 0;
13703     } else
13704         return 1;
13705 }
13706
13707 /* nextchar()
13708
13709    Advances the parse position, and optionally absorbs
13710    "whitespace" from the inputstream.
13711
13712    Without /x "whitespace" means (?#...) style comments only,
13713    with /x this means (?#...) and # comments and whitespace proper.
13714
13715    Returns the RExC_parse point from BEFORE the scan occurs.
13716
13717    This is the /x friendly way of saying RExC_parse++.
13718 */
13719
13720 STATIC char*
13721 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13722 {
13723     char* const retval = RExC_parse++;
13724
13725     PERL_ARGS_ASSERT_NEXTCHAR;
13726
13727     for (;;) {
13728         if (RExC_end - RExC_parse >= 3
13729             && *RExC_parse == '('
13730             && RExC_parse[1] == '?'
13731             && RExC_parse[2] == '#')
13732         {
13733             while (*RExC_parse != ')') {
13734                 if (RExC_parse == RExC_end)
13735                     FAIL("Sequence (?#... not terminated");
13736                 RExC_parse++;
13737             }
13738             RExC_parse++;
13739             continue;
13740         }
13741         if (RExC_flags & RXf_PMf_EXTENDED) {
13742             if (isSPACE(*RExC_parse)) {
13743                 RExC_parse++;
13744                 continue;
13745             }
13746             else if (*RExC_parse == '#') {
13747                 if ( reg_skipcomment( pRExC_state ) )
13748                     continue;
13749             }
13750         }
13751         return retval;
13752     }
13753 }
13754
13755 /*
13756 - reg_node - emit a node
13757 */
13758 STATIC regnode *                        /* Location. */
13759 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13760 {
13761     dVAR;
13762     regnode *ptr;
13763     regnode * const ret = RExC_emit;
13764     GET_RE_DEBUG_FLAGS_DECL;
13765
13766     PERL_ARGS_ASSERT_REG_NODE;
13767
13768     if (SIZE_ONLY) {
13769         SIZE_ALIGN(RExC_size);
13770         RExC_size += 1;
13771         return(ret);
13772     }
13773     if (RExC_emit >= RExC_emit_bound)
13774         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13775                    op, RExC_emit, RExC_emit_bound);
13776
13777     NODE_ALIGN_FILL(ret);
13778     ptr = ret;
13779     FILL_ADVANCE_NODE(ptr, op);
13780 #ifdef RE_TRACK_PATTERN_OFFSETS
13781     if (RExC_offsets) {         /* MJD */
13782         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13783               "reg_node", __LINE__,
13784               PL_reg_name[op],
13785               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13786                 ? "Overwriting end of array!\n" : "OK",
13787               (UV)(RExC_emit - RExC_emit_start),
13788               (UV)(RExC_parse - RExC_start),
13789               (UV)RExC_offsets[0]));
13790         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13791     }
13792 #endif
13793     RExC_emit = ptr;
13794     return(ret);
13795 }
13796
13797 /*
13798 - reganode - emit a node with an argument
13799 */
13800 STATIC regnode *                        /* Location. */
13801 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13802 {
13803     dVAR;
13804     regnode *ptr;
13805     regnode * const ret = RExC_emit;
13806     GET_RE_DEBUG_FLAGS_DECL;
13807
13808     PERL_ARGS_ASSERT_REGANODE;
13809
13810     if (SIZE_ONLY) {
13811         SIZE_ALIGN(RExC_size);
13812         RExC_size += 2;
13813         /*
13814            We can't do this:
13815
13816            assert(2==regarglen[op]+1);
13817
13818            Anything larger than this has to allocate the extra amount.
13819            If we changed this to be:
13820
13821            RExC_size += (1 + regarglen[op]);
13822
13823            then it wouldn't matter. Its not clear what side effect
13824            might come from that so its not done so far.
13825            -- dmq
13826         */
13827         return(ret);
13828     }
13829     if (RExC_emit >= RExC_emit_bound)
13830         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13831                    op, RExC_emit, RExC_emit_bound);
13832
13833     NODE_ALIGN_FILL(ret);
13834     ptr = ret;
13835     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13836 #ifdef RE_TRACK_PATTERN_OFFSETS
13837     if (RExC_offsets) {         /* MJD */
13838         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13839               "reganode",
13840               __LINE__,
13841               PL_reg_name[op],
13842               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13843               "Overwriting end of array!\n" : "OK",
13844               (UV)(RExC_emit - RExC_emit_start),
13845               (UV)(RExC_parse - RExC_start),
13846               (UV)RExC_offsets[0]));
13847         Set_Cur_Node_Offset;
13848     }
13849 #endif
13850     RExC_emit = ptr;
13851     return(ret);
13852 }
13853
13854 /*
13855 - reguni - emit (if appropriate) a Unicode character
13856 */
13857 STATIC STRLEN
13858 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13859 {
13860     dVAR;
13861
13862     PERL_ARGS_ASSERT_REGUNI;
13863
13864     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13865 }
13866
13867 /*
13868 - reginsert - insert an operator in front of already-emitted operand
13869 *
13870 * Means relocating the operand.
13871 */
13872 STATIC void
13873 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13874 {
13875     dVAR;
13876     regnode *src;
13877     regnode *dst;
13878     regnode *place;
13879     const int offset = regarglen[(U8)op];
13880     const int size = NODE_STEP_REGNODE + offset;
13881     GET_RE_DEBUG_FLAGS_DECL;
13882
13883     PERL_ARGS_ASSERT_REGINSERT;
13884     PERL_UNUSED_ARG(depth);
13885 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13886     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13887     if (SIZE_ONLY) {
13888         RExC_size += size;
13889         return;
13890     }
13891
13892     src = RExC_emit;
13893     RExC_emit += size;
13894     dst = RExC_emit;
13895     if (RExC_open_parens) {
13896         int paren;
13897         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13898         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13899             if ( RExC_open_parens[paren] >= opnd ) {
13900                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13901                 RExC_open_parens[paren] += size;
13902             } else {
13903                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13904             }
13905             if ( RExC_close_parens[paren] >= opnd ) {
13906                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13907                 RExC_close_parens[paren] += size;
13908             } else {
13909                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13910             }
13911         }
13912     }
13913
13914     while (src > opnd) {
13915         StructCopy(--src, --dst, regnode);
13916 #ifdef RE_TRACK_PATTERN_OFFSETS
13917         if (RExC_offsets) {     /* MJD 20010112 */
13918             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13919                   "reg_insert",
13920                   __LINE__,
13921                   PL_reg_name[op],
13922                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13923                     ? "Overwriting end of array!\n" : "OK",
13924                   (UV)(src - RExC_emit_start),
13925                   (UV)(dst - RExC_emit_start),
13926                   (UV)RExC_offsets[0]));
13927             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13928             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13929         }
13930 #endif
13931     }
13932
13933
13934     place = opnd;               /* Op node, where operand used to be. */
13935 #ifdef RE_TRACK_PATTERN_OFFSETS
13936     if (RExC_offsets) {         /* MJD */
13937         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13938               "reginsert",
13939               __LINE__,
13940               PL_reg_name[op],
13941               (UV)(place - RExC_emit_start) > RExC_offsets[0]
13942               ? "Overwriting end of array!\n" : "OK",
13943               (UV)(place - RExC_emit_start),
13944               (UV)(RExC_parse - RExC_start),
13945               (UV)RExC_offsets[0]));
13946         Set_Node_Offset(place, RExC_parse);
13947         Set_Node_Length(place, 1);
13948     }
13949 #endif
13950     src = NEXTOPER(place);
13951     FILL_ADVANCE_NODE(place, op);
13952     Zero(src, offset, regnode);
13953 }
13954
13955 /*
13956 - regtail - set the next-pointer at the end of a node chain of p to val.
13957 - SEE ALSO: regtail_study
13958 */
13959 /* TODO: All three parms should be const */
13960 STATIC void
13961 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13962 {
13963     dVAR;
13964     regnode *scan;
13965     GET_RE_DEBUG_FLAGS_DECL;
13966
13967     PERL_ARGS_ASSERT_REGTAIL;
13968 #ifndef DEBUGGING
13969     PERL_UNUSED_ARG(depth);
13970 #endif
13971
13972     if (SIZE_ONLY)
13973         return;
13974
13975     /* Find last node. */
13976     scan = p;
13977     for (;;) {
13978         regnode * const temp = regnext(scan);
13979         DEBUG_PARSE_r({
13980             SV * const mysv=sv_newmortal();
13981             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13982             regprop(RExC_rx, mysv, scan);
13983             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13984                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13985                     (temp == NULL ? "->" : ""),
13986                     (temp == NULL ? PL_reg_name[OP(val)] : "")
13987             );
13988         });
13989         if (temp == NULL)
13990             break;
13991         scan = temp;
13992     }
13993
13994     if (reg_off_by_arg[OP(scan)]) {
13995         ARG_SET(scan, val - scan);
13996     }
13997     else {
13998         NEXT_OFF(scan) = val - scan;
13999     }
14000 }
14001
14002 #ifdef DEBUGGING
14003 /*
14004 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14005 - Look for optimizable sequences at the same time.
14006 - currently only looks for EXACT chains.
14007
14008 This is experimental code. The idea is to use this routine to perform
14009 in place optimizations on branches and groups as they are constructed,
14010 with the long term intention of removing optimization from study_chunk so
14011 that it is purely analytical.
14012
14013 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14014 to control which is which.
14015
14016 */
14017 /* TODO: All four parms should be const */
14018
14019 STATIC U8
14020 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14021 {
14022     dVAR;
14023     regnode *scan;
14024     U8 exact = PSEUDO;
14025 #ifdef EXPERIMENTAL_INPLACESCAN
14026     I32 min = 0;
14027 #endif
14028     GET_RE_DEBUG_FLAGS_DECL;
14029
14030     PERL_ARGS_ASSERT_REGTAIL_STUDY;
14031
14032
14033     if (SIZE_ONLY)
14034         return exact;
14035
14036     /* Find last node. */
14037
14038     scan = p;
14039     for (;;) {
14040         regnode * const temp = regnext(scan);
14041 #ifdef EXPERIMENTAL_INPLACESCAN
14042         if (PL_regkind[OP(scan)] == EXACT) {
14043             bool has_exactf_sharp_s;    /* Unexamined in this routine */
14044             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14045                 return EXACT;
14046         }
14047 #endif
14048         if ( exact ) {
14049             switch (OP(scan)) {
14050                 case EXACT:
14051                 case EXACTF:
14052                 case EXACTFA:
14053                 case EXACTFU:
14054                 case EXACTFU_SS:
14055                 case EXACTFU_TRICKYFOLD:
14056                 case EXACTFL:
14057                         if( exact == PSEUDO )
14058                             exact= OP(scan);
14059                         else if ( exact != OP(scan) )
14060                             exact= 0;
14061                 case NOTHING:
14062                     break;
14063                 default:
14064                     exact= 0;
14065             }
14066         }
14067         DEBUG_PARSE_r({
14068             SV * const mysv=sv_newmortal();
14069             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14070             regprop(RExC_rx, mysv, scan);
14071             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14072                 SvPV_nolen_const(mysv),
14073                 REG_NODE_NUM(scan),
14074                 PL_reg_name[exact]);
14075         });
14076         if (temp == NULL)
14077             break;
14078         scan = temp;
14079     }
14080     DEBUG_PARSE_r({
14081         SV * const mysv_val=sv_newmortal();
14082         DEBUG_PARSE_MSG("");
14083         regprop(RExC_rx, mysv_val, val);
14084         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14085                       SvPV_nolen_const(mysv_val),
14086                       (IV)REG_NODE_NUM(val),
14087                       (IV)(val - scan)
14088         );
14089     });
14090     if (reg_off_by_arg[OP(scan)]) {
14091         ARG_SET(scan, val - scan);
14092     }
14093     else {
14094         NEXT_OFF(scan) = val - scan;
14095     }
14096
14097     return exact;
14098 }
14099 #endif
14100
14101 /*
14102  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14103  */
14104 #ifdef DEBUGGING
14105 static void
14106 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14107 {
14108     int bit;
14109     int set=0;
14110     regex_charset cs;
14111
14112     for (bit=0; bit<32; bit++) {
14113         if (flags & (1<<bit)) {
14114             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
14115                 continue;
14116             }
14117             if (!set++ && lead)
14118                 PerlIO_printf(Perl_debug_log, "%s",lead);
14119             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14120         }
14121     }
14122     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14123             if (!set++ && lead) {
14124                 PerlIO_printf(Perl_debug_log, "%s",lead);
14125             }
14126             switch (cs) {
14127                 case REGEX_UNICODE_CHARSET:
14128                     PerlIO_printf(Perl_debug_log, "UNICODE");
14129                     break;
14130                 case REGEX_LOCALE_CHARSET:
14131                     PerlIO_printf(Perl_debug_log, "LOCALE");
14132                     break;
14133                 case REGEX_ASCII_RESTRICTED_CHARSET:
14134                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14135                     break;
14136                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14137                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14138                     break;
14139                 default:
14140                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14141                     break;
14142             }
14143     }
14144     if (lead)  {
14145         if (set)
14146             PerlIO_printf(Perl_debug_log, "\n");
14147         else
14148             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14149     }
14150 }
14151 #endif
14152
14153 void
14154 Perl_regdump(pTHX_ const regexp *r)
14155 {
14156 #ifdef DEBUGGING
14157     dVAR;
14158     SV * const sv = sv_newmortal();
14159     SV *dsv= sv_newmortal();
14160     RXi_GET_DECL(r,ri);
14161     GET_RE_DEBUG_FLAGS_DECL;
14162
14163     PERL_ARGS_ASSERT_REGDUMP;
14164
14165     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14166
14167     /* Header fields of interest. */
14168     if (r->anchored_substr) {
14169         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14170             RE_SV_DUMPLEN(r->anchored_substr), 30);
14171         PerlIO_printf(Perl_debug_log,
14172                       "anchored %s%s at %"IVdf" ",
14173                       s, RE_SV_TAIL(r->anchored_substr),
14174                       (IV)r->anchored_offset);
14175     } else if (r->anchored_utf8) {
14176         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14177             RE_SV_DUMPLEN(r->anchored_utf8), 30);
14178         PerlIO_printf(Perl_debug_log,
14179                       "anchored utf8 %s%s at %"IVdf" ",
14180                       s, RE_SV_TAIL(r->anchored_utf8),
14181                       (IV)r->anchored_offset);
14182     }
14183     if (r->float_substr) {
14184         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14185             RE_SV_DUMPLEN(r->float_substr), 30);
14186         PerlIO_printf(Perl_debug_log,
14187                       "floating %s%s at %"IVdf"..%"UVuf" ",
14188                       s, RE_SV_TAIL(r->float_substr),
14189                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14190     } else if (r->float_utf8) {
14191         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14192             RE_SV_DUMPLEN(r->float_utf8), 30);
14193         PerlIO_printf(Perl_debug_log,
14194                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14195                       s, RE_SV_TAIL(r->float_utf8),
14196                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14197     }
14198     if (r->check_substr || r->check_utf8)
14199         PerlIO_printf(Perl_debug_log,
14200                       (const char *)
14201                       (r->check_substr == r->float_substr
14202                        && r->check_utf8 == r->float_utf8
14203                        ? "(checking floating" : "(checking anchored"));
14204     if (r->extflags & RXf_NOSCAN)
14205         PerlIO_printf(Perl_debug_log, " noscan");
14206     if (r->extflags & RXf_CHECK_ALL)
14207         PerlIO_printf(Perl_debug_log, " isall");
14208     if (r->check_substr || r->check_utf8)
14209         PerlIO_printf(Perl_debug_log, ") ");
14210
14211     if (ri->regstclass) {
14212         regprop(r, sv, ri->regstclass);
14213         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14214     }
14215     if (r->extflags & RXf_ANCH) {
14216         PerlIO_printf(Perl_debug_log, "anchored");
14217         if (r->extflags & RXf_ANCH_BOL)
14218             PerlIO_printf(Perl_debug_log, "(BOL)");
14219         if (r->extflags & RXf_ANCH_MBOL)
14220             PerlIO_printf(Perl_debug_log, "(MBOL)");
14221         if (r->extflags & RXf_ANCH_SBOL)
14222             PerlIO_printf(Perl_debug_log, "(SBOL)");
14223         if (r->extflags & RXf_ANCH_GPOS)
14224             PerlIO_printf(Perl_debug_log, "(GPOS)");
14225         PerlIO_putc(Perl_debug_log, ' ');
14226     }
14227     if (r->extflags & RXf_GPOS_SEEN)
14228         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14229     if (r->intflags & PREGf_SKIP)
14230         PerlIO_printf(Perl_debug_log, "plus ");
14231     if (r->intflags & PREGf_IMPLICIT)
14232         PerlIO_printf(Perl_debug_log, "implicit ");
14233     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14234     if (r->extflags & RXf_EVAL_SEEN)
14235         PerlIO_printf(Perl_debug_log, "with eval ");
14236     PerlIO_printf(Perl_debug_log, "\n");
14237     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14238 #else
14239     PERL_ARGS_ASSERT_REGDUMP;
14240     PERL_UNUSED_CONTEXT;
14241     PERL_UNUSED_ARG(r);
14242 #endif  /* DEBUGGING */
14243 }
14244
14245 /*
14246 - regprop - printable representation of opcode
14247 */
14248 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14249 STMT_START { \
14250         if (do_sep) {                           \
14251             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14252             if (flags & ANYOF_INVERT)           \
14253                 /*make sure the invert info is in each */ \
14254                 sv_catpvs(sv, "^");             \
14255             do_sep = 0;                         \
14256         }                                       \
14257 } STMT_END
14258
14259 void
14260 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14261 {
14262 #ifdef DEBUGGING
14263     dVAR;
14264     int k;
14265
14266     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14267     static const char * const anyofs[] = {
14268 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14269     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14270     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14271     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14272     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14273     || _CC_VERTSPACE != 16
14274   #error Need to adjust order of anyofs[]
14275 #endif
14276         "[\\w]",
14277         "[\\W]",
14278         "[\\d]",
14279         "[\\D]",
14280         "[:alpha:]",
14281         "[:^alpha:]",
14282         "[:lower:]",
14283         "[:^lower:]",
14284         "[:upper:]",
14285         "[:^upper:]",
14286         "[:punct:]",
14287         "[:^punct:]",
14288         "[:print:]",
14289         "[:^print:]",
14290         "[:alnum:]",
14291         "[:^alnum:]",
14292         "[:graph:]",
14293         "[:^graph:]",
14294         "[:cased:]",
14295         "[:^cased:]",
14296         "[\\s]",
14297         "[\\S]",
14298         "[:blank:]",
14299         "[:^blank:]",
14300         "[:xdigit:]",
14301         "[:^xdigit:]",
14302         "[:space:]",
14303         "[:^space:]",
14304         "[:cntrl:]",
14305         "[:^cntrl:]",
14306         "[:ascii:]",
14307         "[:^ascii:]",
14308         "[\\v]",
14309         "[\\V]"
14310     };
14311     RXi_GET_DECL(prog,progi);
14312     GET_RE_DEBUG_FLAGS_DECL;
14313
14314     PERL_ARGS_ASSERT_REGPROP;
14315
14316     sv_setpvs(sv, "");
14317
14318     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
14319         /* It would be nice to FAIL() here, but this may be called from
14320            regexec.c, and it would be hard to supply pRExC_state. */
14321         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14322     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14323
14324     k = PL_regkind[OP(o)];
14325
14326     if (k == EXACT) {
14327         sv_catpvs(sv, " ");
14328         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14329          * is a crude hack but it may be the best for now since
14330          * we have no flag "this EXACTish node was UTF-8"
14331          * --jhi */
14332         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14333                   PERL_PV_ESCAPE_UNI_DETECT |
14334                   PERL_PV_ESCAPE_NONASCII   |
14335                   PERL_PV_PRETTY_ELLIPSES   |
14336                   PERL_PV_PRETTY_LTGT       |
14337                   PERL_PV_PRETTY_NOCLEAR
14338                   );
14339     } else if (k == TRIE) {
14340         /* print the details of the trie in dumpuntil instead, as
14341          * progi->data isn't available here */
14342         const char op = OP(o);
14343         const U32 n = ARG(o);
14344         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14345                (reg_ac_data *)progi->data->data[n] :
14346                NULL;
14347         const reg_trie_data * const trie
14348             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14349
14350         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14351         DEBUG_TRIE_COMPILE_r(
14352             Perl_sv_catpvf(aTHX_ sv,
14353                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14354                 (UV)trie->startstate,
14355                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14356                 (UV)trie->wordcount,
14357                 (UV)trie->minlen,
14358                 (UV)trie->maxlen,
14359                 (UV)TRIE_CHARCOUNT(trie),
14360                 (UV)trie->uniquecharcount
14361             )
14362         );
14363         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14364             int i;
14365             int rangestart = -1;
14366             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14367             sv_catpvs(sv, "[");
14368             for (i = 0; i <= 256; i++) {
14369                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
14370                     if (rangestart == -1)
14371                         rangestart = i;
14372                 } else if (rangestart != -1) {
14373                     if (i <= rangestart + 3)
14374                         for (; rangestart < i; rangestart++)
14375                             put_byte(sv, rangestart);
14376                     else {
14377                         put_byte(sv, rangestart);
14378                         sv_catpvs(sv, "-");
14379                         put_byte(sv, i - 1);
14380                     }
14381                     rangestart = -1;
14382                 }
14383             }
14384             sv_catpvs(sv, "]");
14385         }
14386
14387     } else if (k == CURLY) {
14388         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14389             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14390         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14391     }
14392     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
14393         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14394     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14395         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
14396         if ( RXp_PAREN_NAMES(prog) ) {
14397             if ( k != REF || (OP(o) < NREF)) {
14398                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14399                 SV **name= av_fetch(list, ARG(o), 0 );
14400                 if (name)
14401                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14402             }
14403             else {
14404                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14405                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14406                 I32 *nums=(I32*)SvPVX(sv_dat);
14407                 SV **name= av_fetch(list, nums[0], 0 );
14408                 I32 n;
14409                 if (name) {
14410                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
14411                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14412                                     (n ? "," : ""), (IV)nums[n]);
14413                     }
14414                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14415                 }
14416             }
14417         }
14418     } else if (k == GOSUB)
14419         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14420     else if (k == VERB) {
14421         if (!o->flags)
14422             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14423                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14424     } else if (k == LOGICAL)
14425         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
14426     else if (k == ANYOF) {
14427         int i, rangestart = -1;
14428         const U8 flags = ANYOF_FLAGS(o);
14429         int do_sep = 0;
14430
14431
14432         if (flags & ANYOF_LOCALE)
14433             sv_catpvs(sv, "{loc}");
14434         if (flags & ANYOF_LOC_FOLD)
14435             sv_catpvs(sv, "{i}");
14436         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14437         if (flags & ANYOF_INVERT)
14438             sv_catpvs(sv, "^");
14439
14440         /* output what the standard cp 0-255 bitmap matches */
14441         for (i = 0; i <= 256; i++) {
14442             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14443                 if (rangestart == -1)
14444                     rangestart = i;
14445             } else if (rangestart != -1) {
14446                 if (i <= rangestart + 3)
14447                     for (; rangestart < i; rangestart++)
14448                         put_byte(sv, rangestart);
14449                 else {
14450                     put_byte(sv, rangestart);
14451                     sv_catpvs(sv, "-");
14452                     put_byte(sv, i - 1);
14453                 }
14454                 do_sep = 1;
14455                 rangestart = -1;
14456             }
14457         }
14458
14459         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14460         /* output any special charclass tests (used entirely under use locale) */
14461         if (ANYOF_CLASS_TEST_ANY_SET(o))
14462             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14463                 if (ANYOF_CLASS_TEST(o,i)) {
14464                     sv_catpv(sv, anyofs[i]);
14465                     do_sep = 1;
14466                 }
14467
14468         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14469
14470         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14471             sv_catpvs(sv, "{non-utf8-latin1-all}");
14472         }
14473
14474         /* output information about the unicode matching */
14475         if (flags & ANYOF_UNICODE_ALL)
14476             sv_catpvs(sv, "{unicode_all}");
14477         else if (ANYOF_NONBITMAP(o))
14478             sv_catpvs(sv, "{unicode}");
14479         if (flags & ANYOF_NONBITMAP_NON_UTF8)
14480             sv_catpvs(sv, "{outside bitmap}");
14481
14482         if (ANYOF_NONBITMAP(o)) {
14483             SV *lv; /* Set if there is something outside the bit map */
14484             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14485             bool byte_output = FALSE;   /* If something in the bitmap has been
14486                                            output */
14487
14488             if (lv && lv != &PL_sv_undef) {
14489                 if (sw) {
14490                     U8 s[UTF8_MAXBYTES_CASE+1];
14491
14492                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14493                         uvchr_to_utf8(s, i);
14494
14495                         if (i < 256
14496                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14497                                                                things already
14498                                                                output as part
14499                                                                of the bitmap */
14500                             && swash_fetch(sw, s, TRUE))
14501                         {
14502                             if (rangestart == -1)
14503                                 rangestart = i;
14504                         } else if (rangestart != -1) {
14505                             byte_output = TRUE;
14506                             if (i <= rangestart + 3)
14507                                 for (; rangestart < i; rangestart++) {
14508                                     put_byte(sv, rangestart);
14509                                 }
14510                             else {
14511                                 put_byte(sv, rangestart);
14512                                 sv_catpvs(sv, "-");
14513                                 put_byte(sv, i-1);
14514                             }
14515                             rangestart = -1;
14516                         }
14517                     }
14518                 }
14519
14520                 {
14521                     char *s = savesvpv(lv);
14522                     char * const origs = s;
14523
14524                     while (*s && *s != '\n')
14525                         s++;
14526
14527                     if (*s == '\n') {
14528                         const char * const t = ++s;
14529
14530                         if (byte_output) {
14531                             sv_catpvs(sv, " ");
14532                         }
14533
14534                         while (*s) {
14535                             if (*s == '\n') {
14536
14537                                 /* Truncate very long output */
14538                                 if (s - origs > 256) {
14539                                     Perl_sv_catpvf(aTHX_ sv,
14540                                                    "%.*s...",
14541                                                    (int) (s - origs - 1),
14542                                                    t);
14543                                     goto out_dump;
14544                                 }
14545                                 *s = ' ';
14546                             }
14547                             else if (*s == '\t') {
14548                                 *s = '-';
14549                             }
14550                             s++;
14551                         }
14552                         if (s[-1] == ' ')
14553                             s[-1] = 0;
14554
14555                         sv_catpv(sv, t);
14556                     }
14557
14558                 out_dump:
14559
14560                     Safefree(origs);
14561                 }
14562                 SvREFCNT_dec_NN(lv);
14563             }
14564         }
14565
14566         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14567     }
14568     else if (k == POSIXD || k == NPOSIXD) {
14569         U8 index = FLAGS(o) * 2;
14570         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14571             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14572         }
14573         else {
14574             sv_catpv(sv, anyofs[index]);
14575         }
14576     }
14577     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14578         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14579 #else
14580     PERL_UNUSED_CONTEXT;
14581     PERL_UNUSED_ARG(sv);
14582     PERL_UNUSED_ARG(o);
14583     PERL_UNUSED_ARG(prog);
14584 #endif  /* DEBUGGING */
14585 }
14586
14587 SV *
14588 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14589 {                               /* Assume that RE_INTUIT is set */
14590     dVAR;
14591     struct regexp *const prog = ReANY(r);
14592     GET_RE_DEBUG_FLAGS_DECL;
14593
14594     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14595     PERL_UNUSED_CONTEXT;
14596
14597     DEBUG_COMPILE_r(
14598         {
14599             const char * const s = SvPV_nolen_const(prog->check_substr
14600                       ? prog->check_substr : prog->check_utf8);
14601
14602             if (!PL_colorset) reginitcolors();
14603             PerlIO_printf(Perl_debug_log,
14604                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14605                       PL_colors[4],
14606                       prog->check_substr ? "" : "utf8 ",
14607                       PL_colors[5],PL_colors[0],
14608                       s,
14609                       PL_colors[1],
14610                       (strlen(s) > 60 ? "..." : ""));
14611         } );
14612
14613     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14614 }
14615
14616 /*
14617    pregfree()
14618
14619    handles refcounting and freeing the perl core regexp structure. When
14620    it is necessary to actually free the structure the first thing it
14621    does is call the 'free' method of the regexp_engine associated to
14622    the regexp, allowing the handling of the void *pprivate; member
14623    first. (This routine is not overridable by extensions, which is why
14624    the extensions free is called first.)
14625
14626    See regdupe and regdupe_internal if you change anything here.
14627 */
14628 #ifndef PERL_IN_XSUB_RE
14629 void
14630 Perl_pregfree(pTHX_ REGEXP *r)
14631 {
14632     SvREFCNT_dec(r);
14633 }
14634
14635 void
14636 Perl_pregfree2(pTHX_ REGEXP *rx)
14637 {
14638     dVAR;
14639     struct regexp *const r = ReANY(rx);
14640     GET_RE_DEBUG_FLAGS_DECL;
14641
14642     PERL_ARGS_ASSERT_PREGFREE2;
14643
14644     if (r->mother_re) {
14645         ReREFCNT_dec(r->mother_re);
14646     } else {
14647         CALLREGFREE_PVT(rx); /* free the private data */
14648         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14649         Safefree(r->xpv_len_u.xpvlenu_pv);
14650     }
14651     if (r->substrs) {
14652         SvREFCNT_dec(r->anchored_substr);
14653         SvREFCNT_dec(r->anchored_utf8);
14654         SvREFCNT_dec(r->float_substr);
14655         SvREFCNT_dec(r->float_utf8);
14656         Safefree(r->substrs);
14657     }
14658     RX_MATCH_COPY_FREE(rx);
14659 #ifdef PERL_ANY_COW
14660     SvREFCNT_dec(r->saved_copy);
14661 #endif
14662     Safefree(r->offs);
14663     SvREFCNT_dec(r->qr_anoncv);
14664     rx->sv_u.svu_rx = 0;
14665 }
14666
14667 /*  reg_temp_copy()
14668
14669     This is a hacky workaround to the structural issue of match results
14670     being stored in the regexp structure which is in turn stored in
14671     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14672     could be PL_curpm in multiple contexts, and could require multiple
14673     result sets being associated with the pattern simultaneously, such
14674     as when doing a recursive match with (??{$qr})
14675
14676     The solution is to make a lightweight copy of the regexp structure
14677     when a qr// is returned from the code executed by (??{$qr}) this
14678     lightweight copy doesn't actually own any of its data except for
14679     the starp/end and the actual regexp structure itself.
14680
14681 */
14682
14683
14684 REGEXP *
14685 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14686 {
14687     struct regexp *ret;
14688     struct regexp *const r = ReANY(rx);
14689     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14690
14691     PERL_ARGS_ASSERT_REG_TEMP_COPY;
14692
14693     if (!ret_x)
14694         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14695     else {
14696         SvOK_off((SV *)ret_x);
14697         if (islv) {
14698             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14699                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
14700                made both spots point to the same regexp body.) */
14701             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14702             assert(!SvPVX(ret_x));
14703             ret_x->sv_u.svu_rx = temp->sv_any;
14704             temp->sv_any = NULL;
14705             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14706             SvREFCNT_dec_NN(temp);
14707             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14708                ing below will not set it. */
14709             SvCUR_set(ret_x, SvCUR(rx));
14710         }
14711     }
14712     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14713        sv_force_normal(sv) is called.  */
14714     SvFAKE_on(ret_x);
14715     ret = ReANY(ret_x);
14716
14717     SvFLAGS(ret_x) |= SvUTF8(rx);
14718     /* We share the same string buffer as the original regexp, on which we
14719        hold a reference count, incremented when mother_re is set below.
14720        The string pointer is copied here, being part of the regexp struct.
14721      */
14722     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14723            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14724     if (r->offs) {
14725         const I32 npar = r->nparens+1;
14726         Newx(ret->offs, npar, regexp_paren_pair);
14727         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14728     }
14729     if (r->substrs) {
14730         Newx(ret->substrs, 1, struct reg_substr_data);
14731         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14732
14733         SvREFCNT_inc_void(ret->anchored_substr);
14734         SvREFCNT_inc_void(ret->anchored_utf8);
14735         SvREFCNT_inc_void(ret->float_substr);
14736         SvREFCNT_inc_void(ret->float_utf8);
14737
14738         /* check_substr and check_utf8, if non-NULL, point to either their
14739            anchored or float namesakes, and don't hold a second reference.  */
14740     }
14741     RX_MATCH_COPIED_off(ret_x);
14742 #ifdef PERL_ANY_COW
14743     ret->saved_copy = NULL;
14744 #endif
14745     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14746     SvREFCNT_inc_void(ret->qr_anoncv);
14747
14748     return ret_x;
14749 }
14750 #endif
14751
14752 /* regfree_internal()
14753
14754    Free the private data in a regexp. This is overloadable by
14755    extensions. Perl takes care of the regexp structure in pregfree(),
14756    this covers the *pprivate pointer which technically perl doesn't
14757    know about, however of course we have to handle the
14758    regexp_internal structure when no extension is in use.
14759
14760    Note this is called before freeing anything in the regexp
14761    structure.
14762  */
14763
14764 void
14765 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14766 {
14767     dVAR;
14768     struct regexp *const r = ReANY(rx);
14769     RXi_GET_DECL(r,ri);
14770     GET_RE_DEBUG_FLAGS_DECL;
14771
14772     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14773
14774     DEBUG_COMPILE_r({
14775         if (!PL_colorset)
14776             reginitcolors();
14777         {
14778             SV *dsv= sv_newmortal();
14779             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14780                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14781             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14782                 PL_colors[4],PL_colors[5],s);
14783         }
14784     });
14785 #ifdef RE_TRACK_PATTERN_OFFSETS
14786     if (ri->u.offsets)
14787         Safefree(ri->u.offsets);             /* 20010421 MJD */
14788 #endif
14789     if (ri->code_blocks) {
14790         int n;
14791         for (n = 0; n < ri->num_code_blocks; n++)
14792             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14793         Safefree(ri->code_blocks);
14794     }
14795
14796     if (ri->data) {
14797         int n = ri->data->count;
14798
14799         while (--n >= 0) {
14800           /* If you add a ->what type here, update the comment in regcomp.h */
14801             switch (ri->data->what[n]) {
14802             case 'a':
14803             case 'r':
14804             case 's':
14805             case 'S':
14806             case 'u':
14807                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14808                 break;
14809             case 'f':
14810                 Safefree(ri->data->data[n]);
14811                 break;
14812             case 'l':
14813             case 'L':
14814                 break;
14815             case 'T':
14816                 { /* Aho Corasick add-on structure for a trie node.
14817                      Used in stclass optimization only */
14818                     U32 refcount;
14819                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14820                     OP_REFCNT_LOCK;
14821                     refcount = --aho->refcount;
14822                     OP_REFCNT_UNLOCK;
14823                     if ( !refcount ) {
14824                         PerlMemShared_free(aho->states);
14825                         PerlMemShared_free(aho->fail);
14826                          /* do this last!!!! */
14827                         PerlMemShared_free(ri->data->data[n]);
14828                         PerlMemShared_free(ri->regstclass);
14829                     }
14830                 }
14831                 break;
14832             case 't':
14833                 {
14834                     /* trie structure. */
14835                     U32 refcount;
14836                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14837                     OP_REFCNT_LOCK;
14838                     refcount = --trie->refcount;
14839                     OP_REFCNT_UNLOCK;
14840                     if ( !refcount ) {
14841                         PerlMemShared_free(trie->charmap);
14842                         PerlMemShared_free(trie->states);
14843                         PerlMemShared_free(trie->trans);
14844                         if (trie->bitmap)
14845                             PerlMemShared_free(trie->bitmap);
14846                         if (trie->jump)
14847                             PerlMemShared_free(trie->jump);
14848                         PerlMemShared_free(trie->wordinfo);
14849                         /* do this last!!!! */
14850                         PerlMemShared_free(ri->data->data[n]);
14851                     }
14852                 }
14853                 break;
14854             default:
14855                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14856             }
14857         }
14858         Safefree(ri->data->what);
14859         Safefree(ri->data);
14860     }
14861
14862     Safefree(ri);
14863 }
14864
14865 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14866 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14867 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14868
14869 /*
14870    re_dup - duplicate a regexp.
14871
14872    This routine is expected to clone a given regexp structure. It is only
14873    compiled under USE_ITHREADS.
14874
14875    After all of the core data stored in struct regexp is duplicated
14876    the regexp_engine.dupe method is used to copy any private data
14877    stored in the *pprivate pointer. This allows extensions to handle
14878    any duplication it needs to do.
14879
14880    See pregfree() and regfree_internal() if you change anything here.
14881 */
14882 #if defined(USE_ITHREADS)
14883 #ifndef PERL_IN_XSUB_RE
14884 void
14885 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14886 {
14887     dVAR;
14888     I32 npar;
14889     const struct regexp *r = ReANY(sstr);
14890     struct regexp *ret = ReANY(dstr);
14891
14892     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14893
14894     npar = r->nparens+1;
14895     Newx(ret->offs, npar, regexp_paren_pair);
14896     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14897     if(ret->swap) {
14898         /* no need to copy these */
14899         Newx(ret->swap, npar, regexp_paren_pair);
14900     }
14901
14902     if (ret->substrs) {
14903         /* Do it this way to avoid reading from *r after the StructCopy().
14904            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14905            cache, it doesn't matter.  */
14906         const bool anchored = r->check_substr
14907             ? r->check_substr == r->anchored_substr
14908             : r->check_utf8 == r->anchored_utf8;
14909         Newx(ret->substrs, 1, struct reg_substr_data);
14910         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14911
14912         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14913         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14914         ret->float_substr = sv_dup_inc(ret->float_substr, param);
14915         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14916
14917         /* check_substr and check_utf8, if non-NULL, point to either their
14918            anchored or float namesakes, and don't hold a second reference.  */
14919
14920         if (ret->check_substr) {
14921             if (anchored) {
14922                 assert(r->check_utf8 == r->anchored_utf8);
14923                 ret->check_substr = ret->anchored_substr;
14924                 ret->check_utf8 = ret->anchored_utf8;
14925             } else {
14926                 assert(r->check_substr == r->float_substr);
14927                 assert(r->check_utf8 == r->float_utf8);
14928                 ret->check_substr = ret->float_substr;
14929                 ret->check_utf8 = ret->float_utf8;
14930             }
14931         } else if (ret->check_utf8) {
14932             if (anchored) {
14933                 ret->check_utf8 = ret->anchored_utf8;
14934             } else {
14935                 ret->check_utf8 = ret->float_utf8;
14936             }
14937         }
14938     }
14939
14940     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14941     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14942
14943     if (ret->pprivate)
14944         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14945
14946     if (RX_MATCH_COPIED(dstr))
14947         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
14948     else
14949         ret->subbeg = NULL;
14950 #ifdef PERL_ANY_COW
14951     ret->saved_copy = NULL;
14952 #endif
14953
14954     /* Whether mother_re be set or no, we need to copy the string.  We
14955        cannot refrain from copying it when the storage points directly to
14956        our mother regexp, because that's
14957                1: a buffer in a different thread
14958                2: something we no longer hold a reference on
14959                so we need to copy it locally.  */
14960     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14961     ret->mother_re   = NULL;
14962     ret->gofs = 0;
14963 }
14964 #endif /* PERL_IN_XSUB_RE */
14965
14966 /*
14967    regdupe_internal()
14968
14969    This is the internal complement to regdupe() which is used to copy
14970    the structure pointed to by the *pprivate pointer in the regexp.
14971    This is the core version of the extension overridable cloning hook.
14972    The regexp structure being duplicated will be copied by perl prior
14973    to this and will be provided as the regexp *r argument, however
14974    with the /old/ structures pprivate pointer value. Thus this routine
14975    may override any copying normally done by perl.
14976
14977    It returns a pointer to the new regexp_internal structure.
14978 */
14979
14980 void *
14981 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14982 {
14983     dVAR;
14984     struct regexp *const r = ReANY(rx);
14985     regexp_internal *reti;
14986     int len;
14987     RXi_GET_DECL(r,ri);
14988
14989     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14990
14991     len = ProgLen(ri);
14992
14993     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14994     Copy(ri->program, reti->program, len+1, regnode);
14995
14996     reti->num_code_blocks = ri->num_code_blocks;
14997     if (ri->code_blocks) {
14998         int n;
14999         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15000                 struct reg_code_block);
15001         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15002                 struct reg_code_block);
15003         for (n = 0; n < ri->num_code_blocks; n++)
15004              reti->code_blocks[n].src_regex = (REGEXP*)
15005                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15006     }
15007     else
15008         reti->code_blocks = NULL;
15009
15010     reti->regstclass = NULL;
15011
15012     if (ri->data) {
15013         struct reg_data *d;
15014         const int count = ri->data->count;
15015         int i;
15016
15017         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15018                 char, struct reg_data);
15019         Newx(d->what, count, U8);
15020
15021         d->count = count;
15022         for (i = 0; i < count; i++) {
15023             d->what[i] = ri->data->what[i];
15024             switch (d->what[i]) {
15025                 /* see also regcomp.h and regfree_internal() */
15026             case 'a': /* actually an AV, but the dup function is identical.  */
15027             case 'r':
15028             case 's':
15029             case 'S':
15030             case 'u': /* actually an HV, but the dup function is identical.  */
15031                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15032                 break;
15033             case 'f':
15034                 /* This is cheating. */
15035                 Newx(d->data[i], 1, struct regnode_charclass_class);
15036                 StructCopy(ri->data->data[i], d->data[i],
15037                             struct regnode_charclass_class);
15038                 reti->regstclass = (regnode*)d->data[i];
15039                 break;
15040             case 'T':
15041                 /* Trie stclasses are readonly and can thus be shared
15042                  * without duplication. We free the stclass in pregfree
15043                  * when the corresponding reg_ac_data struct is freed.
15044                  */
15045                 reti->regstclass= ri->regstclass;
15046                 /* Fall through */
15047             case 't':
15048                 OP_REFCNT_LOCK;
15049                 ((reg_trie_data*)ri->data->data[i])->refcount++;
15050                 OP_REFCNT_UNLOCK;
15051                 /* Fall through */
15052             case 'l':
15053             case 'L':
15054                 d->data[i] = ri->data->data[i];
15055                 break;
15056             default:
15057                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15058             }
15059         }
15060
15061         reti->data = d;
15062     }
15063     else
15064         reti->data = NULL;
15065
15066     reti->name_list_idx = ri->name_list_idx;
15067
15068 #ifdef RE_TRACK_PATTERN_OFFSETS
15069     if (ri->u.offsets) {
15070         Newx(reti->u.offsets, 2*len+1, U32);
15071         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15072     }
15073 #else
15074     SetProgLen(reti,len);
15075 #endif
15076
15077     return (void*)reti;
15078 }
15079
15080 #endif    /* USE_ITHREADS */
15081
15082 #ifndef PERL_IN_XSUB_RE
15083
15084 /*
15085  - regnext - dig the "next" pointer out of a node
15086  */
15087 regnode *
15088 Perl_regnext(pTHX_ regnode *p)
15089 {
15090     dVAR;
15091     I32 offset;
15092
15093     if (!p)
15094         return(NULL);
15095
15096     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
15097         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15098     }
15099
15100     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15101     if (offset == 0)
15102         return(NULL);
15103
15104     return(p+offset);
15105 }
15106 #endif
15107
15108 STATIC void
15109 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15110 {
15111     va_list args;
15112     STRLEN l1 = strlen(pat1);
15113     STRLEN l2 = strlen(pat2);
15114     char buf[512];
15115     SV *msv;
15116     const char *message;
15117
15118     PERL_ARGS_ASSERT_RE_CROAK2;
15119
15120     if (l1 > 510)
15121         l1 = 510;
15122     if (l1 + l2 > 510)
15123         l2 = 510 - l1;
15124     Copy(pat1, buf, l1 , char);
15125     Copy(pat2, buf + l1, l2 , char);
15126     buf[l1 + l2] = '\n';
15127     buf[l1 + l2 + 1] = '\0';
15128 #ifdef I_STDARG
15129     /* ANSI variant takes additional second argument */
15130     va_start(args, pat2);
15131 #else
15132     va_start(args);
15133 #endif
15134     msv = vmess(buf, &args);
15135     va_end(args);
15136     message = SvPV_const(msv,l1);
15137     if (l1 > 512)
15138         l1 = 512;
15139     Copy(message, buf, l1 , char);
15140     buf[l1-1] = '\0';                   /* Overwrite \n */
15141     Perl_croak(aTHX_ "%s", buf);
15142 }
15143
15144 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15145
15146 #ifndef PERL_IN_XSUB_RE
15147 void
15148 Perl_save_re_context(pTHX)
15149 {
15150     dVAR;
15151
15152     struct re_save_state *state;
15153
15154     SAVEVPTR(PL_curcop);
15155     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15156
15157     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15158     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15159     SSPUSHUV(SAVEt_RE_STATE);
15160
15161     Copy(&PL_reg_state, state, 1, struct re_save_state);
15162
15163     PL_reg_oldsaved = NULL;
15164     PL_reg_oldsavedlen = 0;
15165     PL_reg_oldsavedoffset = 0;
15166     PL_reg_oldsavedcoffset = 0;
15167     PL_reg_maxiter = 0;
15168     PL_reg_leftiter = 0;
15169     PL_reg_poscache = NULL;
15170     PL_reg_poscache_size = 0;
15171 #ifdef PERL_ANY_COW
15172     PL_nrs = NULL;
15173 #endif
15174
15175     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15176     if (PL_curpm) {
15177         const REGEXP * const rx = PM_GETRE(PL_curpm);
15178         if (rx) {
15179             U32 i;
15180             for (i = 1; i <= RX_NPARENS(rx); i++) {
15181                 char digits[TYPE_CHARS(long)];
15182                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15183                 GV *const *const gvp
15184                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15185
15186                 if (gvp) {
15187                     GV * const gv = *gvp;
15188                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15189                         save_scalar(gv);
15190                 }
15191             }
15192         }
15193     }
15194 }
15195 #endif
15196
15197 #ifdef DEBUGGING
15198
15199 STATIC void
15200 S_put_byte(pTHX_ SV *sv, int c)
15201 {
15202     PERL_ARGS_ASSERT_PUT_BYTE;
15203
15204     /* Our definition of isPRINT() ignores locales, so only bytes that are
15205        not part of UTF-8 are considered printable. I assume that the same
15206        holds for UTF-EBCDIC.
15207        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15208        which Wikipedia says:
15209
15210        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15211        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15212        identical, to the ASCII delete (DEL) or rubout control character.
15213        ) So the old condition can be simplified to !isPRINT(c)  */
15214     if (!isPRINT(c)) {
15215         if (c < 256) {
15216             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15217         }
15218         else {
15219             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15220         }
15221     }
15222     else {
15223         const char string = c;
15224         if (c == '-' || c == ']' || c == '\\' || c == '^')
15225             sv_catpvs(sv, "\\");
15226         sv_catpvn(sv, &string, 1);
15227     }
15228 }
15229
15230
15231 #define CLEAR_OPTSTART \
15232     if (optstart) STMT_START { \
15233             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15234             optstart=NULL; \
15235     } STMT_END
15236
15237 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15238
15239 STATIC const regnode *
15240 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15241             const regnode *last, const regnode *plast,
15242             SV* sv, I32 indent, U32 depth)
15243 {
15244     dVAR;
15245     U8 op = PSEUDO;     /* Arbitrary non-END op. */
15246     const regnode *next;
15247     const regnode *optstart= NULL;
15248
15249     RXi_GET_DECL(r,ri);
15250     GET_RE_DEBUG_FLAGS_DECL;
15251
15252     PERL_ARGS_ASSERT_DUMPUNTIL;
15253
15254 #ifdef DEBUG_DUMPUNTIL
15255     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15256         last ? last-start : 0,plast ? plast-start : 0);
15257 #endif
15258
15259     if (plast && plast < last)
15260         last= plast;
15261
15262     while (PL_regkind[op] != END && (!last || node < last)) {
15263         /* While that wasn't END last time... */
15264         NODE_ALIGN(node);
15265         op = OP(node);
15266         if (op == CLOSE || op == WHILEM)
15267             indent--;
15268         next = regnext((regnode *)node);
15269
15270         /* Where, what. */
15271         if (OP(node) == OPTIMIZED) {
15272             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15273                 optstart = node;
15274             else
15275                 goto after_print;
15276         } else
15277             CLEAR_OPTSTART;
15278
15279         regprop(r, sv, node);
15280         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15281                       (int)(2*indent + 1), "", SvPVX_const(sv));
15282
15283         if (OP(node) != OPTIMIZED) {
15284             if (next == NULL)           /* Next ptr. */
15285                 PerlIO_printf(Perl_debug_log, " (0)");
15286             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15287                 PerlIO_printf(Perl_debug_log, " (FAIL)");
15288             else
15289                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15290             (void)PerlIO_putc(Perl_debug_log, '\n');
15291         }
15292
15293       after_print:
15294         if (PL_regkind[(U8)op] == BRANCHJ) {
15295             assert(next);
15296             {
15297                 const regnode *nnode = (OP(next) == LONGJMP
15298                                        ? regnext((regnode *)next)
15299                                        : next);
15300                 if (last && nnode > last)
15301                     nnode = last;
15302                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15303             }
15304         }
15305         else if (PL_regkind[(U8)op] == BRANCH) {
15306             assert(next);
15307             DUMPUNTIL(NEXTOPER(node), next);
15308         }
15309         else if ( PL_regkind[(U8)op]  == TRIE ) {
15310             const regnode *this_trie = node;
15311             const char op = OP(node);
15312             const U32 n = ARG(node);
15313             const reg_ac_data * const ac = op>=AHOCORASICK ?
15314                (reg_ac_data *)ri->data->data[n] :
15315                NULL;
15316             const reg_trie_data * const trie =
15317                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15318 #ifdef DEBUGGING
15319             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15320 #endif
15321             const regnode *nextbranch= NULL;
15322             I32 word_idx;
15323             sv_setpvs(sv, "");
15324             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15325                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15326
15327                 PerlIO_printf(Perl_debug_log, "%*s%s ",
15328                    (int)(2*(indent+3)), "",
15329                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15330                             PL_colors[0], PL_colors[1],
15331                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15332                             PERL_PV_PRETTY_ELLIPSES    |
15333                             PERL_PV_PRETTY_LTGT
15334                             )
15335                             : "???"
15336                 );
15337                 if (trie->jump) {
15338                     U16 dist= trie->jump[word_idx+1];
15339                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15340                                   (UV)((dist ? this_trie + dist : next) - start));
15341                     if (dist) {
15342                         if (!nextbranch)
15343                             nextbranch= this_trie + trie->jump[0];
15344                         DUMPUNTIL(this_trie + dist, nextbranch);
15345                     }
15346                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15347                         nextbranch= regnext((regnode *)nextbranch);
15348                 } else {
15349                     PerlIO_printf(Perl_debug_log, "\n");
15350                 }
15351             }
15352             if (last && next > last)
15353                 node= last;
15354             else
15355                 node= next;
15356         }
15357         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15358             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15359                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15360         }
15361         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15362             assert(next);
15363             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15364         }
15365         else if ( op == PLUS || op == STAR) {
15366             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15367         }
15368         else if (PL_regkind[(U8)op] == ANYOF) {
15369             /* arglen 1 + class block */
15370             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15371                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15372             node = NEXTOPER(node);
15373         }
15374         else if (PL_regkind[(U8)op] == EXACT) {
15375             /* Literal string, where present. */
15376             node += NODE_SZ_STR(node) - 1;
15377             node = NEXTOPER(node);
15378         }
15379         else {
15380             node = NEXTOPER(node);
15381             node += regarglen[(U8)op];
15382         }
15383         if (op == CURLYX || op == OPEN)
15384             indent++;
15385     }
15386     CLEAR_OPTSTART;
15387 #ifdef DEBUG_DUMPUNTIL
15388     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15389 #endif
15390     return node;
15391 }
15392
15393 #endif  /* DEBUGGING */
15394
15395 /*
15396  * Local variables:
15397  * c-indentation-style: bsd
15398  * c-basic-offset: 4
15399  * indent-tabs-mode: nil
15400  * End:
15401  *
15402  * ex: set ts=8 sts=4 sw=4 et:
15403  */