regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC  static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117     U32         flags;                  /* RXf_* are we folding, multilining? */
 118     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 119     char        *precomp;               /* uncompiled string. */
 120     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 121     regexp      *rx;                    /* perl core regexp structure */
 122     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 123     char        *start;                 /* Start of input for compile */
 124     char        *end;                   /* End of input for compile */
 125     char        *parse;                 /* Input-scan pointer. */
 126     I32         whilem_seen;            /* number of WHILEM in this expr */
 127     regnode     *emit_start;            /* Start of emitted-code area */
 128     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 129     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 130     I32         naughty;                /* How bad is this pattern? */
 131     I32         sawback;                /* Did we see \1, ...? */
 132     U32         seen;
 133     I32         size;                   /* Code size. */
 134     I32         npar;                   /* Capture buffer count, (OPEN). */
 135     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 136     I32         nestroot;               /* root parens we are in - used by accept */
 137     I32         extralen;
 138     I32         seen_zerolen;
 139     regnode     **open_parens;          /* pointers to open parens */
 140     regnode     **close_parens;         /* pointers to close parens */
 141     regnode     *opend;                 /* END node in program */
 142     I32         utf8;           /* whether the pattern is utf8 or not */
 143     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 144                                 /* XXX use this for future optimisation of case
 145                                  * where pattern must be upgraded to utf8. */
 146     I32         uni_semantics;  /* If a d charset modifier should use unicode
 147                                    rules, even if the pattern is not in
 148                                    utf8 */
 149     HV          *paren_names;           /* Paren names */
 150
 151     regnode     **recurse;              /* Recurse regops */
 152     I32         recurse_count;          /* Number of recurse regops */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         override_recoding;
 156     I32         in_multi_char_class;
 157     struct reg_code_block *code_blocks; /* positions of literal (?{})
 158                                             within pattern */
 159     int         num_code_blocks;        /* size of code_blocks[] */
 160     int         code_index;             /* next code_blocks[] slot */
 161 #if ADD_TO_REGEXEC
 162     char        *starttry;              /* -Dr: where regtry was called. */
 163 #define RExC_starttry   (pRExC_state->starttry)
 164 #endif
 165     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 166 #ifdef DEBUGGING
 167     const char  *lastparse;
 168     I32         lastnum;
 169     AV          *paren_name_list;       /* idx -> name */
 170 #define RExC_lastparse  (pRExC_state->lastparse)
 171 #define RExC_lastnum    (pRExC_state->lastnum)
 172 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 173 #endif
 174 } RExC_state_t;
 175
 176 #define RExC_flags      (pRExC_state->flags)
 177 #define RExC_pm_flags   (pRExC_state->pm_flags)
 178 #define RExC_precomp    (pRExC_state->precomp)
 179 #define RExC_rx_sv      (pRExC_state->rx_sv)
 180 #define RExC_rx         (pRExC_state->rx)
 181 #define RExC_rxi        (pRExC_state->rxi)
 182 #define RExC_start      (pRExC_state->start)
 183 #define RExC_end        (pRExC_state->end)
 184 #define RExC_parse      (pRExC_state->parse)
 185 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 186 #ifdef RE_TRACK_PATTERN_OFFSETS
 187 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 188 #endif
 189 #define RExC_emit       (pRExC_state->emit)
 190 #define RExC_emit_start (pRExC_state->emit_start)
 191 #define RExC_emit_bound (pRExC_state->emit_bound)
 192 #define RExC_naughty    (pRExC_state->naughty)
 193 #define RExC_sawback    (pRExC_state->sawback)
 194 #define RExC_seen       (pRExC_state->seen)
 195 #define RExC_size       (pRExC_state->size)
 196 #define RExC_npar       (pRExC_state->npar)
 197 #define RExC_nestroot   (pRExC_state->nestroot)
 198 #define RExC_extralen   (pRExC_state->extralen)
 199 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 200 #define RExC_utf8       (pRExC_state->utf8)
 201 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 202 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 203 #define RExC_open_parens        (pRExC_state->open_parens)
 204 #define RExC_close_parens       (pRExC_state->close_parens)
 205 #define RExC_opend      (pRExC_state->opend)
 206 #define RExC_paren_names        (pRExC_state->paren_names)
 207 #define RExC_recurse    (pRExC_state->recurse)
 208 #define RExC_recurse_count      (pRExC_state->recurse_count)
 209 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 210 #define RExC_contains_locale    (pRExC_state->contains_locale)
 211 #define RExC_override_recoding (pRExC_state->override_recoding)
 212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 213
 214
 215 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 216 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 217         ((*s) == '{' && regcurly(s, FALSE)))
 218
 219 #ifdef SPSTART
 220 #undef SPSTART          /* dratted cpp namespace... */
 221 #endif
 222 /*
 223  * Flags to be passed up and down.
 224  */
 225 #define WORST           0       /* Worst case. */
 226 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 227
 228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 229  * character.  (There needs to be a case: in the switch statement in regexec.c
 230  * for any node marked SIMPLE.)  Note that this is not the same thing as
 231  * REGNODE_SIMPLE */
 232 #define SIMPLE          0x02
 233 #define SPSTART         0x04    /* Starts with * or + */
 234 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 235 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 236
 237 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 238
 239 /* whether trie related optimizations are enabled */
 240 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 241 #define TRIE_STUDY_OPT
 242 #define FULL_TRIE_STUDY
 243 #define TRIE_STCLASS
 244 #endif
 245
 246
 247
 248 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 249 #define PBITVAL(paren) (1 << ((paren) & 7))
 250 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 251 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 252 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 253
 254 /* If not already in utf8, do a longjmp back to the beginning */
 255 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 256 #define REQUIRE_UTF8    STMT_START {                                       \
 257                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 258                         } STMT_END
 259
 260 /* This converts the named class defined in regcomp.h to its equivalent class
 261  * number defined in handy.h. */
 262 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 263 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 264
 265 /* About scan_data_t.
 266
 267   During optimisation we recurse through the regexp program performing
 268   various inplace (keyhole style) optimisations. In addition study_chunk
 269   and scan_commit populate this data structure with information about
 270   what strings MUST appear in the pattern. We look for the longest
 271   string that must appear at a fixed location, and we look for the
 272   longest string that may appear at a floating location. So for instance
 273   in the pattern:
 274
 275     /FOO[xX]A.*B[xX]BAR/
 276
 277   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 278   strings (because they follow a .* construct). study_chunk will identify
 279   both FOO and BAR as being the longest fixed and floating strings respectively.
 280
 281   The strings can be composites, for instance
 282
 283      /(f)(o)(o)/
 284
 285   will result in a composite fixed substring 'foo'.
 286
 287   For each string some basic information is maintained:
 288
 289   - offset or min_offset
 290     This is the position the string must appear at, or not before.
 291     It also implicitly (when combined with minlenp) tells us how many
 292     characters must match before the string we are searching for.
 293     Likewise when combined with minlenp and the length of the string it
 294     tells us how many characters must appear after the string we have
 295     found.
 296
 297   - max_offset
 298     Only used for floating strings. This is the rightmost point that
 299     the string can appear at. If set to I32 max it indicates that the
 300     string can occur infinitely far to the right.
 301
 302   - minlenp
 303     A pointer to the minimum number of characters of the pattern that the
 304     string was found inside. This is important as in the case of positive
 305     lookahead or positive lookbehind we can have multiple patterns
 306     involved. Consider
 307
 308     /(?=FOO).*F/
 309
 310     The minimum length of the pattern overall is 3, the minimum length
 311     of the lookahead part is 3, but the minimum length of the part that
 312     will actually match is 1. So 'FOO's minimum length is 3, but the
 313     minimum length for the F is 1. This is important as the minimum length
 314     is used to determine offsets in front of and behind the string being
 315     looked for.  Since strings can be composites this is the length of the
 316     pattern at the time it was committed with a scan_commit. Note that
 317     the length is calculated by study_chunk, so that the minimum lengths
 318     are not known until the full pattern has been compiled, thus the
 319     pointer to the value.
 320
 321   - lookbehind
 322
 323     In the case of lookbehind the string being searched for can be
 324     offset past the start point of the final matching string.
 325     If this value was just blithely removed from the min_offset it would
 326     invalidate some of the calculations for how many chars must match
 327     before or after (as they are derived from min_offset and minlen and
 328     the length of the string being searched for).
 329     When the final pattern is compiled and the data is moved from the
 330     scan_data_t structure into the regexp structure the information
 331     about lookbehind is factored in, with the information that would
 332     have been lost precalculated in the end_shift field for the
 333     associated string.
 334
 335   The fields pos_min and pos_delta are used to store the minimum offset
 336   and the delta to the maximum offset at the current point in the pattern.
 337
 338 */
 339
 340 typedef struct scan_data_t {
 341     /*I32 len_min;      unused */
 342     /*I32 len_delta;    unused */
 343     I32 pos_min;
 344     I32 pos_delta;
 345     SV *last_found;
 346     I32 last_end;           /* min value, <0 unless valid. */
 347     I32 last_start_min;
 348     I32 last_start_max;
 349     SV **longest;           /* Either &l_fixed, or &l_float. */
 350     SV *longest_fixed;      /* longest fixed string found in pattern */
 351     I32 offset_fixed;       /* offset where it starts */
 352     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 353     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 354     SV *longest_float;      /* longest floating string found in pattern */
 355     I32 offset_float_min;   /* earliest point in string it can appear */
 356     I32 offset_float_max;   /* latest point in string it can appear */
 357     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 358     I32 lookbehind_float;   /* is the position of the string modified by LB */
 359     I32 flags;
 360     I32 whilem_c;
 361     I32 *last_closep;
 362     struct regnode_charclass_class *start_class;
 363 } scan_data_t;
 364
 365 /*
 366  * Forward declarations for pregcomp()'s friends.
 367  */
 368
 369 static const scan_data_t zero_scan_data =
 370   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 371
 372 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 373 #define SF_BEFORE_SEOL          0x0001
 374 #define SF_BEFORE_MEOL          0x0002
 375 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 376 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 377
 378 #ifdef NO_UNARY_PLUS
 379 #  define SF_FIX_SHIFT_EOL      (0+2)
 380 #  define SF_FL_SHIFT_EOL               (0+4)
 381 #else
 382 #  define SF_FIX_SHIFT_EOL      (+2)
 383 #  define SF_FL_SHIFT_EOL               (+4)
 384 #endif
 385
 386 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 387 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 388
 389 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 390 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 391 #define SF_IS_INF               0x0040
 392 #define SF_HAS_PAR              0x0080
 393 #define SF_IN_PAR               0x0100
 394 #define SF_HAS_EVAL             0x0200
 395 #define SCF_DO_SUBSTR           0x0400
 396 #define SCF_DO_STCLASS_AND      0x0800
 397 #define SCF_DO_STCLASS_OR       0x1000
 398 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 399 #define SCF_WHILEM_VISITED_POS  0x2000
 400
 401 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 402 #define SCF_SEEN_ACCEPT         0x8000
 403
 404 #define UTF cBOOL(RExC_utf8)
 405
 406 /* The enums for all these are ordered so things work out correctly */
 407 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 408 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 409 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 410 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 411 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 412 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 413 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 414
 415 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 416
 417 #define OOB_NAMEDCLASS          -1
 418
 419 /* There is no code point that is out-of-bounds, so this is problematic.  But
 420  * its only current use is to initialize a variable that is always set before
 421  * looked at. */
 422 #define OOB_UNICODE             0xDEADBEEF
 423
 424 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 425 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 426
 427
 428 /* length of regex to show in messages that don't mark a position within */
 429 #define RegexLengthToShowInErrorMessages 127
 430
 431 /*
 432  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 433  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 434  * op/pragma/warn/regcomp.
 435  */
 436 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 437 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 438
 439 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 440
 441 /*
 442  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 443  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 444  * "...".
 445  */
 446 #define _FAIL(code) STMT_START {                                        \
 447     const char *ellipses = "";                                          \
 448     IV len = RExC_end - RExC_precomp;                                   \
 449                                                                         \
 450     if (!SIZE_ONLY)                                                     \
 451         SAVEFREESV(RExC_rx_sv);                                         \
 452     if (len > RegexLengthToShowInErrorMessages) {                       \
 453         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 454         len = RegexLengthToShowInErrorMessages - 10;                    \
 455         ellipses = "...";                                               \
 456     }                                                                   \
 457     code;                                                               \
 458 } STMT_END
 459
 460 #define FAIL(msg) _FAIL(                            \
 461     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 462             msg, (int)len, RExC_precomp, ellipses))
 463
 464 #define FAIL2(msg,arg) _FAIL(                       \
 465     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 466             arg, (int)len, RExC_precomp, ellipses))
 467
 468 /*
 469  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 470  */
 471 #define Simple_vFAIL(m) STMT_START {                                    \
 472     const IV offset = RExC_parse - RExC_precomp;                        \
 473     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 474             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 475 } STMT_END
 476
 477 /*
 478  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 479  */
 480 #define vFAIL(m) STMT_START {                           \
 481     if (!SIZE_ONLY)                                     \
 482         SAVEFREESV(RExC_rx_sv);                         \
 483     Simple_vFAIL(m);                                    \
 484 } STMT_END
 485
 486 /*
 487  * Like Simple_vFAIL(), but accepts two arguments.
 488  */
 489 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 490     const IV offset = RExC_parse - RExC_precomp;                        \
 491     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 492             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 493 } STMT_END
 494
 495 /*
 496  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 497  */
 498 #define vFAIL2(m,a1) STMT_START {                       \
 499     if (!SIZE_ONLY)                                     \
 500         SAVEFREESV(RExC_rx_sv);                         \
 501     Simple_vFAIL2(m, a1);                               \
 502 } STMT_END
 503
 504
 505 /*
 506  * Like Simple_vFAIL(), but accepts three arguments.
 507  */
 508 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 509     const IV offset = RExC_parse - RExC_precomp;                \
 510     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 511             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 512 } STMT_END
 513
 514 /*
 515  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 516  */
 517 #define vFAIL3(m,a1,a2) STMT_START {                    \
 518     if (!SIZE_ONLY)                                     \
 519         SAVEFREESV(RExC_rx_sv);                         \
 520     Simple_vFAIL3(m, a1, a2);                           \
 521 } STMT_END
 522
 523 /*
 524  * Like Simple_vFAIL(), but accepts four arguments.
 525  */
 526 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 527     const IV offset = RExC_parse - RExC_precomp;                \
 528     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 529             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 530 } STMT_END
 531
 532 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 533     if (!SIZE_ONLY)                                     \
 534         SAVEFREESV(RExC_rx_sv);                         \
 535     Simple_vFAIL4(m, a1, a2, a3);                       \
 536 } STMT_END
 537
 538 /* m is not necessarily a "literal string", in this macro */
 539 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 540     const IV offset = loc - RExC_precomp;                               \
 541     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 542             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 543 } STMT_END
 544
 545 #define ckWARNreg(loc,m) STMT_START {                                   \
 546     const IV offset = loc - RExC_precomp;                               \
 547     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 548             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 549 } STMT_END
 550
 551 #define vWARN_dep(loc, m) STMT_START {                                  \
 552     const IV offset = loc - RExC_precomp;                               \
 553     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 554             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 555 } STMT_END
 556
 557 #define ckWARNdep(loc,m) STMT_START {                                   \
 558     const IV offset = loc - RExC_precomp;                               \
 559     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 560             m REPORT_LOCATION,                                          \
 561             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 562 } STMT_END
 563
 564 #define ckWARNregdep(loc,m) STMT_START {                                \
 565     const IV offset = loc - RExC_precomp;                               \
 566     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 567             m REPORT_LOCATION,                                          \
 568             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 569 } STMT_END
 570
 571 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 572     const IV offset = loc - RExC_precomp;                               \
 573     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 574             m REPORT_LOCATION,                                          \
 575             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 576 } STMT_END
 577
 578 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 579     const IV offset = loc - RExC_precomp;                               \
 580     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 581             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 582 } STMT_END
 583
 584 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 585     const IV offset = loc - RExC_precomp;                               \
 586     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 587             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 588 } STMT_END
 589
 590 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 591     const IV offset = loc - RExC_precomp;                               \
 592     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 593             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 594 } STMT_END
 595
 596 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 597     const IV offset = loc - RExC_precomp;                               \
 598     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 599             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 600 } STMT_END
 601
 602 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 603     const IV offset = loc - RExC_precomp;                               \
 604     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 605             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 606 } STMT_END
 607
 608 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 609     const IV offset = loc - RExC_precomp;                               \
 610     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 611             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 612 } STMT_END
 613
 614
 615 /* Allow for side effects in s */
 616 #define REGC(c,s) STMT_START {                  \
 617     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 618 } STMT_END
 619
 620 /* Macros for recording node offsets.   20001227 mjd@plover.com
 621  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 622  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 623  * Element 0 holds the number n.
 624  * Position is 1 indexed.
 625  */
 626 #ifndef RE_TRACK_PATTERN_OFFSETS
 627 #define Set_Node_Offset_To_R(node,byte)
 628 #define Set_Node_Offset(node,byte)
 629 #define Set_Cur_Node_Offset
 630 #define Set_Node_Length_To_R(node,len)
 631 #define Set_Node_Length(node,len)
 632 #define Set_Node_Cur_Length(node)
 633 #define Node_Offset(n)
 634 #define Node_Length(n)
 635 #define Set_Node_Offset_Length(node,offset,len)
 636 #define ProgLen(ri) ri->u.proglen
 637 #define SetProgLen(ri,x) ri->u.proglen = x
 638 #else
 639 #define ProgLen(ri) ri->u.offsets[0]
 640 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 641 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 642     if (! SIZE_ONLY) {                                                  \
 643         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 644                     __LINE__, (int)(node), (int)(byte)));               \
 645         if((node) < 0) {                                                \
 646             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 647         } else {                                                        \
 648             RExC_offsets[2*(node)-1] = (byte);                          \
 649         }                                                               \
 650     }                                                                   \
 651 } STMT_END
 652
 653 #define Set_Node_Offset(node,byte) \
 654     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 655 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 656
 657 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 658     if (! SIZE_ONLY) {                                                  \
 659         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 660                 __LINE__, (int)(node), (int)(len)));                    \
 661         if((node) < 0) {                                                \
 662             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 663         } else {                                                        \
 664             RExC_offsets[2*(node)] = (len);                             \
 665         }                                                               \
 666     }                                                                   \
 667 } STMT_END
 668
 669 #define Set_Node_Length(node,len) \
 670     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 671 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 672 #define Set_Node_Cur_Length(node) \
 673     Set_Node_Length(node, RExC_parse - parse_start)
 674
 675 /* Get offsets and lengths */
 676 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 677 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 678
 679 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 680     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 681     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 682 } STMT_END
 683 #endif
 684
 685 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 686 #define EXPERIMENTAL_INPLACESCAN
 687 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 688
 689 #define DEBUG_STUDYDATA(str,data,depth)                              \
 690 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 691     PerlIO_printf(Perl_debug_log,                                    \
 692         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 693         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 694         (int)(depth)*2, "",                                          \
 695         (IV)((data)->pos_min),                                       \
 696         (IV)((data)->pos_delta),                                     \
 697         (UV)((data)->flags),                                         \
 698         (IV)((data)->whilem_c),                                      \
 699         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 700         is_inf ? "INF " : ""                                         \
 701     );                                                               \
 702     if ((data)->last_found)                                          \
 703         PerlIO_printf(Perl_debug_log,                                \
 704             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 705             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 706             SvPVX_const((data)->last_found),                         \
 707             (IV)((data)->last_end),                                  \
 708             (IV)((data)->last_start_min),                            \
 709             (IV)((data)->last_start_max),                            \
 710             ((data)->longest &&                                      \
 711              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 712             SvPVX_const((data)->longest_fixed),                      \
 713             (IV)((data)->offset_fixed),                              \
 714             ((data)->longest &&                                      \
 715              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 716             SvPVX_const((data)->longest_float),                      \
 717             (IV)((data)->offset_float_min),                          \
 718             (IV)((data)->offset_float_max)                           \
 719         );                                                           \
 720     PerlIO_printf(Perl_debug_log,"\n");                              \
 721 });
 722
 723 /* Mark that we cannot extend a found fixed substring at this point.
 724    Update the longest found anchored substring and the longest found
 725    floating substrings if needed. */
 726
 727 STATIC void
 728 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 729 {
 730     const STRLEN l = CHR_SVLEN(data->last_found);
 731     const STRLEN old_l = CHR_SVLEN(*data->longest);
 732     GET_RE_DEBUG_FLAGS_DECL;
 733
 734     PERL_ARGS_ASSERT_SCAN_COMMIT;
 735
 736     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 737         SvSetMagicSV(*data->longest, data->last_found);
 738         if (*data->longest == data->longest_fixed) {
 739             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 740             if (data->flags & SF_BEFORE_EOL)
 741                 data->flags
 742                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 743             else
 744                 data->flags &= ~SF_FIX_BEFORE_EOL;
 745             data->minlen_fixed=minlenp;
 746             data->lookbehind_fixed=0;
 747         }
 748         else { /* *data->longest == data->longest_float */
 749             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 750             data->offset_float_max = (l
 751                                       ? data->last_start_max
 752                                       : data->pos_min + data->pos_delta);
 753             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 754                 data->offset_float_max = I32_MAX;
 755             if (data->flags & SF_BEFORE_EOL)
 756                 data->flags
 757                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 758             else
 759                 data->flags &= ~SF_FL_BEFORE_EOL;
 760             data->minlen_float=minlenp;
 761             data->lookbehind_float=0;
 762         }
 763     }
 764     SvCUR_set(data->last_found, 0);
 765     {
 766         SV * const sv = data->last_found;
 767         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 768             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 769             if (mg)
 770                 mg->mg_len = 0;
 771         }
 772     }
 773     data->last_end = -1;
 774     data->flags &= ~SF_BEFORE_EOL;
 775     DEBUG_STUDYDATA("commit: ",data,0);
 776 }
 777
 778 /* These macros set, clear and test whether the synthetic start class ('ssc',
 779  * given by the parameter) matches an empty string (EOS).  This uses the
 780  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 781  * stands alone, so there is never a next_off, so this field is otherwise
 782  * unused.  The EOS information is used only for compilation, but theoretically
 783  * it could be passed on to the execution code.  This could be used to store
 784  * more than one bit of information, but only this one is currently used. */
 785 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 786 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 787 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 788
 789 /* Can match anything (initialization) */
 790 STATIC void
 791 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 792 {
 793     PERL_ARGS_ASSERT_CL_ANYTHING;
 794
 795     ANYOF_BITMAP_SETALL(cl);
 796     cl->flags = ANYOF_UNICODE_ALL;
 797     SET_SSC_EOS(cl);
 798
 799     /* If any portion of the regex is to operate under locale rules,
 800      * initialization includes it.  The reason this isn't done for all regexes
 801      * is that the optimizer was written under the assumption that locale was
 802      * all-or-nothing.  Given the complexity and lack of documentation in the
 803      * optimizer, and that there are inadequate test cases for locale, so many
 804      * parts of it may not work properly, it is safest to avoid locale unless
 805      * necessary. */
 806     if (RExC_contains_locale) {
 807         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 808         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 809     }
 810     else {
 811         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 812     }
 813 }
 814
 815 /* Can match anything (initialization) */
 816 STATIC int
 817 S_cl_is_anything(const struct regnode_charclass_class *cl)
 818 {
 819     int value;
 820
 821     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 822
 823     for (value = 0; value < ANYOF_MAX; value += 2)
 824         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 825             return 1;
 826     if (!(cl->flags & ANYOF_UNICODE_ALL))
 827         return 0;
 828     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 829         return 0;
 830     return 1;
 831 }
 832
 833 /* Can match anything (initialization) */
 834 STATIC void
 835 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 836 {
 837     PERL_ARGS_ASSERT_CL_INIT;
 838
 839     Zero(cl, 1, struct regnode_charclass_class);
 840     cl->type = ANYOF;
 841     cl_anything(pRExC_state, cl);
 842     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 843 }
 844
 845 /* These two functions currently do the exact same thing */
 846 #define cl_init_zero            S_cl_init
 847
 848 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 849  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 850  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 851 STATIC void
 852 S_cl_and(struct regnode_charclass_class *cl,
 853         const struct regnode_charclass_class *and_with)
 854 {
 855     PERL_ARGS_ASSERT_CL_AND;
 856
 857     assert(PL_regkind[and_with->type] == ANYOF);
 858
 859     /* I (khw) am not sure all these restrictions are necessary XXX */
 860     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 861         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 862         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 863         && !(and_with->flags & ANYOF_LOC_FOLD)
 864         && !(cl->flags & ANYOF_LOC_FOLD)) {
 865         int i;
 866
 867         if (and_with->flags & ANYOF_INVERT)
 868             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 869                 cl->bitmap[i] &= ~and_with->bitmap[i];
 870         else
 871             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 872                 cl->bitmap[i] &= and_with->bitmap[i];
 873     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 874
 875     if (and_with->flags & ANYOF_INVERT) {
 876
 877         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 878          * aren't affected by the inversion.  Those that are affected are
 879          * handled individually below */
 880         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 881         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 882         cl->flags |= affected_flags;
 883
 884         /* We currently don't know how to deal with things that aren't in the
 885          * bitmap, but we know that the intersection is no greater than what
 886          * is already in cl, so let there be false positives that get sorted
 887          * out after the synthetic start class succeeds, and the node is
 888          * matched for real. */
 889
 890         /* The inversion of these two flags indicate that the resulting
 891          * intersection doesn't have them */
 892         if (and_with->flags & ANYOF_UNICODE_ALL) {
 893             cl->flags &= ~ANYOF_UNICODE_ALL;
 894         }
 895         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 896             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 897         }
 898     }
 899     else {   /* and'd node is not inverted */
 900         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 901
 902         if (! ANYOF_NONBITMAP(and_with)) {
 903
 904             /* Here 'and_with' doesn't match anything outside the bitmap
 905              * (except possibly ANYOF_UNICODE_ALL), which means the
 906              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 907              * which case we don't know what the intersection is, but it's no
 908              * greater than what cl already has, so can just leave it alone,
 909              * with possible false positives */
 910             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 911                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 912                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 913             }
 914         }
 915         else if (! ANYOF_NONBITMAP(cl)) {
 916
 917             /* Here, 'and_with' does match something outside the bitmap, and cl
 918              * doesn't have a list of things to match outside the bitmap.  If
 919              * cl can match all code points above 255, the intersection will
 920              * be those above-255 code points that 'and_with' matches.  If cl
 921              * can't match all Unicode code points, it means that it can't
 922              * match anything outside the bitmap (since the 'if' that got us
 923              * into this block tested for that), so we leave the bitmap empty.
 924              */
 925             if (cl->flags & ANYOF_UNICODE_ALL) {
 926                 ARG_SET(cl, ARG(and_with));
 927
 928                 /* and_with's ARG may match things that don't require UTF8.
 929                  * And now cl's will too, in spite of this being an 'and'.  See
 930                  * the comments below about the kludge */
 931                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 932             }
 933         }
 934         else {
 935             /* Here, both 'and_with' and cl match something outside the
 936              * bitmap.  Currently we do not do the intersection, so just match
 937              * whatever cl had at the beginning.  */
 938         }
 939
 940
 941         /* Take the intersection of the two sets of flags.  However, the
 942          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 943          * kludge around the fact that this flag is not treated like the others
 944          * which are initialized in cl_anything().  The way the optimizer works
 945          * is that the synthetic start class (SSC) is initialized to match
 946          * anything, and then the first time a real node is encountered, its
 947          * values are AND'd with the SSC's with the result being the values of
 948          * the real node.  However, there are paths through the optimizer where
 949          * the AND never gets called, so those initialized bits are set
 950          * inappropriately, which is not usually a big deal, as they just cause
 951          * false positives in the SSC, which will just mean a probably
 952          * imperceptible slow down in execution.  However this bit has a
 953          * higher false positive consequence in that it can cause utf8.pm,
 954          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 955          * bigger slowdown and also causes significant extra memory to be used.
 956          * In order to prevent this, the code now takes a different tack.  The
 957          * bit isn't set unless some part of the regular expression needs it,
 958          * but once set it won't get cleared.  This means that these extra
 959          * modules won't get loaded unless there was some path through the
 960          * pattern that would have required them anyway, and  so any false
 961          * positives that occur by not ANDing them out when they could be
 962          * aren't as severe as they would be if we treated this bit like all
 963          * the others */
 964         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 965                                       & ANYOF_NONBITMAP_NON_UTF8;
 966         cl->flags &= and_with->flags;
 967         cl->flags |= outside_bitmap_but_not_utf8;
 968     }
 969 }
 970
 971 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 972  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 973  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 974 STATIC void
 975 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 976 {
 977     PERL_ARGS_ASSERT_CL_OR;
 978
 979     if (or_with->flags & ANYOF_INVERT) {
 980
 981         /* Here, the or'd node is to be inverted.  This means we take the
 982          * complement of everything not in the bitmap, but currently we don't
 983          * know what that is, so give up and match anything */
 984         if (ANYOF_NONBITMAP(or_with)) {
 985             cl_anything(pRExC_state, cl);
 986         }
 987         /* We do not use
 988          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 989          *   <= (B1 | !B2) | (CL1 | !CL2)
 990          * which is wasteful if CL2 is small, but we ignore CL2:
 991          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 992          * XXXX Can we handle case-fold?  Unclear:
 993          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 994          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 995          */
 996         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 997              && !(or_with->flags & ANYOF_LOC_FOLD)
 998              && !(cl->flags & ANYOF_LOC_FOLD) ) {
 999             int i;
1000
1001             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1002                 cl->bitmap[i] |= ~or_with->bitmap[i];
1003         } /* XXXX: logic is complicated otherwise */
1004         else {
1005             cl_anything(pRExC_state, cl);
1006         }
1007
1008         /* And, we can just take the union of the flags that aren't affected
1009          * by the inversion */
1010         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1011
1012         /* For the remaining flags:
1013             ANYOF_UNICODE_ALL and inverted means to not match anything above
1014                     255, which means that the union with cl should just be
1015                     what cl has in it, so can ignore this flag
1016             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1017                     is 127-255 to match them, but then invert that, so the
1018                     union with cl should just be what cl has in it, so can
1019                     ignore this flag
1020          */
1021     } else {    /* 'or_with' is not inverted */
1022         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1023         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1024              && (!(or_with->flags & ANYOF_LOC_FOLD)
1025                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
1026             int i;
1027
1028             /* OR char bitmap and class bitmap separately */
1029             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1030                 cl->bitmap[i] |= or_with->bitmap[i];
1031             ANYOF_CLASS_OR(or_with, cl);
1032         }
1033         else { /* XXXX: logic is complicated, leave it along for a moment. */
1034             cl_anything(pRExC_state, cl);
1035         }
1036
1037         if (ANYOF_NONBITMAP(or_with)) {
1038
1039             /* Use the added node's outside-the-bit-map match if there isn't a
1040              * conflict.  If there is a conflict (both nodes match something
1041              * outside the bitmap, but what they match outside is not the same
1042              * pointer, and hence not easily compared until XXX we extend
1043              * inversion lists this far), give up and allow the start class to
1044              * match everything outside the bitmap.  If that stuff is all above
1045              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1046             if (! ANYOF_NONBITMAP(cl)) {
1047                 ARG_SET(cl, ARG(or_with));
1048             }
1049             else if (ARG(cl) != ARG(or_with)) {
1050
1051                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1052                     cl_anything(pRExC_state, cl);
1053                 }
1054                 else {
1055                     cl->flags |= ANYOF_UNICODE_ALL;
1056                 }
1057             }
1058         }
1059
1060         /* Take the union */
1061         cl->flags |= or_with->flags;
1062     }
1063 }
1064
1065 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1066 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1067 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1068 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1069
1070
1071 #ifdef DEBUGGING
1072 /*
1073    dump_trie(trie,widecharmap,revcharmap)
1074    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1075    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1076
1077    These routines dump out a trie in a somewhat readable format.
1078    The _interim_ variants are used for debugging the interim
1079    tables that are used to generate the final compressed
1080    representation which is what dump_trie expects.
1081
1082    Part of the reason for their existence is to provide a form
1083    of documentation as to how the different representations function.
1084
1085 */
1086
1087 /*
1088   Dumps the final compressed table form of the trie to Perl_debug_log.
1089   Used for debugging make_trie().
1090 */
1091
1092 STATIC void
1093 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1094             AV *revcharmap, U32 depth)
1095 {
1096     U32 state;
1097     SV *sv=sv_newmortal();
1098     int colwidth= widecharmap ? 6 : 4;
1099     U16 word;
1100     GET_RE_DEBUG_FLAGS_DECL;
1101
1102     PERL_ARGS_ASSERT_DUMP_TRIE;
1103
1104     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1105         (int)depth * 2 + 2,"",
1106         "Match","Base","Ofs" );
1107
1108     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1109         SV ** const tmp = av_fetch( revcharmap, state, 0);
1110         if ( tmp ) {
1111             PerlIO_printf( Perl_debug_log, "%*s",
1112                 colwidth,
1113                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1114                             PL_colors[0], PL_colors[1],
1115                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1116                             PERL_PV_ESCAPE_FIRSTCHAR
1117                 )
1118             );
1119         }
1120     }
1121     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1122         (int)depth * 2 + 2,"");
1123
1124     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1125         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1126     PerlIO_printf( Perl_debug_log, "\n");
1127
1128     for( state = 1 ; state < trie->statecount ; state++ ) {
1129         const U32 base = trie->states[ state ].trans.base;
1130
1131         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1132
1133         if ( trie->states[ state ].wordnum ) {
1134             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1135         } else {
1136             PerlIO_printf( Perl_debug_log, "%6s", "" );
1137         }
1138
1139         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1140
1141         if ( base ) {
1142             U32 ofs = 0;
1143
1144             while( ( base + ofs  < trie->uniquecharcount ) ||
1145                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1146                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1147                     ofs++;
1148
1149             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1150
1151             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1152                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1153                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1154                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1155                 {
1156                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1157                     colwidth,
1158                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1159                 } else {
1160                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1161                 }
1162             }
1163
1164             PerlIO_printf( Perl_debug_log, "]");
1165
1166         }
1167         PerlIO_printf( Perl_debug_log, "\n" );
1168     }
1169     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1170     for (word=1; word <= trie->wordcount; word++) {
1171         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1172             (int)word, (int)(trie->wordinfo[word].prev),
1173             (int)(trie->wordinfo[word].len));
1174     }
1175     PerlIO_printf(Perl_debug_log, "\n" );
1176 }
1177 /*
1178   Dumps a fully constructed but uncompressed trie in list form.
1179   List tries normally only are used for construction when the number of
1180   possible chars (trie->uniquecharcount) is very high.
1181   Used for debugging make_trie().
1182 */
1183 STATIC void
1184 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1185                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1186                          U32 depth)
1187 {
1188     U32 state;
1189     SV *sv=sv_newmortal();
1190     int colwidth= widecharmap ? 6 : 4;
1191     GET_RE_DEBUG_FLAGS_DECL;
1192
1193     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1194
1195     /* print out the table precompression.  */
1196     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1197         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1198         "------:-----+-----------------\n" );
1199
1200     for( state=1 ; state < next_alloc ; state ++ ) {
1201         U16 charid;
1202
1203         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1204             (int)depth * 2 + 2,"", (UV)state  );
1205         if ( ! trie->states[ state ].wordnum ) {
1206             PerlIO_printf( Perl_debug_log, "%5s| ","");
1207         } else {
1208             PerlIO_printf( Perl_debug_log, "W%4x| ",
1209                 trie->states[ state ].wordnum
1210             );
1211         }
1212         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1213             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1214             if ( tmp ) {
1215                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1216                     colwidth,
1217                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1218                             PL_colors[0], PL_colors[1],
1219                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1220                             PERL_PV_ESCAPE_FIRSTCHAR
1221                     ) ,
1222                     TRIE_LIST_ITEM(state,charid).forid,
1223                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1224                 );
1225                 if (!(charid % 10))
1226                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1227                         (int)((depth * 2) + 14), "");
1228             }
1229         }
1230         PerlIO_printf( Perl_debug_log, "\n");
1231     }
1232 }
1233
1234 /*
1235   Dumps a fully constructed but uncompressed trie in table form.
1236   This is the normal DFA style state transition table, with a few
1237   twists to facilitate compression later.
1238   Used for debugging make_trie().
1239 */
1240 STATIC void
1241 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1242                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1243                           U32 depth)
1244 {
1245     U32 state;
1246     U16 charid;
1247     SV *sv=sv_newmortal();
1248     int colwidth= widecharmap ? 6 : 4;
1249     GET_RE_DEBUG_FLAGS_DECL;
1250
1251     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1252
1253     /*
1254        print out the table precompression so that we can do a visual check
1255        that they are identical.
1256      */
1257
1258     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1259
1260     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1261         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1262         if ( tmp ) {
1263             PerlIO_printf( Perl_debug_log, "%*s",
1264                 colwidth,
1265                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1266                             PL_colors[0], PL_colors[1],
1267                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1268                             PERL_PV_ESCAPE_FIRSTCHAR
1269                 )
1270             );
1271         }
1272     }
1273
1274     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1275
1276     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1277         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1278     }
1279
1280     PerlIO_printf( Perl_debug_log, "\n" );
1281
1282     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1283
1284         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1285             (int)depth * 2 + 2,"",
1286             (UV)TRIE_NODENUM( state ) );
1287
1288         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1289             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1290             if (v)
1291                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1292             else
1293                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1294         }
1295         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1296             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1297         } else {
1298             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1299             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1300         }
1301     }
1302 }
1303
1304 #endif
1305
1306
1307 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1308   startbranch: the first branch in the whole branch sequence
1309   first      : start branch of sequence of branch-exact nodes.
1310                May be the same as startbranch
1311   last       : Thing following the last branch.
1312                May be the same as tail.
1313   tail       : item following the branch sequence
1314   count      : words in the sequence
1315   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1316   depth      : indent depth
1317
1318 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1319
1320 A trie is an N'ary tree where the branches are determined by digital
1321 decomposition of the key. IE, at the root node you look up the 1st character and
1322 follow that branch repeat until you find the end of the branches. Nodes can be
1323 marked as "accepting" meaning they represent a complete word. Eg:
1324
1325   /he|she|his|hers/
1326
1327 would convert into the following structure. Numbers represent states, letters
1328 following numbers represent valid transitions on the letter from that state, if
1329 the number is in square brackets it represents an accepting state, otherwise it
1330 will be in parenthesis.
1331
1332       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1333       |    |
1334       |   (2)
1335       |    |
1336      (1)   +-i->(6)-+-s->[7]
1337       |
1338       +-s->(3)-+-h->(4)-+-e->[5]
1339
1340       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1341
1342 This shows that when matching against the string 'hers' we will begin at state 1
1343 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1344 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1345 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1346 single traverse. We store a mapping from accepting to state to which word was
1347 matched, and then when we have multiple possibilities we try to complete the
1348 rest of the regex in the order in which they occured in the alternation.
1349
1350 The only prior NFA like behaviour that would be changed by the TRIE support is
1351 the silent ignoring of duplicate alternations which are of the form:
1352
1353  / (DUPE|DUPE) X? (?{ ... }) Y /x
1354
1355 Thus EVAL blocks following a trie may be called a different number of times with
1356 and without the optimisation. With the optimisations dupes will be silently
1357 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1358 the following demonstrates:
1359
1360  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1361
1362 which prints out 'word' three times, but
1363
1364  'words'=~/(word|word|word)(?{ print $1 })S/
1365
1366 which doesnt print it out at all. This is due to other optimisations kicking in.
1367
1368 Example of what happens on a structural level:
1369
1370 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1371
1372    1: CURLYM[1] {1,32767}(18)
1373    5:   BRANCH(8)
1374    6:     EXACT <ac>(16)
1375    8:   BRANCH(11)
1376    9:     EXACT <ad>(16)
1377   11:   BRANCH(14)
1378   12:     EXACT <ab>(16)
1379   16:   SUCCEED(0)
1380   17:   NOTHING(18)
1381   18: END(0)
1382
1383 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1384 and should turn into:
1385
1386    1: CURLYM[1] {1,32767}(18)
1387    5:   TRIE(16)
1388         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1389           <ac>
1390           <ad>
1391           <ab>
1392   16:   SUCCEED(0)
1393   17:   NOTHING(18)
1394   18: END(0)
1395
1396 Cases where tail != last would be like /(?foo|bar)baz/:
1397
1398    1: BRANCH(4)
1399    2:   EXACT <foo>(8)
1400    4: BRANCH(7)
1401    5:   EXACT <bar>(8)
1402    7: TAIL(8)
1403    8: EXACT <baz>(10)
1404   10: END(0)
1405
1406 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1407 and would end up looking like:
1408
1409     1: TRIE(8)
1410       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1411         <foo>
1412         <bar>
1413    7: TAIL(8)
1414    8: EXACT <baz>(10)
1415   10: END(0)
1416
1417     d = uvuni_to_utf8_flags(d, uv, 0);
1418
1419 is the recommended Unicode-aware way of saying
1420
1421     *(d++) = uv;
1422 */
1423
1424 #define TRIE_STORE_REVCHAR(val)                                            \
1425     STMT_START {                                                           \
1426         if (UTF) {                                                         \
1427             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1428             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1429             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1430             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1431             SvPOK_on(zlopp);                                               \
1432             SvUTF8_on(zlopp);                                              \
1433             av_push(revcharmap, zlopp);                                    \
1434         } else {                                                           \
1435             char ooooff = (char)val;                                           \
1436             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1437         }                                                                  \
1438         } STMT_END
1439
1440 #define TRIE_READ_CHAR STMT_START {                                                     \
1441     wordlen++;                                                                          \
1442     if ( UTF ) {                                                                        \
1443         /* if it is UTF then it is either already folded, or does not need folding */   \
1444         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1445     }                                                                                   \
1446     else if (folder == PL_fold_latin1) {                                                \
1447         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1448         if ( foldlen > 0 ) {                                                            \
1449            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1450            foldlen -= len;                                                              \
1451            scan += len;                                                                 \
1452            len = 0;                                                                     \
1453         } else {                                                                        \
1454             len = 1;                                                                    \
1455             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1456             skiplen = UNISKIP(uvc);                                                     \
1457             foldlen -= skiplen;                                                         \
1458             scan = foldbuf + skiplen;                                                   \
1459         }                                                                               \
1460     } else {                                                                            \
1461         /* raw data, will be folded later if needed */                                  \
1462         uvc = (U32)*uc;                                                                 \
1463         len = 1;                                                                        \
1464     }                                                                                   \
1465 } STMT_END
1466
1467
1468
1469 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1470     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1471         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1472         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1473     }                                                           \
1474     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1475     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1476     TRIE_LIST_CUR( state )++;                                   \
1477 } STMT_END
1478
1479 #define TRIE_LIST_NEW(state) STMT_START {                       \
1480     Newxz( trie->states[ state ].trans.list,               \
1481         4, reg_trie_trans_le );                                 \
1482      TRIE_LIST_CUR( state ) = 1;                                \
1483      TRIE_LIST_LEN( state ) = 4;                                \
1484 } STMT_END
1485
1486 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1487     U16 dupe= trie->states[ state ].wordnum;                    \
1488     regnode * const noper_next = regnext( noper );              \
1489                                                                 \
1490     DEBUG_r({                                                   \
1491         /* store the word for dumping */                        \
1492         SV* tmp;                                                \
1493         if (OP(noper) != NOTHING)                               \
1494             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1495         else                                                    \
1496             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1497         av_push( trie_words, tmp );                             \
1498     });                                                         \
1499                                                                 \
1500     curword++;                                                  \
1501     trie->wordinfo[curword].prev   = 0;                         \
1502     trie->wordinfo[curword].len    = wordlen;                   \
1503     trie->wordinfo[curword].accept = state;                     \
1504                                                                 \
1505     if ( noper_next < tail ) {                                  \
1506         if (!trie->jump)                                        \
1507             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1508         trie->jump[curword] = (U16)(noper_next - convert);      \
1509         if (!jumper)                                            \
1510             jumper = noper_next;                                \
1511         if (!nextbranch)                                        \
1512             nextbranch= regnext(cur);                           \
1513     }                                                           \
1514                                                                 \
1515     if ( dupe ) {                                               \
1516         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1517         /* chain, so that when the bits of chain are later    */\
1518         /* linked together, the dups appear in the chain      */\
1519         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1520         trie->wordinfo[dupe].prev = curword;                    \
1521     } else {                                                    \
1522         /* we haven't inserted this word yet.                */ \
1523         trie->states[ state ].wordnum = curword;                \
1524     }                                                           \
1525 } STMT_END
1526
1527
1528 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1529      ( ( base + charid >=  ucharcount                                   \
1530          && base + charid < ubound                                      \
1531          && state == trie->trans[ base - ucharcount + charid ].check    \
1532          && trie->trans[ base - ucharcount + charid ].next )            \
1533            ? trie->trans[ base - ucharcount + charid ].next             \
1534            : ( state==1 ? special : 0 )                                 \
1535       )
1536
1537 #define MADE_TRIE       1
1538 #define MADE_JUMP_TRIE  2
1539 #define MADE_EXACT_TRIE 4
1540
1541 STATIC I32
1542 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1543 {
1544     dVAR;
1545     /* first pass, loop through and scan words */
1546     reg_trie_data *trie;
1547     HV *widecharmap = NULL;
1548     AV *revcharmap = newAV();
1549     regnode *cur;
1550     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1551     STRLEN len = 0;
1552     UV uvc = 0;
1553     U16 curword = 0;
1554     U32 next_alloc = 0;
1555     regnode *jumper = NULL;
1556     regnode *nextbranch = NULL;
1557     regnode *convert = NULL;
1558     U32 *prev_states; /* temp array mapping each state to previous one */
1559     /* we just use folder as a flag in utf8 */
1560     const U8 * folder = NULL;
1561
1562 #ifdef DEBUGGING
1563     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1564     AV *trie_words = NULL;
1565     /* along with revcharmap, this only used during construction but both are
1566      * useful during debugging so we store them in the struct when debugging.
1567      */
1568 #else
1569     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1570     STRLEN trie_charcount=0;
1571 #endif
1572     SV *re_trie_maxbuff;
1573     GET_RE_DEBUG_FLAGS_DECL;
1574
1575     PERL_ARGS_ASSERT_MAKE_TRIE;
1576 #ifndef DEBUGGING
1577     PERL_UNUSED_ARG(depth);
1578 #endif
1579
1580     switch (flags) {
1581         case EXACT: break;
1582         case EXACTFA:
1583         case EXACTFU_SS:
1584         case EXACTFU_TRICKYFOLD:
1585         case EXACTFU: folder = PL_fold_latin1; break;
1586         case EXACTF:  folder = PL_fold; break;
1587         case EXACTFL: folder = PL_fold_locale; break;
1588         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1589     }
1590
1591     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1592     trie->refcount = 1;
1593     trie->startstate = 1;
1594     trie->wordcount = word_count;
1595     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1596     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1597     if (flags == EXACT)
1598         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1599     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1600                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1601
1602     DEBUG_r({
1603         trie_words = newAV();
1604     });
1605
1606     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1607     if (!SvIOK(re_trie_maxbuff)) {
1608         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1609     }
1610     DEBUG_TRIE_COMPILE_r({
1611                 PerlIO_printf( Perl_debug_log,
1612                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1613                   (int)depth * 2 + 2, "",
1614                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1615                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1616                   (int)depth);
1617     });
1618
1619    /* Find the node we are going to overwrite */
1620     if ( first == startbranch && OP( last ) != BRANCH ) {
1621         /* whole branch chain */
1622         convert = first;
1623     } else {
1624         /* branch sub-chain */
1625         convert = NEXTOPER( first );
1626     }
1627
1628     /*  -- First loop and Setup --
1629
1630        We first traverse the branches and scan each word to determine if it
1631        contains widechars, and how many unique chars there are, this is
1632        important as we have to build a table with at least as many columns as we
1633        have unique chars.
1634
1635        We use an array of integers to represent the character codes 0..255
1636        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1637        native representation of the character value as the key and IV's for the
1638        coded index.
1639
1640        *TODO* If we keep track of how many times each character is used we can
1641        remap the columns so that the table compression later on is more
1642        efficient in terms of memory by ensuring the most common value is in the
1643        middle and the least common are on the outside.  IMO this would be better
1644        than a most to least common mapping as theres a decent chance the most
1645        common letter will share a node with the least common, meaning the node
1646        will not be compressible. With a middle is most common approach the worst
1647        case is when we have the least common nodes twice.
1648
1649      */
1650
1651     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1652         regnode *noper = NEXTOPER( cur );
1653         const U8 *uc = (U8*)STRING( noper );
1654         const U8 *e  = uc + STR_LEN( noper );
1655         STRLEN foldlen = 0;
1656         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1657         STRLEN skiplen = 0;
1658         const U8 *scan = (U8*)NULL;
1659         U32 wordlen      = 0;         /* required init */
1660         STRLEN chars = 0;
1661         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1662
1663         if (OP(noper) == NOTHING) {
1664             regnode *noper_next= regnext(noper);
1665             if (noper_next != tail && OP(noper_next) == flags) {
1666                 noper = noper_next;
1667                 uc= (U8*)STRING(noper);
1668                 e= uc + STR_LEN(noper);
1669                 trie->minlen= STR_LEN(noper);
1670             } else {
1671                 trie->minlen= 0;
1672                 continue;
1673             }
1674         }
1675
1676         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1677             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1678                                           regardless of encoding */
1679             if (OP( noper ) == EXACTFU_SS) {
1680                 /* false positives are ok, so just set this */
1681                 TRIE_BITMAP_SET(trie,0xDF);
1682             }
1683         }
1684         for ( ; uc < e ; uc += len ) {
1685             TRIE_CHARCOUNT(trie)++;
1686             TRIE_READ_CHAR;
1687             chars++;
1688             if ( uvc < 256 ) {
1689                 if ( folder ) {
1690                     U8 folded= folder[ (U8) uvc ];
1691                     if ( !trie->charmap[ folded ] ) {
1692                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1693                         TRIE_STORE_REVCHAR( folded );
1694                     }
1695                 }
1696                 if ( !trie->charmap[ uvc ] ) {
1697                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1698                     TRIE_STORE_REVCHAR( uvc );
1699                 }
1700                 if ( set_bit ) {
1701                     /* store the codepoint in the bitmap, and its folded
1702                      * equivalent. */
1703                     TRIE_BITMAP_SET(trie, uvc);
1704
1705                     /* store the folded codepoint */
1706                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1707
1708                     if ( !UTF ) {
1709                         /* store first byte of utf8 representation of
1710                            variant codepoints */
1711                         if (! UNI_IS_INVARIANT(uvc)) {
1712                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1713                         }
1714                     }
1715                     set_bit = 0; /* We've done our bit :-) */
1716                 }
1717             } else {
1718                 SV** svpp;
1719                 if ( !widecharmap )
1720                     widecharmap = newHV();
1721
1722                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1723
1724                 if ( !svpp )
1725                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1726
1727                 if ( !SvTRUE( *svpp ) ) {
1728                     sv_setiv( *svpp, ++trie->uniquecharcount );
1729                     TRIE_STORE_REVCHAR(uvc);
1730                 }
1731             }
1732         }
1733         if( cur == first ) {
1734             trie->minlen = chars;
1735             trie->maxlen = chars;
1736         } else if (chars < trie->minlen) {
1737             trie->minlen = chars;
1738         } else if (chars > trie->maxlen) {
1739             trie->maxlen = chars;
1740         }
1741         if (OP( noper ) == EXACTFU_SS) {
1742             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1743             if (trie->minlen > 1)
1744                 trie->minlen= 1;
1745         }
1746         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1747             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1748              *                - We assume that any such sequence might match a 2 byte string */
1749             if (trie->minlen > 2 )
1750                 trie->minlen= 2;
1751         }
1752
1753     } /* end first pass */
1754     DEBUG_TRIE_COMPILE_r(
1755         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1756                 (int)depth * 2 + 2,"",
1757                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1758                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1759                 (int)trie->minlen, (int)trie->maxlen )
1760     );
1761
1762     /*
1763         We now know what we are dealing with in terms of unique chars and
1764         string sizes so we can calculate how much memory a naive
1765         representation using a flat table  will take. If it's over a reasonable
1766         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1767         conservative but potentially much slower representation using an array
1768         of lists.
1769
1770         At the end we convert both representations into the same compressed
1771         form that will be used in regexec.c for matching with. The latter
1772         is a form that cannot be used to construct with but has memory
1773         properties similar to the list form and access properties similar
1774         to the table form making it both suitable for fast searches and
1775         small enough that its feasable to store for the duration of a program.
1776
1777         See the comment in the code where the compressed table is produced
1778         inplace from the flat tabe representation for an explanation of how
1779         the compression works.
1780
1781     */
1782
1783
1784     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1785     prev_states[1] = 0;
1786
1787     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1788         /*
1789             Second Pass -- Array Of Lists Representation
1790
1791             Each state will be represented by a list of charid:state records
1792             (reg_trie_trans_le) the first such element holds the CUR and LEN
1793             points of the allocated array. (See defines above).
1794
1795             We build the initial structure using the lists, and then convert
1796             it into the compressed table form which allows faster lookups
1797             (but cant be modified once converted).
1798         */
1799
1800         STRLEN transcount = 1;
1801
1802         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1803             "%*sCompiling trie using list compiler\n",
1804             (int)depth * 2 + 2, ""));
1805
1806         trie->states = (reg_trie_state *)
1807             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1808                                   sizeof(reg_trie_state) );
1809         TRIE_LIST_NEW(1);
1810         next_alloc = 2;
1811
1812         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1813
1814             regnode *noper   = NEXTOPER( cur );
1815             U8 *uc           = (U8*)STRING( noper );
1816             const U8 *e      = uc + STR_LEN( noper );
1817             U32 state        = 1;         /* required init */
1818             U16 charid       = 0;         /* sanity init */
1819             U8 *scan         = (U8*)NULL; /* sanity init */
1820             STRLEN foldlen   = 0;         /* required init */
1821             U32 wordlen      = 0;         /* required init */
1822             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1823             STRLEN skiplen   = 0;
1824
1825             if (OP(noper) == NOTHING) {
1826                 regnode *noper_next= regnext(noper);
1827                 if (noper_next != tail && OP(noper_next) == flags) {
1828                     noper = noper_next;
1829                     uc= (U8*)STRING(noper);
1830                     e= uc + STR_LEN(noper);
1831                 }
1832             }
1833
1834             if (OP(noper) != NOTHING) {
1835                 for ( ; uc < e ; uc += len ) {
1836
1837                     TRIE_READ_CHAR;
1838
1839                     if ( uvc < 256 ) {
1840                         charid = trie->charmap[ uvc ];
1841                     } else {
1842                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1843                         if ( !svpp ) {
1844                             charid = 0;
1845                         } else {
1846                             charid=(U16)SvIV( *svpp );
1847                         }
1848                     }
1849                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1850                     if ( charid ) {
1851
1852                         U16 check;
1853                         U32 newstate = 0;
1854
1855                         charid--;
1856                         if ( !trie->states[ state ].trans.list ) {
1857                             TRIE_LIST_NEW( state );
1858                         }
1859                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1860                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1861                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1862                                 break;
1863                             }
1864                         }
1865                         if ( ! newstate ) {
1866                             newstate = next_alloc++;
1867                             prev_states[newstate] = state;
1868                             TRIE_LIST_PUSH( state, charid, newstate );
1869                             transcount++;
1870                         }
1871                         state = newstate;
1872                     } else {
1873                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1874                     }
1875                 }
1876             }
1877             TRIE_HANDLE_WORD(state);
1878
1879         } /* end second pass */
1880
1881         /* next alloc is the NEXT state to be allocated */
1882         trie->statecount = next_alloc;
1883         trie->states = (reg_trie_state *)
1884             PerlMemShared_realloc( trie->states,
1885                                    next_alloc
1886                                    * sizeof(reg_trie_state) );
1887
1888         /* and now dump it out before we compress it */
1889         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1890                                                          revcharmap, next_alloc,
1891                                                          depth+1)
1892         );
1893
1894         trie->trans = (reg_trie_trans *)
1895             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1896         {
1897             U32 state;
1898             U32 tp = 0;
1899             U32 zp = 0;
1900
1901
1902             for( state=1 ; state < next_alloc ; state ++ ) {
1903                 U32 base=0;
1904
1905                 /*
1906                 DEBUG_TRIE_COMPILE_MORE_r(
1907                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1908                 );
1909                 */
1910
1911                 if (trie->states[state].trans.list) {
1912                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1913                     U16 maxid=minid;
1914                     U16 idx;
1915
1916                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1917                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1918                         if ( forid < minid ) {
1919                             minid=forid;
1920                         } else if ( forid > maxid ) {
1921                             maxid=forid;
1922                         }
1923                     }
1924                     if ( transcount < tp + maxid - minid + 1) {
1925                         transcount *= 2;
1926                         trie->trans = (reg_trie_trans *)
1927                             PerlMemShared_realloc( trie->trans,
1928                                                      transcount
1929                                                      * sizeof(reg_trie_trans) );
1930                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1931                     }
1932                     base = trie->uniquecharcount + tp - minid;
1933                     if ( maxid == minid ) {
1934                         U32 set = 0;
1935                         for ( ; zp < tp ; zp++ ) {
1936                             if ( ! trie->trans[ zp ].next ) {
1937                                 base = trie->uniquecharcount + zp - minid;
1938                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1939                                 trie->trans[ zp ].check = state;
1940                                 set = 1;
1941                                 break;
1942                             }
1943                         }
1944                         if ( !set ) {
1945                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1946                             trie->trans[ tp ].check = state;
1947                             tp++;
1948                             zp = tp;
1949                         }
1950                     } else {
1951                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1952                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1953                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1954                             trie->trans[ tid ].check = state;
1955                         }
1956                         tp += ( maxid - minid + 1 );
1957                     }
1958                     Safefree(trie->states[ state ].trans.list);
1959                 }
1960                 /*
1961                 DEBUG_TRIE_COMPILE_MORE_r(
1962                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1963                 );
1964                 */
1965                 trie->states[ state ].trans.base=base;
1966             }
1967             trie->lasttrans = tp + 1;
1968         }
1969     } else {
1970         /*
1971            Second Pass -- Flat Table Representation.
1972
1973            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1974            We know that we will need Charcount+1 trans at most to store the data
1975            (one row per char at worst case) So we preallocate both structures
1976            assuming worst case.
1977
1978            We then construct the trie using only the .next slots of the entry
1979            structs.
1980
1981            We use the .check field of the first entry of the node temporarily to
1982            make compression both faster and easier by keeping track of how many non
1983            zero fields are in the node.
1984
1985            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1986            transition.
1987
1988            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1989            number representing the first entry of the node, and state as a
1990            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1991            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1992            are 2 entrys per node. eg:
1993
1994              A B       A B
1995           1. 2 4    1. 3 7
1996           2. 0 3    3. 0 5
1997           3. 0 0    5. 0 0
1998           4. 0 0    7. 0 0
1999
2000            The table is internally in the right hand, idx form. However as we also
2001            have to deal with the states array which is indexed by nodenum we have to
2002            use TRIE_NODENUM() to convert.
2003
2004         */
2005         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2006             "%*sCompiling trie using table compiler\n",
2007             (int)depth * 2 + 2, ""));
2008
2009         trie->trans = (reg_trie_trans *)
2010             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2011                                   * trie->uniquecharcount + 1,
2012                                   sizeof(reg_trie_trans) );
2013         trie->states = (reg_trie_state *)
2014             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2015                                   sizeof(reg_trie_state) );
2016         next_alloc = trie->uniquecharcount + 1;
2017
2018
2019         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2020
2021             regnode *noper   = NEXTOPER( cur );
2022             const U8 *uc     = (U8*)STRING( noper );
2023             const U8 *e      = uc + STR_LEN( noper );
2024
2025             U32 state        = 1;         /* required init */
2026
2027             U16 charid       = 0;         /* sanity init */
2028             U32 accept_state = 0;         /* sanity init */
2029             U8 *scan         = (U8*)NULL; /* sanity init */
2030
2031             STRLEN foldlen   = 0;         /* required init */
2032             U32 wordlen      = 0;         /* required init */
2033             STRLEN skiplen   = 0;
2034             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2035
2036             if (OP(noper) == NOTHING) {
2037                 regnode *noper_next= regnext(noper);
2038                 if (noper_next != tail && OP(noper_next) == flags) {
2039                     noper = noper_next;
2040                     uc= (U8*)STRING(noper);
2041                     e= uc + STR_LEN(noper);
2042                 }
2043             }
2044
2045             if ( OP(noper) != NOTHING ) {
2046                 for ( ; uc < e ; uc += len ) {
2047
2048                     TRIE_READ_CHAR;
2049
2050                     if ( uvc < 256 ) {
2051                         charid = trie->charmap[ uvc ];
2052                     } else {
2053                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2054                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2055                     }
2056                     if ( charid ) {
2057                         charid--;
2058                         if ( !trie->trans[ state + charid ].next ) {
2059                             trie->trans[ state + charid ].next = next_alloc;
2060                             trie->trans[ state ].check++;
2061                             prev_states[TRIE_NODENUM(next_alloc)]
2062                                     = TRIE_NODENUM(state);
2063                             next_alloc += trie->uniquecharcount;
2064                         }
2065                         state = trie->trans[ state + charid ].next;
2066                     } else {
2067                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2068                     }
2069                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2070                 }
2071             }
2072             accept_state = TRIE_NODENUM( state );
2073             TRIE_HANDLE_WORD(accept_state);
2074
2075         } /* end second pass */
2076
2077         /* and now dump it out before we compress it */
2078         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2079                                                           revcharmap,
2080                                                           next_alloc, depth+1));
2081
2082         {
2083         /*
2084            * Inplace compress the table.*
2085
2086            For sparse data sets the table constructed by the trie algorithm will
2087            be mostly 0/FAIL transitions or to put it another way mostly empty.
2088            (Note that leaf nodes will not contain any transitions.)
2089
2090            This algorithm compresses the tables by eliminating most such
2091            transitions, at the cost of a modest bit of extra work during lookup:
2092
2093            - Each states[] entry contains a .base field which indicates the
2094            index in the state[] array wheres its transition data is stored.
2095
2096            - If .base is 0 there are no valid transitions from that node.
2097
2098            - If .base is nonzero then charid is added to it to find an entry in
2099            the trans array.
2100
2101            -If trans[states[state].base+charid].check!=state then the
2102            transition is taken to be a 0/Fail transition. Thus if there are fail
2103            transitions at the front of the node then the .base offset will point
2104            somewhere inside the previous nodes data (or maybe even into a node
2105            even earlier), but the .check field determines if the transition is
2106            valid.
2107
2108            XXX - wrong maybe?
2109            The following process inplace converts the table to the compressed
2110            table: We first do not compress the root node 1,and mark all its
2111            .check pointers as 1 and set its .base pointer as 1 as well. This
2112            allows us to do a DFA construction from the compressed table later,
2113            and ensures that any .base pointers we calculate later are greater
2114            than 0.
2115
2116            - We set 'pos' to indicate the first entry of the second node.
2117
2118            - We then iterate over the columns of the node, finding the first and
2119            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2120            and set the .check pointers accordingly, and advance pos
2121            appropriately and repreat for the next node. Note that when we copy
2122            the next pointers we have to convert them from the original
2123            NODEIDX form to NODENUM form as the former is not valid post
2124            compression.
2125
2126            - If a node has no transitions used we mark its base as 0 and do not
2127            advance the pos pointer.
2128
2129            - If a node only has one transition we use a second pointer into the
2130            structure to fill in allocated fail transitions from other states.
2131            This pointer is independent of the main pointer and scans forward
2132            looking for null transitions that are allocated to a state. When it
2133            finds one it writes the single transition into the "hole".  If the
2134            pointer doesnt find one the single transition is appended as normal.
2135
2136            - Once compressed we can Renew/realloc the structures to release the
2137            excess space.
2138
2139            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2140            specifically Fig 3.47 and the associated pseudocode.
2141
2142            demq
2143         */
2144         const U32 laststate = TRIE_NODENUM( next_alloc );
2145         U32 state, charid;
2146         U32 pos = 0, zp=0;
2147         trie->statecount = laststate;
2148
2149         for ( state = 1 ; state < laststate ; state++ ) {
2150             U8 flag = 0;
2151             const U32 stateidx = TRIE_NODEIDX( state );
2152             const U32 o_used = trie->trans[ stateidx ].check;
2153             U32 used = trie->trans[ stateidx ].check;
2154             trie->trans[ stateidx ].check = 0;
2155
2156             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2157                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2158                     if ( trie->trans[ stateidx + charid ].next ) {
2159                         if (o_used == 1) {
2160                             for ( ; zp < pos ; zp++ ) {
2161                                 if ( ! trie->trans[ zp ].next ) {
2162                                     break;
2163                                 }
2164                             }
2165                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2166                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2167                             trie->trans[ zp ].check = state;
2168                             if ( ++zp > pos ) pos = zp;
2169                             break;
2170                         }
2171                         used--;
2172                     }
2173                     if ( !flag ) {
2174                         flag = 1;
2175                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2176                     }
2177                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2178                     trie->trans[ pos ].check = state;
2179                     pos++;
2180                 }
2181             }
2182         }
2183         trie->lasttrans = pos + 1;
2184         trie->states = (reg_trie_state *)
2185             PerlMemShared_realloc( trie->states, laststate
2186                                    * sizeof(reg_trie_state) );
2187         DEBUG_TRIE_COMPILE_MORE_r(
2188                 PerlIO_printf( Perl_debug_log,
2189                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2190                     (int)depth * 2 + 2,"",
2191                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2192                     (IV)next_alloc,
2193                     (IV)pos,
2194                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2195             );
2196
2197         } /* end table compress */
2198     }
2199     DEBUG_TRIE_COMPILE_MORE_r(
2200             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2201                 (int)depth * 2 + 2, "",
2202                 (UV)trie->statecount,
2203                 (UV)trie->lasttrans)
2204     );
2205     /* resize the trans array to remove unused space */
2206     trie->trans = (reg_trie_trans *)
2207         PerlMemShared_realloc( trie->trans, trie->lasttrans
2208                                * sizeof(reg_trie_trans) );
2209
2210     {   /* Modify the program and insert the new TRIE node */
2211         U8 nodetype =(U8)(flags & 0xFF);
2212         char *str=NULL;
2213
2214 #ifdef DEBUGGING
2215         regnode *optimize = NULL;
2216 #ifdef RE_TRACK_PATTERN_OFFSETS
2217
2218         U32 mjd_offset = 0;
2219         U32 mjd_nodelen = 0;
2220 #endif /* RE_TRACK_PATTERN_OFFSETS */
2221 #endif /* DEBUGGING */
2222         /*
2223            This means we convert either the first branch or the first Exact,
2224            depending on whether the thing following (in 'last') is a branch
2225            or not and whther first is the startbranch (ie is it a sub part of
2226            the alternation or is it the whole thing.)
2227            Assuming its a sub part we convert the EXACT otherwise we convert
2228            the whole branch sequence, including the first.
2229          */
2230         /* Find the node we are going to overwrite */
2231         if ( first != startbranch || OP( last ) == BRANCH ) {
2232             /* branch sub-chain */
2233             NEXT_OFF( first ) = (U16)(last - first);
2234 #ifdef RE_TRACK_PATTERN_OFFSETS
2235             DEBUG_r({
2236                 mjd_offset= Node_Offset((convert));
2237                 mjd_nodelen= Node_Length((convert));
2238             });
2239 #endif
2240             /* whole branch chain */
2241         }
2242 #ifdef RE_TRACK_PATTERN_OFFSETS
2243         else {
2244             DEBUG_r({
2245                 const  regnode *nop = NEXTOPER( convert );
2246                 mjd_offset= Node_Offset((nop));
2247                 mjd_nodelen= Node_Length((nop));
2248             });
2249         }
2250         DEBUG_OPTIMISE_r(
2251             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2252                 (int)depth * 2 + 2, "",
2253                 (UV)mjd_offset, (UV)mjd_nodelen)
2254         );
2255 #endif
2256         /* But first we check to see if there is a common prefix we can
2257            split out as an EXACT and put in front of the TRIE node.  */
2258         trie->startstate= 1;
2259         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2260             U32 state;
2261             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2262                 U32 ofs = 0;
2263                 I32 idx = -1;
2264                 U32 count = 0;
2265                 const U32 base = trie->states[ state ].trans.base;
2266
2267                 if ( trie->states[state].wordnum )
2268                         count = 1;
2269
2270                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2271                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2272                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2273                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2274                     {
2275                         if ( ++count > 1 ) {
2276                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2277                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2278                             if ( state == 1 ) break;
2279                             if ( count == 2 ) {
2280                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2281                                 DEBUG_OPTIMISE_r(
2282                                     PerlIO_printf(Perl_debug_log,
2283                                         "%*sNew Start State=%"UVuf" Class: [",
2284                                         (int)depth * 2 + 2, "",
2285                                         (UV)state));
2286                                 if (idx >= 0) {
2287                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2288                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2289
2290                                     TRIE_BITMAP_SET(trie,*ch);
2291                                     if ( folder )
2292                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2293                                     DEBUG_OPTIMISE_r(
2294                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2295                                     );
2296                                 }
2297                             }
2298                             TRIE_BITMAP_SET(trie,*ch);
2299                             if ( folder )
2300                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2301                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2302                         }
2303                         idx = ofs;
2304                     }
2305                 }
2306                 if ( count == 1 ) {
2307                     SV **tmp = av_fetch( revcharmap, idx, 0);
2308                     STRLEN len;
2309                     char *ch = SvPV( *tmp, len );
2310                     DEBUG_OPTIMISE_r({
2311                         SV *sv=sv_newmortal();
2312                         PerlIO_printf( Perl_debug_log,
2313                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2314                             (int)depth * 2 + 2, "",
2315                             (UV)state, (UV)idx,
2316                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2317                                 PL_colors[0], PL_colors[1],
2318                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2319                                 PERL_PV_ESCAPE_FIRSTCHAR
2320                             )
2321                         );
2322                     });
2323                     if ( state==1 ) {
2324                         OP( convert ) = nodetype;
2325                         str=STRING(convert);
2326                         STR_LEN(convert)=0;
2327                     }
2328                     STR_LEN(convert) += len;
2329                     while (len--)
2330                         *str++ = *ch++;
2331                 } else {
2332 #ifdef DEBUGGING
2333                     if (state>1)
2334                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2335 #endif
2336                     break;
2337                 }
2338             }
2339             trie->prefixlen = (state-1);
2340             if (str) {
2341                 regnode *n = convert+NODE_SZ_STR(convert);
2342                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2343                 trie->startstate = state;
2344                 trie->minlen -= (state - 1);
2345                 trie->maxlen -= (state - 1);
2346 #ifdef DEBUGGING
2347                /* At least the UNICOS C compiler choked on this
2348                 * being argument to DEBUG_r(), so let's just have
2349                 * it right here. */
2350                if (
2351 #ifdef PERL_EXT_RE_BUILD
2352                    1
2353 #else
2354                    DEBUG_r_TEST
2355 #endif
2356                    ) {
2357                    regnode *fix = convert;
2358                    U32 word = trie->wordcount;
2359                    mjd_nodelen++;
2360                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2361                    while( ++fix < n ) {
2362                        Set_Node_Offset_Length(fix, 0, 0);
2363                    }
2364                    while (word--) {
2365                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2366                        if (tmp) {
2367                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2368                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2369                            else
2370                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2371                        }
2372                    }
2373                }
2374 #endif
2375                 if (trie->maxlen) {
2376                     convert = n;
2377                 } else {
2378                     NEXT_OFF(convert) = (U16)(tail - convert);
2379                     DEBUG_r(optimize= n);
2380                 }
2381             }
2382         }
2383         if (!jumper)
2384             jumper = last;
2385         if ( trie->maxlen ) {
2386             NEXT_OFF( convert ) = (U16)(tail - convert);
2387             ARG_SET( convert, data_slot );
2388             /* Store the offset to the first unabsorbed branch in
2389                jump[0], which is otherwise unused by the jump logic.
2390                We use this when dumping a trie and during optimisation. */
2391             if (trie->jump)
2392                 trie->jump[0] = (U16)(nextbranch - convert);
2393
2394             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2395              *   and there is a bitmap
2396              *   and the first "jump target" node we found leaves enough room
2397              * then convert the TRIE node into a TRIEC node, with the bitmap
2398              * embedded inline in the opcode - this is hypothetically faster.
2399              */
2400             if ( !trie->states[trie->startstate].wordnum
2401                  && trie->bitmap
2402                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2403             {
2404                 OP( convert ) = TRIEC;
2405                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2406                 PerlMemShared_free(trie->bitmap);
2407                 trie->bitmap= NULL;
2408             } else
2409                 OP( convert ) = TRIE;
2410
2411             /* store the type in the flags */
2412             convert->flags = nodetype;
2413             DEBUG_r({
2414             optimize = convert
2415                       + NODE_STEP_REGNODE
2416                       + regarglen[ OP( convert ) ];
2417             });
2418             /* XXX We really should free up the resource in trie now,
2419                    as we won't use them - (which resources?) dmq */
2420         }
2421         /* needed for dumping*/
2422         DEBUG_r(if (optimize) {
2423             regnode *opt = convert;
2424
2425             while ( ++opt < optimize) {
2426                 Set_Node_Offset_Length(opt,0,0);
2427             }
2428             /*
2429                 Try to clean up some of the debris left after the
2430                 optimisation.
2431              */
2432             while( optimize < jumper ) {
2433                 mjd_nodelen += Node_Length((optimize));
2434                 OP( optimize ) = OPTIMIZED;
2435                 Set_Node_Offset_Length(optimize,0,0);
2436                 optimize++;
2437             }
2438             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2439         });
2440     } /* end node insert */
2441
2442     /*  Finish populating the prev field of the wordinfo array.  Walk back
2443      *  from each accept state until we find another accept state, and if
2444      *  so, point the first word's .prev field at the second word. If the
2445      *  second already has a .prev field set, stop now. This will be the
2446      *  case either if we've already processed that word's accept state,
2447      *  or that state had multiple words, and the overspill words were
2448      *  already linked up earlier.
2449      */
2450     {
2451         U16 word;
2452         U32 state;
2453         U16 prev;
2454
2455         for (word=1; word <= trie->wordcount; word++) {
2456             prev = 0;
2457             if (trie->wordinfo[word].prev)
2458                 continue;
2459             state = trie->wordinfo[word].accept;
2460             while (state) {
2461                 state = prev_states[state];
2462                 if (!state)
2463                     break;
2464                 prev = trie->states[state].wordnum;
2465                 if (prev)
2466                     break;
2467             }
2468             trie->wordinfo[word].prev = prev;
2469         }
2470         Safefree(prev_states);
2471     }
2472
2473
2474     /* and now dump out the compressed format */
2475     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2476
2477     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2478 #ifdef DEBUGGING
2479     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2480     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2481 #else
2482     SvREFCNT_dec_NN(revcharmap);
2483 #endif
2484     return trie->jump
2485            ? MADE_JUMP_TRIE
2486            : trie->startstate>1
2487              ? MADE_EXACT_TRIE
2488              : MADE_TRIE;
2489 }
2490
2491 STATIC void
2492 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2493 {
2494 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2495
2496    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2497    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2498    ISBN 0-201-10088-6
2499
2500    We find the fail state for each state in the trie, this state is the longest proper
2501    suffix of the current state's 'word' that is also a proper prefix of another word in our
2502    trie. State 1 represents the word '' and is thus the default fail state. This allows
2503    the DFA not to have to restart after its tried and failed a word at a given point, it
2504    simply continues as though it had been matching the other word in the first place.
2505    Consider
2506       'abcdgu'=~/abcdefg|cdgu/
2507    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2508    fail, which would bring us to the state representing 'd' in the second word where we would
2509    try 'g' and succeed, proceeding to match 'cdgu'.
2510  */
2511  /* add a fail transition */
2512     const U32 trie_offset = ARG(source);
2513     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2514     U32 *q;
2515     const U32 ucharcount = trie->uniquecharcount;
2516     const U32 numstates = trie->statecount;
2517     const U32 ubound = trie->lasttrans + ucharcount;
2518     U32 q_read = 0;
2519     U32 q_write = 0;
2520     U32 charid;
2521     U32 base = trie->states[ 1 ].trans.base;
2522     U32 *fail;
2523     reg_ac_data *aho;
2524     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2525     GET_RE_DEBUG_FLAGS_DECL;
2526
2527     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2528 #ifndef DEBUGGING
2529     PERL_UNUSED_ARG(depth);
2530 #endif
2531
2532
2533     ARG_SET( stclass, data_slot );
2534     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2535     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2536     aho->trie=trie_offset;
2537     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2538     Copy( trie->states, aho->states, numstates, reg_trie_state );
2539     Newxz( q, numstates, U32);
2540     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2541     aho->refcount = 1;
2542     fail = aho->fail;
2543     /* initialize fail[0..1] to be 1 so that we always have
2544        a valid final fail state */
2545     fail[ 0 ] = fail[ 1 ] = 1;
2546
2547     for ( charid = 0; charid < ucharcount ; charid++ ) {
2548         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2549         if ( newstate ) {
2550             q[ q_write ] = newstate;
2551             /* set to point at the root */
2552             fail[ q[ q_write++ ] ]=1;
2553         }
2554     }
2555     while ( q_read < q_write) {
2556         const U32 cur = q[ q_read++ % numstates ];
2557         base = trie->states[ cur ].trans.base;
2558
2559         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2560             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2561             if (ch_state) {
2562                 U32 fail_state = cur;
2563                 U32 fail_base;
2564                 do {
2565                     fail_state = fail[ fail_state ];
2566                     fail_base = aho->states[ fail_state ].trans.base;
2567                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2568
2569                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2570                 fail[ ch_state ] = fail_state;
2571                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2572                 {
2573                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2574                 }
2575                 q[ q_write++ % numstates] = ch_state;
2576             }
2577         }
2578     }
2579     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2580        when we fail in state 1, this allows us to use the
2581        charclass scan to find a valid start char. This is based on the principle
2582        that theres a good chance the string being searched contains lots of stuff
2583        that cant be a start char.
2584      */
2585     fail[ 0 ] = fail[ 1 ] = 0;
2586     DEBUG_TRIE_COMPILE_r({
2587         PerlIO_printf(Perl_debug_log,
2588                       "%*sStclass Failtable (%"UVuf" states): 0",
2589                       (int)(depth * 2), "", (UV)numstates
2590         );
2591         for( q_read=1; q_read<numstates; q_read++ ) {
2592             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2593         }
2594         PerlIO_printf(Perl_debug_log, "\n");
2595     });
2596     Safefree(q);
2597     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2598 }
2599
2600
2601 /*
2602  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2603  * These need to be revisited when a newer toolchain becomes available.
2604  */
2605 #if defined(__sparc64__) && defined(__GNUC__)
2606 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2607 #       undef  SPARC64_GCC_WORKAROUND
2608 #       define SPARC64_GCC_WORKAROUND 1
2609 #   endif
2610 #endif
2611
2612 #define DEBUG_PEEP(str,scan,depth) \
2613     DEBUG_OPTIMISE_r({if (scan){ \
2614        SV * const mysv=sv_newmortal(); \
2615        regnode *Next = regnext(scan); \
2616        regprop(RExC_rx, mysv, scan); \
2617        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2618        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2619        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2620    }});
2621
2622
2623 /* The below joins as many adjacent EXACTish nodes as possible into a single
2624  * one.  The regop may be changed if the node(s) contain certain sequences that
2625  * require special handling.  The joining is only done if:
2626  * 1) there is room in the current conglomerated node to entirely contain the
2627  *    next one.
2628  * 2) they are the exact same node type
2629  *
2630  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2631  * these get optimized out
2632  *
2633  * If a node is to match under /i (folded), the number of characters it matches
2634  * can be different than its character length if it contains a multi-character
2635  * fold.  *min_subtract is set to the total delta of the input nodes.
2636  *
2637  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2638  * and contains LATIN SMALL LETTER SHARP S
2639  *
2640  * This is as good a place as any to discuss the design of handling these
2641  * multi-character fold sequences.  It's been wrong in Perl for a very long
2642  * time.  There are three code points in Unicode whose multi-character folds
2643  * were long ago discovered to mess things up.  The previous designs for
2644  * dealing with these involved assigning a special node for them.  This
2645  * approach doesn't work, as evidenced by this example:
2646  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2647  * Both these fold to "sss", but if the pattern is parsed to create a node that
2648  * would match just the \xDF, it won't be able to handle the case where a
2649  * successful match would have to cross the node's boundary.  The new approach
2650  * that hopefully generally solves the problem generates an EXACTFU_SS node
2651  * that is "sss".
2652  *
2653  * It turns out that there are problems with all multi-character folds, and not
2654  * just these three.  Now the code is general, for all such cases, but the
2655  * three still have some special handling.  The approach taken is:
2656  * 1)   This routine examines each EXACTFish node that could contain multi-
2657  *      character fold sequences.  It returns in *min_subtract how much to
2658  *      subtract from the the actual length of the string to get a real minimum
2659  *      match length; it is 0 if there are no multi-char folds.  This delta is
2660  *      used by the caller to adjust the min length of the match, and the delta
2661  *      between min and max, so that the optimizer doesn't reject these
2662  *      possibilities based on size constraints.
2663  * 2)   Certain of these sequences require special handling by the trie code,
2664  *      so, if found, this code changes the joined node type to special ops:
2665  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2666  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2667  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2668  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2669  *      there is a possible fold length change.  That means that a regular
2670  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2671  *      with length changes, and so can be processed faster.  regexec.c takes
2672  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2673  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2674  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2675  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2676  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2677  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2678  *      possibilities for the non-UTF8 patterns are quite simple, except for
2679  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2680  *      members of a fold-pair, and arrays are set up for all of them so that
2681  *      the other member of the pair can be found quickly.  Code elsewhere in
2682  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2683  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2684  *      described in the next item.
2685  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2686  *      'ss' or not is not knowable at compile time.  It will match iff the
2687  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2688  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2689  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2690  *      described in item 3).  An assumption that the optimizer part of
2691  *      regexec.c (probably unwittingly) makes is that a character in the
2692  *      pattern corresponds to at most a single character in the target string.
2693  *      (And I do mean character, and not byte here, unlike other parts of the
2694  *      documentation that have never been updated to account for multibyte
2695  *      Unicode.)  This assumption is wrong only in this case, as all other
2696  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2697  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2698  *      reluctant to try to change this assumption, so instead the code punts.
2699  *      This routine examines EXACTF nodes for the sharp s, and returns a
2700  *      boolean indicating whether or not the node is an EXACTF node that
2701  *      contains a sharp s.  When it is true, the caller sets a flag that later
2702  *      causes the optimizer in this file to not set values for the floating
2703  *      and fixed string lengths, and thus avoids the optimizer code in
2704  *      regexec.c that makes the invalid assumption.  Thus, there is no
2705  *      optimization based on string lengths for EXACTF nodes that contain the
2706  *      sharp s.  This only happens for /id rules (which means the pattern
2707  *      isn't in UTF-8).
2708  */
2709
2710 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2711     if (PL_regkind[OP(scan)] == EXACT) \
2712         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2713
2714 STATIC U32
2715 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2716     /* Merge several consecutive EXACTish nodes into one. */
2717     regnode *n = regnext(scan);
2718     U32 stringok = 1;
2719     regnode *next = scan + NODE_SZ_STR(scan);
2720     U32 merged = 0;
2721     U32 stopnow = 0;
2722 #ifdef DEBUGGING
2723     regnode *stop = scan;
2724     GET_RE_DEBUG_FLAGS_DECL;
2725 #else
2726     PERL_UNUSED_ARG(depth);
2727 #endif
2728
2729     PERL_ARGS_ASSERT_JOIN_EXACT;
2730 #ifndef EXPERIMENTAL_INPLACESCAN
2731     PERL_UNUSED_ARG(flags);
2732     PERL_UNUSED_ARG(val);
2733 #endif
2734     DEBUG_PEEP("join",scan,depth);
2735
2736     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2737      * EXACT ones that are mergeable to the current one. */
2738     while (n
2739            && (PL_regkind[OP(n)] == NOTHING
2740                || (stringok && OP(n) == OP(scan)))
2741            && NEXT_OFF(n)
2742            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2743     {
2744
2745         if (OP(n) == TAIL || n > next)
2746             stringok = 0;
2747         if (PL_regkind[OP(n)] == NOTHING) {
2748             DEBUG_PEEP("skip:",n,depth);
2749             NEXT_OFF(scan) += NEXT_OFF(n);
2750             next = n + NODE_STEP_REGNODE;
2751 #ifdef DEBUGGING
2752             if (stringok)
2753                 stop = n;
2754 #endif
2755             n = regnext(n);
2756         }
2757         else if (stringok) {
2758             const unsigned int oldl = STR_LEN(scan);
2759             regnode * const nnext = regnext(n);
2760
2761             /* XXX I (khw) kind of doubt that this works on platforms where
2762              * U8_MAX is above 255 because of lots of other assumptions */
2763             /* Don't join if the sum can't fit into a single node */
2764             if (oldl + STR_LEN(n) > U8_MAX)
2765                 break;
2766
2767             DEBUG_PEEP("merg",n,depth);
2768             merged++;
2769
2770             NEXT_OFF(scan) += NEXT_OFF(n);
2771             STR_LEN(scan) += STR_LEN(n);
2772             next = n + NODE_SZ_STR(n);
2773             /* Now we can overwrite *n : */
2774             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2775 #ifdef DEBUGGING
2776             stop = next - 1;
2777 #endif
2778             n = nnext;
2779             if (stopnow) break;
2780         }
2781
2782 #ifdef EXPERIMENTAL_INPLACESCAN
2783         if (flags && !NEXT_OFF(n)) {
2784             DEBUG_PEEP("atch", val, depth);
2785             if (reg_off_by_arg[OP(n)]) {
2786                 ARG_SET(n, val - n);
2787             }
2788             else {
2789                 NEXT_OFF(n) = val - n;
2790             }
2791             stopnow = 1;
2792         }
2793 #endif
2794     }
2795
2796     *min_subtract = 0;
2797     *has_exactf_sharp_s = FALSE;
2798
2799     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2800      * can now analyze for sequences of problematic code points.  (Prior to
2801      * this final joining, sequences could have been split over boundaries, and
2802      * hence missed).  The sequences only happen in folding, hence for any
2803      * non-EXACT EXACTish node */
2804     if (OP(scan) != EXACT) {
2805         const U8 * const s0 = (U8*) STRING(scan);
2806         const U8 * s = s0;
2807         const U8 * const s_end = s0 + STR_LEN(scan);
2808
2809         /* One pass is made over the node's string looking for all the
2810          * possibilities.  to avoid some tests in the loop, there are two main
2811          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2812          * non-UTF-8 */
2813         if (UTF) {
2814
2815             /* Examine the string for a multi-character fold sequence.  UTF-8
2816              * patterns have all characters pre-folded by the time this code is
2817              * executed */
2818             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2819                                      length sequence we are looking for is 2 */
2820             {
2821                 int count = 0;
2822                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2823                 if (! len) {    /* Not a multi-char fold: get next char */
2824                     s += UTF8SKIP(s);
2825                     continue;
2826                 }
2827
2828                 /* Nodes with 'ss' require special handling, except for EXACTFL
2829                  * and EXACTFA for which there is no multi-char fold to this */
2830                 if (len == 2 && *s == 's' && *(s+1) == 's'
2831                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2832                 {
2833                     count = 2;
2834                     OP(scan) = EXACTFU_SS;
2835                     s += 2;
2836                 }
2837                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2838                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2839                                       COMBINING_DIAERESIS_UTF8
2840                                       COMBINING_ACUTE_ACCENT_UTF8,
2841                                    6)
2842                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2843                                          COMBINING_DIAERESIS_UTF8
2844                                          COMBINING_ACUTE_ACCENT_UTF8,
2845                                      6)))
2846                 {
2847                     count = 3;
2848
2849                     /* These two folds require special handling by trie's, so
2850                      * change the node type to indicate this.  If EXACTFA and
2851                      * EXACTFL were ever to be handled by trie's, this would
2852                      * have to be changed.  If this node has already been
2853                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2854                      * (khw) think it doesn't matter in regexec.c for UTF
2855                      * patterns, but no need to change it */
2856                     if (OP(scan) == EXACTFU) {
2857                         OP(scan) = EXACTFU_TRICKYFOLD;
2858                     }
2859                     s += 6;
2860                 }
2861                 else { /* Here is a generic multi-char fold. */
2862                     const U8* multi_end  = s + len;
2863
2864                     /* Count how many characters in it.  In the case of /l and
2865                      * /aa, no folds which contain ASCII code points are
2866                      * allowed, so check for those, and skip if found.  (In
2867                      * EXACTFL, no folds are allowed to any Latin1 code point,
2868                      * not just ASCII.  But there aren't any of these
2869                      * currently, nor ever likely, so don't take the time to
2870                      * test for them.  The code that generates the
2871                      * is_MULTI_foo() macros croaks should one actually get put
2872                      * into Unicode .) */
2873                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2874                         count = utf8_length(s, multi_end);
2875                         s = multi_end;
2876                     }
2877                     else {
2878                         while (s < multi_end) {
2879                             if (isASCII(*s)) {
2880                                 s++;
2881                                 goto next_iteration;
2882                             }
2883                             else {
2884                                 s += UTF8SKIP(s);
2885                             }
2886                             count++;
2887                         }
2888                     }
2889                 }
2890
2891                 /* The delta is how long the sequence is minus 1 (1 is how long
2892                  * the character that folds to the sequence is) */
2893                 *min_subtract += count - 1;
2894             next_iteration: ;
2895             }
2896         }
2897         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2898
2899             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2900              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2901              * nodes can't have multi-char folds to this range (and there are
2902              * no existing ones in the upper latin1 range).  In the EXACTF
2903              * case we look also for the sharp s, which can be in the final
2904              * position.  Otherwise we can stop looking 1 byte earlier because
2905              * have to find at least two characters for a multi-fold */
2906             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2907
2908             /* The below is perhaps overboard, but this allows us to save a
2909              * test each time through the loop at the expense of a mask.  This
2910              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2911              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2912              * are 64.  This uses an exclusive 'or' to find that bit and then
2913              * inverts it to form a mask, with just a single 0, in the bit
2914              * position where 'S' and 's' differ. */
2915             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2916             const U8 s_masked = 's' & S_or_s_mask;
2917
2918             while (s < upper) {
2919                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2920                 if (! len) {    /* Not a multi-char fold. */
2921                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2922                     {
2923                         *has_exactf_sharp_s = TRUE;
2924                     }
2925                     s++;
2926                     continue;
2927                 }
2928
2929                 if (len == 2
2930                     && ((*s & S_or_s_mask) == s_masked)
2931                     && ((*(s+1) & S_or_s_mask) == s_masked))
2932                 {
2933
2934                     /* EXACTF nodes need to know that the minimum length
2935                      * changed so that a sharp s in the string can match this
2936                      * ss in the pattern, but they remain EXACTF nodes, as they
2937                      * won't match this unless the target string is is UTF-8,
2938                      * which we don't know until runtime */
2939                     if (OP(scan) != EXACTF) {
2940                         OP(scan) = EXACTFU_SS;
2941                     }
2942                 }
2943
2944                 *min_subtract += len - 1;
2945                 s += len;
2946             }
2947         }
2948     }
2949
2950 #ifdef DEBUGGING
2951     /* Allow dumping but overwriting the collection of skipped
2952      * ops and/or strings with fake optimized ops */
2953     n = scan + NODE_SZ_STR(scan);
2954     while (n <= stop) {
2955         OP(n) = OPTIMIZED;
2956         FLAGS(n) = 0;
2957         NEXT_OFF(n) = 0;
2958         n++;
2959     }
2960 #endif
2961     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2962     return stopnow;
2963 }
2964
2965 /* REx optimizer.  Converts nodes into quicker variants "in place".
2966    Finds fixed substrings.  */
2967
2968 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2969    to the position after last scanned or to NULL. */
2970
2971 #define INIT_AND_WITHP \
2972     assert(!and_withp); \
2973     Newx(and_withp,1,struct regnode_charclass_class); \
2974     SAVEFREEPV(and_withp)
2975
2976 /* this is a chain of data about sub patterns we are processing that
2977    need to be handled separately/specially in study_chunk. Its so
2978    we can simulate recursion without losing state.  */
2979 struct scan_frame;
2980 typedef struct scan_frame {
2981     regnode *last;  /* last node to process in this frame */
2982     regnode *next;  /* next node to process when last is reached */
2983     struct scan_frame *prev; /*previous frame*/
2984     I32 stop; /* what stopparen do we use */
2985 } scan_frame;
2986
2987
2988 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2989
2990 STATIC I32
2991 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2992                         I32 *minlenp, I32 *deltap,
2993                         regnode *last,
2994                         scan_data_t *data,
2995                         I32 stopparen,
2996                         U8* recursed,
2997                         struct regnode_charclass_class *and_withp,
2998                         U32 flags, U32 depth)
2999                         /* scanp: Start here (read-write). */
3000                         /* deltap: Write maxlen-minlen here. */
3001                         /* last: Stop before this one. */
3002                         /* data: string data about the pattern */
3003                         /* stopparen: treat close N as END */
3004                         /* recursed: which subroutines have we recursed into */
3005                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3006 {
3007     dVAR;
3008     I32 min = 0;    /* There must be at least this number of characters to match */
3009     I32 pars = 0, code;
3010     regnode *scan = *scanp, *next;
3011     I32 delta = 0;
3012     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3013     int is_inf_internal = 0;            /* The studied chunk is infinite */
3014     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3015     scan_data_t data_fake;
3016     SV *re_trie_maxbuff = NULL;
3017     regnode *first_non_open = scan;
3018     I32 stopmin = I32_MAX;
3019     scan_frame *frame = NULL;
3020     GET_RE_DEBUG_FLAGS_DECL;
3021
3022     PERL_ARGS_ASSERT_STUDY_CHUNK;
3023
3024 #ifdef DEBUGGING
3025     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3026 #endif
3027
3028     if ( depth == 0 ) {
3029         while (first_non_open && OP(first_non_open) == OPEN)
3030             first_non_open=regnext(first_non_open);
3031     }
3032
3033
3034   fake_study_recurse:
3035     while ( scan && OP(scan) != END && scan < last ){
3036         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3037                                    node length to get a real minimum (because
3038                                    the folded version may be shorter) */
3039         bool has_exactf_sharp_s = FALSE;
3040         /* Peephole optimizer: */
3041         DEBUG_STUDYDATA("Peep:", data,depth);
3042         DEBUG_PEEP("Peep",scan,depth);
3043
3044         /* Its not clear to khw or hv why this is done here, and not in the
3045          * clauses that deal with EXACT nodes.  khw's guess is that it's
3046          * because of a previous design */
3047         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3048
3049         /* Follow the next-chain of the current node and optimize
3050            away all the NOTHINGs from it.  */
3051         if (OP(scan) != CURLYX) {
3052             const int max = (reg_off_by_arg[OP(scan)]
3053                        ? I32_MAX
3054                        /* I32 may be smaller than U16 on CRAYs! */
3055                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3056             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3057             int noff;
3058             regnode *n = scan;
3059
3060             /* Skip NOTHING and LONGJMP. */
3061             while ((n = regnext(n))
3062                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3063                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3064                    && off + noff < max)
3065                 off += noff;
3066             if (reg_off_by_arg[OP(scan)])
3067                 ARG(scan) = off;
3068             else
3069                 NEXT_OFF(scan) = off;
3070         }
3071
3072
3073
3074         /* The principal pseudo-switch.  Cannot be a switch, since we
3075            look into several different things.  */
3076         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3077                    || OP(scan) == IFTHEN) {
3078             next = regnext(scan);
3079             code = OP(scan);
3080             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3081
3082             if (OP(next) == code || code == IFTHEN) {
3083                 /* NOTE - There is similar code to this block below for handling
3084                    TRIE nodes on a re-study.  If you change stuff here check there
3085                    too. */
3086                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3087                 struct regnode_charclass_class accum;
3088                 regnode * const startbranch=scan;
3089
3090                 if (flags & SCF_DO_SUBSTR)
3091                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3092                 if (flags & SCF_DO_STCLASS)
3093                     cl_init_zero(pRExC_state, &accum);
3094
3095                 while (OP(scan) == code) {
3096                     I32 deltanext, minnext, f = 0, fake;
3097                     struct regnode_charclass_class this_class;
3098
3099                     num++;
3100                     data_fake.flags = 0;
3101                     if (data) {
3102                         data_fake.whilem_c = data->whilem_c;
3103                         data_fake.last_closep = data->last_closep;
3104                     }
3105                     else
3106                         data_fake.last_closep = &fake;
3107
3108                     data_fake.pos_delta = delta;
3109                     next = regnext(scan);
3110                     scan = NEXTOPER(scan);
3111                     if (code != BRANCH)
3112                         scan = NEXTOPER(scan);
3113                     if (flags & SCF_DO_STCLASS) {
3114                         cl_init(pRExC_state, &this_class);
3115                         data_fake.start_class = &this_class;
3116                         f = SCF_DO_STCLASS_AND;
3117                     }
3118                     if (flags & SCF_WHILEM_VISITED_POS)
3119                         f |= SCF_WHILEM_VISITED_POS;
3120
3121                     /* we suppose the run is continuous, last=next...*/
3122                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3123                                           next, &data_fake,
3124                                           stopparen, recursed, NULL, f,depth+1);
3125                     if (min1 > minnext)
3126                         min1 = minnext;
3127                     if (max1 < minnext + deltanext)
3128                         max1 = minnext + deltanext;
3129                     if (deltanext == I32_MAX)
3130                         is_inf = is_inf_internal = 1;
3131                     scan = next;
3132                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3133                         pars++;
3134                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3135                         if ( stopmin > minnext)
3136                             stopmin = min + min1;
3137                         flags &= ~SCF_DO_SUBSTR;
3138                         if (data)
3139                             data->flags |= SCF_SEEN_ACCEPT;
3140                     }
3141                     if (data) {
3142                         if (data_fake.flags & SF_HAS_EVAL)
3143                             data->flags |= SF_HAS_EVAL;
3144                         data->whilem_c = data_fake.whilem_c;
3145                     }
3146                     if (flags & SCF_DO_STCLASS)
3147                         cl_or(pRExC_state, &accum, &this_class);
3148                 }
3149                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3150                     min1 = 0;
3151                 if (flags & SCF_DO_SUBSTR) {
3152                     data->pos_min += min1;
3153                     data->pos_delta += max1 - min1;
3154                     if (max1 != min1 || is_inf)
3155                         data->longest = &(data->longest_float);
3156                 }
3157                 min += min1;
3158                 delta += max1 - min1;
3159                 if (flags & SCF_DO_STCLASS_OR) {
3160                     cl_or(pRExC_state, data->start_class, &accum);
3161                     if (min1) {
3162                         cl_and(data->start_class, and_withp);
3163                         flags &= ~SCF_DO_STCLASS;
3164                     }
3165                 }
3166                 else if (flags & SCF_DO_STCLASS_AND) {
3167                     if (min1) {
3168                         cl_and(data->start_class, &accum);
3169                         flags &= ~SCF_DO_STCLASS;
3170                     }
3171                     else {
3172                         /* Switch to OR mode: cache the old value of
3173                          * data->start_class */
3174                         INIT_AND_WITHP;
3175                         StructCopy(data->start_class, and_withp,
3176                                    struct regnode_charclass_class);
3177                         flags &= ~SCF_DO_STCLASS_AND;
3178                         StructCopy(&accum, data->start_class,
3179                                    struct regnode_charclass_class);
3180                         flags |= SCF_DO_STCLASS_OR;
3181                         SET_SSC_EOS(data->start_class);
3182                     }
3183                 }
3184
3185                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3186                 /* demq.
3187
3188                    Assuming this was/is a branch we are dealing with: 'scan' now
3189                    points at the item that follows the branch sequence, whatever
3190                    it is. We now start at the beginning of the sequence and look
3191                    for subsequences of
3192
3193                    BRANCH->EXACT=>x1
3194                    BRANCH->EXACT=>x2
3195                    tail
3196
3197                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3198
3199                    If we can find such a subsequence we need to turn the first
3200                    element into a trie and then add the subsequent branch exact
3201                    strings to the trie.
3202
3203                    We have two cases
3204
3205                      1. patterns where the whole set of branches can be converted.
3206
3207                      2. patterns where only a subset can be converted.
3208
3209                    In case 1 we can replace the whole set with a single regop
3210                    for the trie. In case 2 we need to keep the start and end
3211                    branches so
3212
3213                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3214                      becomes BRANCH TRIE; BRANCH X;
3215
3216                   There is an additional case, that being where there is a
3217                   common prefix, which gets split out into an EXACT like node
3218                   preceding the TRIE node.
3219
3220                   If x(1..n)==tail then we can do a simple trie, if not we make
3221                   a "jump" trie, such that when we match the appropriate word
3222                   we "jump" to the appropriate tail node. Essentially we turn
3223                   a nested if into a case structure of sorts.
3224
3225                 */
3226
3227                     int made=0;
3228                     if (!re_trie_maxbuff) {
3229                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3230                         if (!SvIOK(re_trie_maxbuff))
3231                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3232                     }
3233                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3234                         regnode *cur;
3235                         regnode *first = (regnode *)NULL;
3236                         regnode *last = (regnode *)NULL;
3237                         regnode *tail = scan;
3238                         U8 trietype = 0;
3239                         U32 count=0;
3240
3241 #ifdef DEBUGGING
3242                         SV * const mysv = sv_newmortal();       /* for dumping */
3243 #endif
3244                         /* var tail is used because there may be a TAIL
3245                            regop in the way. Ie, the exacts will point to the
3246                            thing following the TAIL, but the last branch will
3247                            point at the TAIL. So we advance tail. If we
3248                            have nested (?:) we may have to move through several
3249                            tails.
3250                          */
3251
3252                         while ( OP( tail ) == TAIL ) {
3253                             /* this is the TAIL generated by (?:) */
3254                             tail = regnext( tail );
3255                         }
3256
3257
3258                         DEBUG_TRIE_COMPILE_r({
3259                             regprop(RExC_rx, mysv, tail );
3260                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3261                                 (int)depth * 2 + 2, "",
3262                                 "Looking for TRIE'able sequences. Tail node is: ",
3263                                 SvPV_nolen_const( mysv )
3264                             );
3265                         });
3266
3267                         /*
3268
3269                             Step through the branches
3270                                 cur represents each branch,
3271                                 noper is the first thing to be matched as part of that branch
3272                                 noper_next is the regnext() of that node.
3273
3274                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3275                             via a "jump trie" but we also support building with NOJUMPTRIE,
3276                             which restricts the trie logic to structures like /FOO|BAR/.
3277
3278                             If noper is a trieable nodetype then the branch is a possible optimization
3279                             target. If we are building under NOJUMPTRIE then we require that noper_next
3280                             is the same as scan (our current position in the regex program).
3281
3282                             Once we have two or more consecutive such branches we can create a
3283                             trie of the EXACT's contents and stitch it in place into the program.
3284
3285                             If the sequence represents all of the branches in the alternation we
3286                             replace the entire thing with a single TRIE node.
3287
3288                             Otherwise when it is a subsequence we need to stitch it in place and
3289                             replace only the relevant branches. This means the first branch has
3290                             to remain as it is used by the alternation logic, and its next pointer,
3291                             and needs to be repointed at the item on the branch chain following
3292                             the last branch we have optimized away.
3293
3294                             This could be either a BRANCH, in which case the subsequence is internal,
3295                             or it could be the item following the branch sequence in which case the
3296                             subsequence is at the end (which does not necessarily mean the first node
3297                             is the start of the alternation).
3298
3299                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3300
3301                                 optype          |  trietype
3302                                 ----------------+-----------
3303                                 NOTHING         | NOTHING
3304                                 EXACT           | EXACT
3305                                 EXACTFU         | EXACTFU
3306                                 EXACTFU_SS      | EXACTFU
3307                                 EXACTFU_TRICKYFOLD | EXACTFU
3308                                 EXACTFA         | 0
3309
3310
3311                         */
3312 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3313                        ( EXACT == (X) )   ? EXACT :        \
3314                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3315                        0 )
3316
3317                         /* dont use tail as the end marker for this traverse */
3318                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3319                             regnode * const noper = NEXTOPER( cur );
3320                             U8 noper_type = OP( noper );
3321                             U8 noper_trietype = TRIE_TYPE( noper_type );
3322 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3323                             regnode * const noper_next = regnext( noper );
3324                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3325                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3326 #endif
3327
3328                             DEBUG_TRIE_COMPILE_r({
3329                                 regprop(RExC_rx, mysv, cur);
3330                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3331                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3332
3333                                 regprop(RExC_rx, mysv, noper);
3334                                 PerlIO_printf( Perl_debug_log, " -> %s",
3335                                     SvPV_nolen_const(mysv));
3336
3337                                 if ( noper_next ) {
3338                                   regprop(RExC_rx, mysv, noper_next );
3339                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3340                                     SvPV_nolen_const(mysv));
3341                                 }
3342                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3343                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3344                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3345                                 );
3346                             });
3347
3348                             /* Is noper a trieable nodetype that can be merged with the
3349                              * current trie (if there is one)? */
3350                             if ( noper_trietype
3351                                   &&
3352                                   (
3353                                         ( noper_trietype == NOTHING)
3354                                         || ( trietype == NOTHING )
3355                                         || ( trietype == noper_trietype )
3356                                   )
3357 #ifdef NOJUMPTRIE
3358                                   && noper_next == tail
3359 #endif
3360                                   && count < U16_MAX)
3361                             {
3362                                 /* Handle mergable triable node
3363                                  * Either we are the first node in a new trieable sequence,
3364                                  * in which case we do some bookkeeping, otherwise we update
3365                                  * the end pointer. */
3366                                 if ( !first ) {
3367                                     first = cur;
3368                                     if ( noper_trietype == NOTHING ) {
3369 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3370                                         regnode * const noper_next = regnext( noper );
3371                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3372                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3373 #endif
3374
3375                                         if ( noper_next_trietype ) {
3376                                             trietype = noper_next_trietype;
3377                                         } else if (noper_next_type)  {
3378                                             /* a NOTHING regop is 1 regop wide. We need at least two
3379                                              * for a trie so we can't merge this in */
3380                                             first = NULL;
3381                                         }
3382                                     } else {
3383                                         trietype = noper_trietype;
3384                                     }
3385                                 } else {
3386                                     if ( trietype == NOTHING )
3387                                         trietype = noper_trietype;
3388                                     last = cur;
3389                                 }
3390                                 if (first)
3391                                     count++;
3392                             } /* end handle mergable triable node */
3393                             else {
3394                                 /* handle unmergable node -
3395                                  * noper may either be a triable node which can not be tried
3396                                  * together with the current trie, or a non triable node */
3397                                 if ( last ) {
3398                                     /* If last is set and trietype is not NOTHING then we have found
3399                                      * at least two triable branch sequences in a row of a similar
3400                                      * trietype so we can turn them into a trie. If/when we
3401                                      * allow NOTHING to start a trie sequence this condition will be
3402                                      * required, and it isn't expensive so we leave it in for now. */
3403                                     if ( trietype && trietype != NOTHING )
3404                                         make_trie( pRExC_state,
3405                                                 startbranch, first, cur, tail, count,
3406                                                 trietype, depth+1 );
3407                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3408                                 }
3409                                 if ( noper_trietype
3410 #ifdef NOJUMPTRIE
3411                                      && noper_next == tail
3412 #endif
3413                                 ){
3414                                     /* noper is triable, so we can start a new trie sequence */
3415                                     count = 1;
3416                                     first = cur;
3417                                     trietype = noper_trietype;
3418                                 } else if (first) {
3419                                     /* if we already saw a first but the current node is not triable then we have
3420                                      * to reset the first information. */
3421                                     count = 0;
3422                                     first = NULL;
3423                                     trietype = 0;
3424                                 }
3425                             } /* end handle unmergable node */
3426                         } /* loop over branches */
3427                         DEBUG_TRIE_COMPILE_r({
3428                             regprop(RExC_rx, mysv, cur);
3429                             PerlIO_printf( Perl_debug_log,
3430                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3431                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3432
3433                         });
3434                         if ( last && trietype ) {
3435                             if ( trietype != NOTHING ) {
3436                                 /* the last branch of the sequence was part of a trie,
3437                                  * so we have to construct it here outside of the loop
3438                                  */
3439                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3440 #ifdef TRIE_STUDY_OPT
3441                                 if ( ((made == MADE_EXACT_TRIE &&
3442                                      startbranch == first)
3443                                      || ( first_non_open == first )) &&
3444                                      depth==0 ) {
3445                                     flags |= SCF_TRIE_RESTUDY;
3446                                     if ( startbranch == first
3447                                          && scan == tail )
3448                                     {
3449                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3450                                     }
3451                                 }
3452 #endif
3453                             } else {
3454                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3455                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3456                                  */
3457                                 if ( startbranch == first ) {
3458                                     regnode *opt;
3459                                     /* the entire thing is a NOTHING sequence, something like this:
3460                                      * (?:|) So we can turn it into a plain NOTHING op. */
3461                                     DEBUG_TRIE_COMPILE_r({
3462                                         regprop(RExC_rx, mysv, cur);
3463                                         PerlIO_printf( Perl_debug_log,
3464                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3465                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3466
3467                                     });
3468                                     OP(startbranch)= NOTHING;
3469                                     NEXT_OFF(startbranch)= tail - startbranch;
3470                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3471                                         OP(opt)= OPTIMIZED;
3472                                 }
3473                             }
3474                         } /* end if ( last) */
3475                     } /* TRIE_MAXBUF is non zero */
3476
3477                 } /* do trie */
3478
3479             }
3480             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3481                 scan = NEXTOPER(NEXTOPER(scan));
3482             } else                      /* single branch is optimized. */
3483                 scan = NEXTOPER(scan);
3484             continue;
3485         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3486             scan_frame *newframe = NULL;
3487             I32 paren;
3488             regnode *start;
3489             regnode *end;
3490
3491             if (OP(scan) != SUSPEND) {
3492             /* set the pointer */
3493                 if (OP(scan) == GOSUB) {
3494                     paren = ARG(scan);
3495                     RExC_recurse[ARG2L(scan)] = scan;
3496                     start = RExC_open_parens[paren-1];
3497                     end   = RExC_close_parens[paren-1];
3498                 } else {
3499                     paren = 0;
3500                     start = RExC_rxi->program + 1;
3501                     end   = RExC_opend;
3502                 }
3503                 if (!recursed) {
3504                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3505                     SAVEFREEPV(recursed);
3506                 }
3507                 if (!PAREN_TEST(recursed,paren+1)) {
3508                     PAREN_SET(recursed,paren+1);
3509                     Newx(newframe,1,scan_frame);
3510                 } else {
3511                     if (flags & SCF_DO_SUBSTR) {
3512                         SCAN_COMMIT(pRExC_state,data,minlenp);
3513                         data->longest = &(data->longest_float);
3514                     }
3515                     is_inf = is_inf_internal = 1;
3516                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3517                         cl_anything(pRExC_state, data->start_class);
3518                     flags &= ~SCF_DO_STCLASS;
3519                 }
3520             } else {
3521                 Newx(newframe,1,scan_frame);
3522                 paren = stopparen;
3523                 start = scan+2;
3524                 end = regnext(scan);
3525             }
3526             if (newframe) {
3527                 assert(start);
3528                 assert(end);
3529                 SAVEFREEPV(newframe);
3530                 newframe->next = regnext(scan);
3531                 newframe->last = last;
3532                 newframe->stop = stopparen;
3533                 newframe->prev = frame;
3534
3535                 frame = newframe;
3536                 scan =  start;
3537                 stopparen = paren;
3538                 last = end;
3539
3540                 continue;
3541             }
3542         }
3543         else if (OP(scan) == EXACT) {
3544             I32 l = STR_LEN(scan);
3545             UV uc;
3546             if (UTF) {
3547                 const U8 * const s = (U8*)STRING(scan);
3548                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3549                 l = utf8_length(s, s + l);
3550             } else {
3551                 uc = *((U8*)STRING(scan));
3552             }
3553             min += l;
3554             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3555                 /* The code below prefers earlier match for fixed
3556                    offset, later match for variable offset.  */
3557                 if (data->last_end == -1) { /* Update the start info. */
3558                     data->last_start_min = data->pos_min;
3559                     data->last_start_max = is_inf
3560                         ? I32_MAX : data->pos_min + data->pos_delta;
3561                 }
3562                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3563                 if (UTF)
3564                     SvUTF8_on(data->last_found);
3565                 {
3566                     SV * const sv = data->last_found;
3567                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3568                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3569                     if (mg && mg->mg_len >= 0)
3570                         mg->mg_len += utf8_length((U8*)STRING(scan),
3571                                                   (U8*)STRING(scan)+STR_LEN(scan));
3572                 }
3573                 data->last_end = data->pos_min + l;
3574                 data->pos_min += l; /* As in the first entry. */
3575                 data->flags &= ~SF_BEFORE_EOL;
3576             }
3577             if (flags & SCF_DO_STCLASS_AND) {
3578                 /* Check whether it is compatible with what we know already! */
3579                 int compat = 1;
3580
3581
3582                 /* If compatible, we or it in below.  It is compatible if is
3583                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3584                  * it's for a locale.  Even if there isn't unicode semantics
3585                  * here, at runtime there may be because of matching against a
3586                  * utf8 string, so accept a possible false positive for
3587                  * latin1-range folds */
3588                 if (uc >= 0x100 ||
3589                     (!(data->start_class->flags & ANYOF_LOCALE)
3590                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3591                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3592                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3593                     )
3594                 {
3595                     compat = 0;
3596                 }
3597                 ANYOF_CLASS_ZERO(data->start_class);
3598                 ANYOF_BITMAP_ZERO(data->start_class);
3599                 if (compat)
3600                     ANYOF_BITMAP_SET(data->start_class, uc);
3601                 else if (uc >= 0x100) {
3602                     int i;
3603
3604                     /* Some Unicode code points fold to the Latin1 range; as
3605                      * XXX temporary code, instead of figuring out if this is
3606                      * one, just assume it is and set all the start class bits
3607                      * that could be some such above 255 code point's fold
3608                      * which will generate fals positives.  As the code
3609                      * elsewhere that does compute the fold settles down, it
3610                      * can be extracted out and re-used here */
3611                     for (i = 0; i < 256; i++){
3612                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3613                             ANYOF_BITMAP_SET(data->start_class, i);
3614                         }
3615                     }
3616                 }
3617                 CLEAR_SSC_EOS(data->start_class);
3618                 if (uc < 0x100)
3619                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3620             }
3621             else if (flags & SCF_DO_STCLASS_OR) {
3622                 /* false positive possible if the class is case-folded */
3623                 if (uc < 0x100)
3624                     ANYOF_BITMAP_SET(data->start_class, uc);
3625                 else
3626                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3627                 CLEAR_SSC_EOS(data->start_class);
3628                 cl_and(data->start_class, and_withp);
3629             }
3630             flags &= ~SCF_DO_STCLASS;
3631         }
3632         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3633             I32 l = STR_LEN(scan);
3634             UV uc = *((U8*)STRING(scan));
3635
3636             /* Search for fixed substrings supports EXACT only. */
3637             if (flags & SCF_DO_SUBSTR) {
3638                 assert(data);
3639                 SCAN_COMMIT(pRExC_state, data, minlenp);
3640             }
3641             if (UTF) {
3642                 const U8 * const s = (U8 *)STRING(scan);
3643                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3644                 l = utf8_length(s, s + l);
3645             }
3646             if (has_exactf_sharp_s) {
3647                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3648             }
3649             min += l - min_subtract;
3650             assert (min >= 0);
3651             delta += min_subtract;
3652             if (flags & SCF_DO_SUBSTR) {
3653                 data->pos_min += l - min_subtract;
3654                 if (data->pos_min < 0) {
3655                     data->pos_min = 0;
3656                 }
3657                 data->pos_delta += min_subtract;
3658                 if (min_subtract) {
3659                     data->longest = &(data->longest_float);
3660                 }
3661             }
3662             if (flags & SCF_DO_STCLASS_AND) {
3663                 /* Check whether it is compatible with what we know already! */
3664                 int compat = 1;
3665                 if (uc >= 0x100 ||
3666                  (!(data->start_class->flags & ANYOF_LOCALE)
3667                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3668                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3669                 {
3670                     compat = 0;
3671                 }
3672                 ANYOF_CLASS_ZERO(data->start_class);
3673                 ANYOF_BITMAP_ZERO(data->start_class);
3674                 if (compat) {
3675                     ANYOF_BITMAP_SET(data->start_class, uc);
3676                     CLEAR_SSC_EOS(data->start_class);
3677                     if (OP(scan) == EXACTFL) {
3678                         /* XXX This set is probably no longer necessary, and
3679                          * probably wrong as LOCALE now is on in the initial
3680                          * state */
3681                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3682                     }
3683                     else {
3684
3685                         /* Also set the other member of the fold pair.  In case
3686                          * that unicode semantics is called for at runtime, use
3687                          * the full latin1 fold.  (Can't do this for locale,
3688                          * because not known until runtime) */
3689                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3690
3691                         /* All other (EXACTFL handled above) folds except under
3692                          * /iaa that include s, S, and sharp_s also may include
3693                          * the others */
3694                         if (OP(scan) != EXACTFA) {
3695                             if (uc == 's' || uc == 'S') {
3696                                 ANYOF_BITMAP_SET(data->start_class,
3697                                                  LATIN_SMALL_LETTER_SHARP_S);
3698                             }
3699                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3700                                 ANYOF_BITMAP_SET(data->start_class, 's');
3701                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3702                             }
3703                         }
3704                     }
3705                 }
3706                 else if (uc >= 0x100) {
3707                     int i;
3708                     for (i = 0; i < 256; i++){
3709                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3710                             ANYOF_BITMAP_SET(data->start_class, i);
3711                         }
3712                     }
3713                 }
3714             }
3715             else if (flags & SCF_DO_STCLASS_OR) {
3716                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3717                     /* false positive possible if the class is case-folded.
3718                        Assume that the locale settings are the same... */
3719                     if (uc < 0x100) {
3720                         ANYOF_BITMAP_SET(data->start_class, uc);
3721                         if (OP(scan) != EXACTFL) {
3722
3723                             /* And set the other member of the fold pair, but
3724                              * can't do that in locale because not known until
3725                              * run-time */
3726                             ANYOF_BITMAP_SET(data->start_class,
3727                                              PL_fold_latin1[uc]);
3728
3729                             /* All folds except under /iaa that include s, S,
3730                              * and sharp_s also may include the others */
3731                             if (OP(scan) != EXACTFA) {
3732                                 if (uc == 's' || uc == 'S') {
3733                                     ANYOF_BITMAP_SET(data->start_class,
3734                                                    LATIN_SMALL_LETTER_SHARP_S);
3735                                 }
3736                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3737                                     ANYOF_BITMAP_SET(data->start_class, 's');
3738                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3739                                 }
3740                             }
3741                         }
3742                     }
3743                     CLEAR_SSC_EOS(data->start_class);
3744                 }
3745                 cl_and(data->start_class, and_withp);
3746             }
3747             flags &= ~SCF_DO_STCLASS;
3748         }
3749         else if (REGNODE_VARIES(OP(scan))) {
3750             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3751             I32 f = flags, pos_before = 0;
3752             regnode * const oscan = scan;
3753             struct regnode_charclass_class this_class;
3754             struct regnode_charclass_class *oclass = NULL;
3755             I32 next_is_eval = 0;
3756
3757             switch (PL_regkind[OP(scan)]) {
3758             case WHILEM:                /* End of (?:...)* . */
3759                 scan = NEXTOPER(scan);
3760                 goto finish;
3761             case PLUS:
3762                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3763                     next = NEXTOPER(scan);
3764                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3765                         mincount = 1;
3766                         maxcount = REG_INFTY;
3767                         next = regnext(scan);
3768                         scan = NEXTOPER(scan);
3769                         goto do_curly;
3770                     }
3771                 }
3772                 if (flags & SCF_DO_SUBSTR)
3773                     data->pos_min++;
3774                 min++;
3775                 /* Fall through. */
3776             case STAR:
3777                 if (flags & SCF_DO_STCLASS) {
3778                     mincount = 0;
3779                     maxcount = REG_INFTY;
3780                     next = regnext(scan);
3781                     scan = NEXTOPER(scan);
3782                     goto do_curly;
3783                 }
3784                 is_inf = is_inf_internal = 1;
3785                 scan = regnext(scan);
3786                 if (flags & SCF_DO_SUBSTR) {
3787                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3788                     data->longest = &(data->longest_float);
3789                 }
3790                 goto optimize_curly_tail;
3791             case CURLY:
3792                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3793                     && (scan->flags == stopparen))
3794                 {
3795                     mincount = 1;
3796                     maxcount = 1;
3797                 } else {
3798                     mincount = ARG1(scan);
3799                     maxcount = ARG2(scan);
3800                 }
3801                 next = regnext(scan);
3802                 if (OP(scan) == CURLYX) {
3803                     I32 lp = (data ? *(data->last_closep) : 0);
3804                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3805                 }
3806                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3807                 next_is_eval = (OP(scan) == EVAL);
3808               do_curly:
3809                 if (flags & SCF_DO_SUBSTR) {
3810                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3811                     pos_before = data->pos_min;
3812                 }
3813                 if (data) {
3814                     fl = data->flags;
3815                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3816                     if (is_inf)
3817                         data->flags |= SF_IS_INF;
3818                 }
3819                 if (flags & SCF_DO_STCLASS) {
3820                     cl_init(pRExC_state, &this_class);
3821                     oclass = data->start_class;
3822                     data->start_class = &this_class;
3823                     f |= SCF_DO_STCLASS_AND;
3824                     f &= ~SCF_DO_STCLASS_OR;
3825                 }
3826                 /* Exclude from super-linear cache processing any {n,m}
3827                    regops for which the combination of input pos and regex
3828                    pos is not enough information to determine if a match
3829                    will be possible.
3830
3831                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3832                    regex pos at the \s*, the prospects for a match depend not
3833                    only on the input position but also on how many (bar\s*)
3834                    repeats into the {4,8} we are. */
3835                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3836                     f &= ~SCF_WHILEM_VISITED_POS;
3837
3838                 /* This will finish on WHILEM, setting scan, or on NULL: */
3839                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3840                                       last, data, stopparen, recursed, NULL,
3841                                       (mincount == 0
3842                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3843
3844                 if (flags & SCF_DO_STCLASS)
3845                     data->start_class = oclass;
3846                 if (mincount == 0 || minnext == 0) {
3847                     if (flags & SCF_DO_STCLASS_OR) {
3848                         cl_or(pRExC_state, data->start_class, &this_class);
3849                     }
3850                     else if (flags & SCF_DO_STCLASS_AND) {
3851                         /* Switch to OR mode: cache the old value of
3852                          * data->start_class */
3853                         INIT_AND_WITHP;
3854                         StructCopy(data->start_class, and_withp,
3855                                    struct regnode_charclass_class);
3856                         flags &= ~SCF_DO_STCLASS_AND;
3857                         StructCopy(&this_class, data->start_class,
3858                                    struct regnode_charclass_class);
3859                         flags |= SCF_DO_STCLASS_OR;
3860                         SET_SSC_EOS(data->start_class);
3861                     }
3862                 } else {                /* Non-zero len */
3863                     if (flags & SCF_DO_STCLASS_OR) {
3864                         cl_or(pRExC_state, data->start_class, &this_class);
3865                         cl_and(data->start_class, and_withp);
3866                     }
3867                     else if (flags & SCF_DO_STCLASS_AND)
3868                         cl_and(data->start_class, &this_class);
3869                     flags &= ~SCF_DO_STCLASS;
3870                 }
3871                 if (!scan)              /* It was not CURLYX, but CURLY. */
3872                     scan = next;
3873                 if ( /* ? quantifier ok, except for (?{ ... }) */
3874                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3875                     && (minnext == 0) && (deltanext == 0)
3876                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3877                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3878                 {
3879                     /* Fatal warnings may leak the regexp without this: */
3880                     SAVEFREESV(RExC_rx_sv);
3881                     ckWARNreg(RExC_parse,
3882                               "Quantifier unexpected on zero-length expression");
3883                     (void)ReREFCNT_inc(RExC_rx_sv);
3884                 }
3885
3886                 min += minnext * mincount;
3887                 is_inf_internal |= ((maxcount == REG_INFTY
3888                                      && (minnext + deltanext) > 0)
3889                                     || deltanext == I32_MAX);
3890                 is_inf |= is_inf_internal;
3891                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3892
3893                 /* Try powerful optimization CURLYX => CURLYN. */
3894                 if (  OP(oscan) == CURLYX && data
3895                       && data->flags & SF_IN_PAR
3896                       && !(data->flags & SF_HAS_EVAL)
3897                       && !deltanext && minnext == 1 ) {
3898                     /* Try to optimize to CURLYN.  */
3899                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3900                     regnode * const nxt1 = nxt;
3901 #ifdef DEBUGGING
3902                     regnode *nxt2;
3903 #endif
3904
3905                     /* Skip open. */
3906                     nxt = regnext(nxt);
3907                     if (!REGNODE_SIMPLE(OP(nxt))
3908                         && !(PL_regkind[OP(nxt)] == EXACT
3909                              && STR_LEN(nxt) == 1))
3910                         goto nogo;
3911 #ifdef DEBUGGING
3912                     nxt2 = nxt;
3913 #endif
3914                     nxt = regnext(nxt);
3915                     if (OP(nxt) != CLOSE)
3916                         goto nogo;
3917                     if (RExC_open_parens) {
3918                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3919                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3920                     }
3921                     /* Now we know that nxt2 is the only contents: */
3922                     oscan->flags = (U8)ARG(nxt);
3923                     OP(oscan) = CURLYN;
3924                     OP(nxt1) = NOTHING; /* was OPEN. */
3925
3926 #ifdef DEBUGGING
3927                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3928                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3929                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3930                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3931                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3932                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3933 #endif
3934                 }
3935               nogo:
3936
3937                 /* Try optimization CURLYX => CURLYM. */
3938                 if (  OP(oscan) == CURLYX && data
3939                       && !(data->flags & SF_HAS_PAR)
3940                       && !(data->flags & SF_HAS_EVAL)
3941                       && !deltanext     /* atom is fixed width */
3942                       && minnext != 0   /* CURLYM can't handle zero width */
3943                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3944                 ) {
3945                     /* XXXX How to optimize if data == 0? */
3946                     /* Optimize to a simpler form.  */
3947                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3948                     regnode *nxt2;
3949
3950                     OP(oscan) = CURLYM;
3951                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3952                             && (OP(nxt2) != WHILEM))
3953                         nxt = nxt2;
3954                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3955                     /* Need to optimize away parenths. */
3956                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3957                         /* Set the parenth number.  */
3958                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3959
3960                         oscan->flags = (U8)ARG(nxt);
3961                         if (RExC_open_parens) {
3962                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3963                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3964                         }
3965                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3966                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3967
3968 #ifdef DEBUGGING
3969                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3970                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3971                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3972                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3973 #endif
3974 #if 0
3975                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3976                             regnode *nnxt = regnext(nxt1);
3977                             if (nnxt == nxt) {
3978                                 if (reg_off_by_arg[OP(nxt1)])
3979                                     ARG_SET(nxt1, nxt2 - nxt1);
3980                                 else if (nxt2 - nxt1 < U16_MAX)
3981                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3982                                 else
3983                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3984                             }
3985                             nxt1 = nnxt;
3986                         }
3987 #endif
3988                         /* Optimize again: */
3989                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3990                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3991                     }
3992                     else
3993                         oscan->flags = 0;
3994                 }
3995                 else if ((OP(oscan) == CURLYX)
3996                          && (flags & SCF_WHILEM_VISITED_POS)
3997                          /* See the comment on a similar expression above.
3998                             However, this time it's not a subexpression
3999                             we care about, but the expression itself. */
4000                          && (maxcount == REG_INFTY)
4001                          && data && ++data->whilem_c < 16) {
4002                     /* This stays as CURLYX, we can put the count/of pair. */
4003                     /* Find WHILEM (as in regexec.c) */
4004                     regnode *nxt = oscan + NEXT_OFF(oscan);
4005
4006                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4007                         nxt += ARG(nxt);
4008                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4009                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4010                 }
4011                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4012                     pars++;
4013                 if (flags & SCF_DO_SUBSTR) {
4014                     SV *last_str = NULL;
4015                     int counted = mincount != 0;
4016
4017                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4018 #if defined(SPARC64_GCC_WORKAROUND)
4019                         I32 b = 0;
4020                         STRLEN l = 0;
4021                         const char *s = NULL;
4022                         I32 old = 0;
4023
4024                         if (pos_before >= data->last_start_min)
4025                             b = pos_before;
4026                         else
4027                             b = data->last_start_min;
4028
4029                         l = 0;
4030                         s = SvPV_const(data->last_found, l);
4031                         old = b - data->last_start_min;
4032
4033 #else
4034                         I32 b = pos_before >= data->last_start_min
4035                             ? pos_before : data->last_start_min;
4036                         STRLEN l;
4037                         const char * const s = SvPV_const(data->last_found, l);
4038                         I32 old = b - data->last_start_min;
4039 #endif
4040
4041                         if (UTF)
4042                             old = utf8_hop((U8*)s, old) - (U8*)s;
4043                         l -= old;
4044                         /* Get the added string: */
4045                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4046                         if (deltanext == 0 && pos_before == b) {
4047                             /* What was added is a constant string */
4048                             if (mincount > 1) {
4049                                 SvGROW(last_str, (mincount * l) + 1);
4050                                 repeatcpy(SvPVX(last_str) + l,
4051                                           SvPVX_const(last_str), l, mincount - 1);
4052                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4053                                 /* Add additional parts. */
4054                                 SvCUR_set(data->last_found,
4055                                           SvCUR(data->last_found) - l);
4056                                 sv_catsv(data->last_found, last_str);
4057                                 {
4058                                     SV * sv = data->last_found;
4059                                     MAGIC *mg =
4060                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4061                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4062                                     if (mg && mg->mg_len >= 0)
4063                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4064                                 }
4065                                 data->last_end += l * (mincount - 1);
4066                             }
4067                         } else {
4068                             /* start offset must point into the last copy */
4069                             data->last_start_min += minnext * (mincount - 1);
4070                             data->last_start_max += is_inf ? I32_MAX
4071                                 : (maxcount - 1) * (minnext + data->pos_delta);
4072                         }
4073                     }
4074                     /* It is counted once already... */
4075                     data->pos_min += minnext * (mincount - counted);
4076                     data->pos_delta += - counted * deltanext +
4077                         (minnext + deltanext) * maxcount - minnext * mincount;
4078                     if (mincount != maxcount) {
4079                          /* Cannot extend fixed substrings found inside
4080                             the group.  */
4081                         SCAN_COMMIT(pRExC_state,data,minlenp);
4082                         if (mincount && last_str) {
4083                             SV * const sv = data->last_found;
4084                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4085                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4086
4087                             if (mg)
4088                                 mg->mg_len = -1;
4089                             sv_setsv(sv, last_str);
4090                             data->last_end = data->pos_min;
4091                             data->last_start_min =
4092                                 data->pos_min - CHR_SVLEN(last_str);
4093                             data->last_start_max = is_inf
4094                                 ? I32_MAX
4095                                 : data->pos_min + data->pos_delta
4096                                 - CHR_SVLEN(last_str);
4097                         }
4098                         data->longest = &(data->longest_float);
4099                     }
4100                     SvREFCNT_dec(last_str);
4101                 }
4102                 if (data && (fl & SF_HAS_EVAL))
4103                     data->flags |= SF_HAS_EVAL;
4104               optimize_curly_tail:
4105                 if (OP(oscan) != CURLYX) {
4106                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4107                            && NEXT_OFF(next))
4108                         NEXT_OFF(oscan) += NEXT_OFF(next);
4109                 }
4110                 continue;
4111             default:                    /* REF, ANYOFV, and CLUMP only? */
4112                 if (flags & SCF_DO_SUBSTR) {
4113                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4114                     data->longest = &(data->longest_float);
4115                 }
4116                 is_inf = is_inf_internal = 1;
4117                 if (flags & SCF_DO_STCLASS_OR)
4118                     cl_anything(pRExC_state, data->start_class);
4119                 flags &= ~SCF_DO_STCLASS;
4120                 break;
4121             }
4122         }
4123         else if (OP(scan) == LNBREAK) {
4124             if (flags & SCF_DO_STCLASS) {
4125                 int value = 0;
4126                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4127                 if (flags & SCF_DO_STCLASS_AND) {
4128                     for (value = 0; value < 256; value++)
4129                         if (!is_VERTWS_cp(value))
4130                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4131                 }
4132                 else {
4133                     for (value = 0; value < 256; value++)
4134                         if (is_VERTWS_cp(value))
4135                             ANYOF_BITMAP_SET(data->start_class, value);
4136                 }
4137                 if (flags & SCF_DO_STCLASS_OR)
4138                     cl_and(data->start_class, and_withp);
4139                 flags &= ~SCF_DO_STCLASS;
4140             }
4141             min++;
4142             delta++;    /* Because of the 2 char string cr-lf */
4143             if (flags & SCF_DO_SUBSTR) {
4144                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4145                 data->pos_min += 1;
4146                 data->pos_delta += 1;
4147                 data->longest = &(data->longest_float);
4148             }
4149         }
4150         else if (REGNODE_SIMPLE(OP(scan))) {
4151             int value = 0;
4152
4153             if (flags & SCF_DO_SUBSTR) {
4154                 SCAN_COMMIT(pRExC_state,data,minlenp);
4155                 data->pos_min++;
4156             }
4157             min++;
4158             if (flags & SCF_DO_STCLASS) {
4159                 int loop_max = 256;
4160                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4161
4162                 /* Some of the logic below assumes that switching
4163                    locale on will only add false positives. */
4164                 switch (PL_regkind[OP(scan)]) {
4165                     U8 classnum;
4166
4167                 case SANY:
4168                 default:
4169 #ifdef DEBUGGING
4170                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4171 #endif
4172                  do_default:
4173                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4174                         cl_anything(pRExC_state, data->start_class);
4175                     break;
4176                 case REG_ANY:
4177                     if (OP(scan) == SANY)
4178                         goto do_default;
4179                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4180                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4181                                 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4182                         cl_anything(pRExC_state, data->start_class);
4183                     }
4184                     if (flags & SCF_DO_STCLASS_AND || !value)
4185                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4186                     break;
4187                 case ANYOF:
4188                     if (flags & SCF_DO_STCLASS_AND)
4189                         cl_and(data->start_class,
4190                                (struct regnode_charclass_class*)scan);
4191                     else
4192                         cl_or(pRExC_state, data->start_class,
4193                               (struct regnode_charclass_class*)scan);
4194                     break;
4195                 case POSIXA:
4196                     loop_max = 128;
4197                     /* FALL THROUGH */
4198                 case POSIXL:
4199                 case POSIXD:
4200                 case POSIXU:
4201                     classnum = FLAGS(scan);
4202                     if (flags & SCF_DO_STCLASS_AND) {
4203                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4204                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4205                             for (value = 0; value < loop_max; value++) {
4206                                 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4207                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4208                                 }
4209                             }
4210                         }
4211                     }
4212                     else {
4213                         if (data->start_class->flags & ANYOF_LOCALE) {
4214                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4215                         }
4216                         else {
4217
4218                         /* Even if under locale, set the bits for non-locale
4219                          * in case it isn't a true locale-node.  This will
4220                          * create false positives if it truly is locale */
4221                         for (value = 0; value < loop_max; value++) {
4222                             if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4223                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4224                             }
4225                         }
4226                         }
4227                     }
4228                     break;
4229                 case NPOSIXA:
4230                     loop_max = 128;
4231                     /* FALL THROUGH */
4232                 case NPOSIXL:
4233                 case NPOSIXU:
4234                 case NPOSIXD:
4235                     classnum = FLAGS(scan);
4236                     if (flags & SCF_DO_STCLASS_AND) {
4237                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4238                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4239                             for (value = 0; value < loop_max; value++) {
4240                                 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4241                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4242                                 }
4243                             }
4244                         }
4245                     }
4246                     else {
4247                         if (data->start_class->flags & ANYOF_LOCALE) {
4248                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4249                         }
4250                         else {
4251
4252                         /* Even if under locale, set the bits for non-locale in
4253                          * case it isn't a true locale-node.  This will create
4254                          * false positives if it truly is locale */
4255                         for (value = 0; value < loop_max; value++) {
4256                             if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4257                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4258                             }
4259                         }
4260                         if (PL_regkind[OP(scan)] == NPOSIXD) {
4261                             data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4262                         }
4263                         }
4264                     }
4265                     break;
4266                 }
4267                 if (flags & SCF_DO_STCLASS_OR)
4268                     cl_and(data->start_class, and_withp);
4269                 flags &= ~SCF_DO_STCLASS;
4270             }
4271         }
4272         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4273             data->flags |= (OP(scan) == MEOL
4274                             ? SF_BEFORE_MEOL
4275                             : SF_BEFORE_SEOL);
4276             SCAN_COMMIT(pRExC_state, data, minlenp);
4277
4278         }
4279         else if (  PL_regkind[OP(scan)] == BRANCHJ
4280                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4281                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4282                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4283             if ( OP(scan) == UNLESSM &&
4284                  scan->flags == 0 &&
4285                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4286                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4287             ) {
4288                 regnode *opt;
4289                 regnode *upto= regnext(scan);
4290                 DEBUG_PARSE_r({
4291                     SV * const mysv_val=sv_newmortal();
4292                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4293
4294                     /*DEBUG_PARSE_MSG("opfail");*/
4295                     regprop(RExC_rx, mysv_val, upto);
4296                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4297                                   SvPV_nolen_const(mysv_val),
4298                                   (IV)REG_NODE_NUM(upto),
4299                                   (IV)(upto - scan)
4300                     );
4301                 });
4302                 OP(scan) = OPFAIL;
4303                 NEXT_OFF(scan) = upto - scan;
4304                 for (opt= scan + 1; opt < upto ; opt++)
4305                     OP(opt) = OPTIMIZED;
4306                 scan= upto;
4307                 continue;
4308             }
4309             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4310                 || OP(scan) == UNLESSM )
4311             {
4312                 /* Negative Lookahead/lookbehind
4313                    In this case we can't do fixed string optimisation.
4314                 */
4315
4316                 I32 deltanext, minnext, fake = 0;
4317                 regnode *nscan;
4318                 struct regnode_charclass_class intrnl;
4319                 int f = 0;
4320
4321                 data_fake.flags = 0;
4322                 if (data) {
4323                     data_fake.whilem_c = data->whilem_c;
4324                     data_fake.last_closep = data->last_closep;
4325                 }
4326                 else
4327                     data_fake.last_closep = &fake;
4328                 data_fake.pos_delta = delta;
4329                 if ( flags & SCF_DO_STCLASS && !scan->flags
4330                      && OP(scan) == IFMATCH ) { /* Lookahead */
4331                     cl_init(pRExC_state, &intrnl);
4332                     data_fake.start_class = &intrnl;
4333                     f |= SCF_DO_STCLASS_AND;
4334                 }
4335                 if (flags & SCF_WHILEM_VISITED_POS)
4336                     f |= SCF_WHILEM_VISITED_POS;
4337                 next = regnext(scan);
4338                 nscan = NEXTOPER(NEXTOPER(scan));
4339                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4340                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4341                 if (scan->flags) {
4342                     if (deltanext) {
4343                         FAIL("Variable length lookbehind not implemented");
4344                     }
4345                     else if (minnext > (I32)U8_MAX) {
4346                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4347                     }
4348                     scan->flags = (U8)minnext;
4349                 }
4350                 if (data) {
4351                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4352                         pars++;
4353                     if (data_fake.flags & SF_HAS_EVAL)
4354                         data->flags |= SF_HAS_EVAL;
4355                     data->whilem_c = data_fake.whilem_c;
4356                 }
4357                 if (f & SCF_DO_STCLASS_AND) {
4358                     if (flags & SCF_DO_STCLASS_OR) {
4359                         /* OR before, AND after: ideally we would recurse with
4360                          * data_fake to get the AND applied by study of the
4361                          * remainder of the pattern, and then derecurse;
4362                          * *** HACK *** for now just treat as "no information".
4363                          * See [perl #56690].
4364                          */
4365                         cl_init(pRExC_state, data->start_class);
4366                     }  else {
4367                         /* AND before and after: combine and continue */
4368                         const int was = TEST_SSC_EOS(data->start_class);
4369
4370                         cl_and(data->start_class, &intrnl);
4371                         if (was)
4372                             SET_SSC_EOS(data->start_class);
4373                     }
4374                 }
4375             }
4376 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4377             else {
4378                 /* Positive Lookahead/lookbehind
4379                    In this case we can do fixed string optimisation,
4380                    but we must be careful about it. Note in the case of
4381                    lookbehind the positions will be offset by the minimum
4382                    length of the pattern, something we won't know about
4383                    until after the recurse.
4384                 */
4385                 I32 deltanext, fake = 0;
4386                 regnode *nscan;
4387                 struct regnode_charclass_class intrnl;
4388                 int f = 0;
4389                 /* We use SAVEFREEPV so that when the full compile
4390                     is finished perl will clean up the allocated
4391                     minlens when it's all done. This way we don't
4392                     have to worry about freeing them when we know
4393                     they wont be used, which would be a pain.
4394                  */
4395                 I32 *minnextp;
4396                 Newx( minnextp, 1, I32 );
4397                 SAVEFREEPV(minnextp);
4398
4399                 if (data) {
4400                     StructCopy(data, &data_fake, scan_data_t);
4401                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4402                         f |= SCF_DO_SUBSTR;
4403                         if (scan->flags)
4404                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4405                         data_fake.last_found=newSVsv(data->last_found);
4406                     }
4407                 }
4408                 else
4409                     data_fake.last_closep = &fake;
4410                 data_fake.flags = 0;
4411                 data_fake.pos_delta = delta;
4412                 if (is_inf)
4413                     data_fake.flags |= SF_IS_INF;
4414                 if ( flags & SCF_DO_STCLASS && !scan->flags
4415                      && OP(scan) == IFMATCH ) { /* Lookahead */
4416                     cl_init(pRExC_state, &intrnl);
4417                     data_fake.start_class = &intrnl;
4418                     f |= SCF_DO_STCLASS_AND;
4419                 }
4420                 if (flags & SCF_WHILEM_VISITED_POS)
4421                     f |= SCF_WHILEM_VISITED_POS;
4422                 next = regnext(scan);
4423                 nscan = NEXTOPER(NEXTOPER(scan));
4424
4425                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4426                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4427                 if (scan->flags) {
4428                     if (deltanext) {
4429                         FAIL("Variable length lookbehind not implemented");
4430                     }
4431                     else if (*minnextp > (I32)U8_MAX) {
4432                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4433                     }
4434                     scan->flags = (U8)*minnextp;
4435                 }
4436
4437                 *minnextp += min;
4438
4439                 if (f & SCF_DO_STCLASS_AND) {
4440                     const int was = TEST_SSC_EOS(data.start_class);
4441
4442                     cl_and(data->start_class, &intrnl);
4443                     if (was)
4444                         SET_SSC_EOS(data->start_class);
4445                 }
4446                 if (data) {
4447                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4448                         pars++;
4449                     if (data_fake.flags & SF_HAS_EVAL)
4450                         data->flags |= SF_HAS_EVAL;
4451                     data->whilem_c = data_fake.whilem_c;
4452                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4453                         if (RExC_rx->minlen<*minnextp)
4454                             RExC_rx->minlen=*minnextp;
4455                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4456                         SvREFCNT_dec_NN(data_fake.last_found);
4457
4458                         if ( data_fake.minlen_fixed != minlenp )
4459                         {
4460                             data->offset_fixed= data_fake.offset_fixed;
4461                             data->minlen_fixed= data_fake.minlen_fixed;
4462                             data->lookbehind_fixed+= scan->flags;
4463                         }
4464                         if ( data_fake.minlen_float != minlenp )
4465                         {
4466                             data->minlen_float= data_fake.minlen_float;
4467                             data->offset_float_min=data_fake.offset_float_min;
4468                             data->offset_float_max=data_fake.offset_float_max;
4469                             data->lookbehind_float+= scan->flags;
4470                         }
4471                     }
4472                 }
4473             }
4474 #endif
4475         }
4476         else if (OP(scan) == OPEN) {
4477             if (stopparen != (I32)ARG(scan))
4478                 pars++;
4479         }
4480         else if (OP(scan) == CLOSE) {
4481             if (stopparen == (I32)ARG(scan)) {
4482                 break;
4483             }
4484             if ((I32)ARG(scan) == is_par) {
4485                 next = regnext(scan);
4486
4487                 if ( next && (OP(next) != WHILEM) && next < last)
4488                     is_par = 0;         /* Disable optimization */
4489             }
4490             if (data)
4491                 *(data->last_closep) = ARG(scan);
4492         }
4493         else if (OP(scan) == EVAL) {
4494                 if (data)
4495                     data->flags |= SF_HAS_EVAL;
4496         }
4497         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4498             if (flags & SCF_DO_SUBSTR) {
4499                 SCAN_COMMIT(pRExC_state,data,minlenp);
4500                 flags &= ~SCF_DO_SUBSTR;
4501             }
4502             if (data && OP(scan)==ACCEPT) {
4503                 data->flags |= SCF_SEEN_ACCEPT;
4504                 if (stopmin > min)
4505                     stopmin = min;
4506             }
4507         }
4508         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4509         {
4510                 if (flags & SCF_DO_SUBSTR) {
4511                     SCAN_COMMIT(pRExC_state,data,minlenp);
4512                     data->longest = &(data->longest_float);
4513                 }
4514                 is_inf = is_inf_internal = 1;
4515                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4516                     cl_anything(pRExC_state, data->start_class);
4517                 flags &= ~SCF_DO_STCLASS;
4518         }
4519         else if (OP(scan) == GPOS) {
4520             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4521                 !(delta || is_inf || (data && data->pos_delta)))
4522             {
4523                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4524                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4525                 if (RExC_rx->gofs < (U32)min)
4526                     RExC_rx->gofs = min;
4527             } else {
4528                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4529                 RExC_rx->gofs = 0;
4530             }
4531         }
4532 #ifdef TRIE_STUDY_OPT
4533 #ifdef FULL_TRIE_STUDY
4534         else if (PL_regkind[OP(scan)] == TRIE) {
4535             /* NOTE - There is similar code to this block above for handling
4536                BRANCH nodes on the initial study.  If you change stuff here
4537                check there too. */
4538             regnode *trie_node= scan;
4539             regnode *tail= regnext(scan);
4540             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4541             I32 max1 = 0, min1 = I32_MAX;
4542             struct regnode_charclass_class accum;
4543
4544             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4545                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4546             if (flags & SCF_DO_STCLASS)
4547                 cl_init_zero(pRExC_state, &accum);
4548
4549             if (!trie->jump) {
4550                 min1= trie->minlen;
4551                 max1= trie->maxlen;
4552             } else {
4553                 const regnode *nextbranch= NULL;
4554                 U32 word;
4555
4556                 for ( word=1 ; word <= trie->wordcount ; word++)
4557                 {
4558                     I32 deltanext=0, minnext=0, f = 0, fake;
4559                     struct regnode_charclass_class this_class;
4560
4561                     data_fake.flags = 0;
4562                     if (data) {
4563                         data_fake.whilem_c = data->whilem_c;
4564                         data_fake.last_closep = data->last_closep;
4565                     }
4566                     else
4567                         data_fake.last_closep = &fake;
4568                     data_fake.pos_delta = delta;
4569                     if (flags & SCF_DO_STCLASS) {
4570                         cl_init(pRExC_state, &this_class);
4571                         data_fake.start_class = &this_class;
4572                         f = SCF_DO_STCLASS_AND;
4573                     }
4574                     if (flags & SCF_WHILEM_VISITED_POS)
4575                         f |= SCF_WHILEM_VISITED_POS;
4576
4577                     if (trie->jump[word]) {
4578                         if (!nextbranch)
4579                             nextbranch = trie_node + trie->jump[0];
4580                         scan= trie_node + trie->jump[word];
4581                         /* We go from the jump point to the branch that follows
4582                            it. Note this means we need the vestigal unused branches
4583                            even though they arent otherwise used.
4584                          */
4585                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4586                             &deltanext, (regnode *)nextbranch, &data_fake,
4587                             stopparen, recursed, NULL, f,depth+1);
4588                     }
4589                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4590                         nextbranch= regnext((regnode*)nextbranch);
4591
4592                     if (min1 > (I32)(minnext + trie->minlen))
4593                         min1 = minnext + trie->minlen;
4594                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4595                         max1 = minnext + deltanext + trie->maxlen;
4596                     if (deltanext == I32_MAX)
4597                         is_inf = is_inf_internal = 1;
4598
4599                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4600                         pars++;
4601                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4602                         if ( stopmin > min + min1)
4603                             stopmin = min + min1;
4604                         flags &= ~SCF_DO_SUBSTR;
4605                         if (data)
4606                             data->flags |= SCF_SEEN_ACCEPT;
4607                     }
4608                     if (data) {
4609                         if (data_fake.flags & SF_HAS_EVAL)
4610                             data->flags |= SF_HAS_EVAL;
4611                         data->whilem_c = data_fake.whilem_c;
4612                     }
4613                     if (flags & SCF_DO_STCLASS)
4614                         cl_or(pRExC_state, &accum, &this_class);
4615                 }
4616             }
4617             if (flags & SCF_DO_SUBSTR) {
4618                 data->pos_min += min1;
4619                 data->pos_delta += max1 - min1;
4620                 if (max1 != min1 || is_inf)
4621                     data->longest = &(data->longest_float);
4622             }
4623             min += min1;
4624             delta += max1 - min1;
4625             if (flags & SCF_DO_STCLASS_OR) {
4626                 cl_or(pRExC_state, data->start_class, &accum);
4627                 if (min1) {
4628                     cl_and(data->start_class, and_withp);
4629                     flags &= ~SCF_DO_STCLASS;
4630                 }
4631             }
4632             else if (flags & SCF_DO_STCLASS_AND) {
4633                 if (min1) {
4634                     cl_and(data->start_class, &accum);
4635                     flags &= ~SCF_DO_STCLASS;
4636                 }
4637                 else {
4638                     /* Switch to OR mode: cache the old value of
4639                      * data->start_class */
4640                     INIT_AND_WITHP;
4641                     StructCopy(data->start_class, and_withp,
4642                                struct regnode_charclass_class);
4643                     flags &= ~SCF_DO_STCLASS_AND;
4644                     StructCopy(&accum, data->start_class,
4645                                struct regnode_charclass_class);
4646                     flags |= SCF_DO_STCLASS_OR;
4647                     SET_SSC_EOS(data->start_class);
4648                 }
4649             }
4650             scan= tail;
4651             continue;
4652         }
4653 #else
4654         else if (PL_regkind[OP(scan)] == TRIE) {
4655             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4656             U8*bang=NULL;
4657
4658             min += trie->minlen;
4659             delta += (trie->maxlen - trie->minlen);
4660             flags &= ~SCF_DO_STCLASS; /* xxx */
4661             if (flags & SCF_DO_SUBSTR) {
4662                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4663                 data->pos_min += trie->minlen;
4664                 data->pos_delta += (trie->maxlen - trie->minlen);
4665                 if (trie->maxlen != trie->minlen)
4666                     data->longest = &(data->longest_float);
4667             }
4668             if (trie->jump) /* no more substrings -- for now /grr*/
4669                 flags &= ~SCF_DO_SUBSTR;
4670         }
4671 #endif /* old or new */
4672 #endif /* TRIE_STUDY_OPT */
4673
4674         /* Else: zero-length, ignore. */
4675         scan = regnext(scan);
4676     }
4677     if (frame) {
4678         last = frame->last;
4679         scan = frame->next;
4680         stopparen = frame->stop;
4681         frame = frame->prev;
4682         goto fake_study_recurse;
4683     }
4684
4685   finish:
4686     assert(!frame);
4687     DEBUG_STUDYDATA("pre-fin:",data,depth);
4688
4689     *scanp = scan;
4690     *deltap = is_inf_internal ? I32_MAX : delta;
4691     if (flags & SCF_DO_SUBSTR && is_inf)
4692         data->pos_delta = I32_MAX - data->pos_min;
4693     if (is_par > (I32)U8_MAX)
4694         is_par = 0;
4695     if (is_par && pars==1 && data) {
4696         data->flags |= SF_IN_PAR;
4697         data->flags &= ~SF_HAS_PAR;
4698     }
4699     else if (pars && data) {
4700         data->flags |= SF_HAS_PAR;
4701         data->flags &= ~SF_IN_PAR;
4702     }
4703     if (flags & SCF_DO_STCLASS_OR)
4704         cl_and(data->start_class, and_withp);
4705     if (flags & SCF_TRIE_RESTUDY)
4706         data->flags |=  SCF_TRIE_RESTUDY;
4707
4708     DEBUG_STUDYDATA("post-fin:",data,depth);
4709
4710     return min < stopmin ? min : stopmin;
4711 }
4712
4713 STATIC U32
4714 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4715 {
4716     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4717
4718     PERL_ARGS_ASSERT_ADD_DATA;
4719
4720     Renewc(RExC_rxi->data,
4721            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4722            char, struct reg_data);
4723     if(count)
4724         Renew(RExC_rxi->data->what, count + n, U8);
4725     else
4726         Newx(RExC_rxi->data->what, n, U8);
4727     RExC_rxi->data->count = count + n;
4728     Copy(s, RExC_rxi->data->what + count, n, U8);
4729     return count;
4730 }
4731
4732 /*XXX: todo make this not included in a non debugging perl */
4733 #ifndef PERL_IN_XSUB_RE
4734 void
4735 Perl_reginitcolors(pTHX)
4736 {
4737     dVAR;
4738     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4739     if (s) {
4740         char *t = savepv(s);
4741         int i = 0;
4742         PL_colors[0] = t;
4743         while (++i < 6) {
4744             t = strchr(t, '\t');
4745             if (t) {
4746                 *t = '\0';
4747                 PL_colors[i] = ++t;
4748             }
4749             else
4750                 PL_colors[i] = t = (char *)"";
4751         }
4752     } else {
4753         int i = 0;
4754         while (i < 6)
4755             PL_colors[i++] = (char *)"";
4756     }
4757     PL_colorset = 1;
4758 }
4759 #endif
4760
4761
4762 #ifdef TRIE_STUDY_OPT
4763 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4764     STMT_START {                                            \
4765         if (                                                \
4766               (data.flags & SCF_TRIE_RESTUDY)               \
4767               && ! restudied++                              \
4768         ) {                                                 \
4769             dOsomething;                                    \
4770             goto reStudy;                                   \
4771         }                                                   \
4772     } STMT_END
4773 #else
4774 #define CHECK_RESTUDY_GOTO_butfirst
4775 #endif
4776
4777 /*
4778  * pregcomp - compile a regular expression into internal code
4779  *
4780  * Decides which engine's compiler to call based on the hint currently in
4781  * scope
4782  */
4783
4784 #ifndef PERL_IN_XSUB_RE
4785
4786 /* return the currently in-scope regex engine (or the default if none)  */
4787
4788 regexp_engine const *
4789 Perl_current_re_engine(pTHX)
4790 {
4791     dVAR;
4792
4793     if (IN_PERL_COMPILETIME) {
4794         HV * const table = GvHV(PL_hintgv);
4795         SV **ptr;
4796
4797         if (!table)
4798             return &PL_core_reg_engine;
4799         ptr = hv_fetchs(table, "regcomp", FALSE);
4800         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4801             return &PL_core_reg_engine;
4802         return INT2PTR(regexp_engine*,SvIV(*ptr));
4803     }
4804     else {
4805         SV *ptr;
4806         if (!PL_curcop->cop_hints_hash)
4807             return &PL_core_reg_engine;
4808         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4809         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4810             return &PL_core_reg_engine;
4811         return INT2PTR(regexp_engine*,SvIV(ptr));
4812     }
4813 }
4814
4815
4816 REGEXP *
4817 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4818 {
4819     dVAR;
4820     regexp_engine const *eng = current_re_engine();
4821     GET_RE_DEBUG_FLAGS_DECL;
4822
4823     PERL_ARGS_ASSERT_PREGCOMP;
4824
4825     /* Dispatch a request to compile a regexp to correct regexp engine. */
4826     DEBUG_COMPILE_r({
4827         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4828                         PTR2UV(eng));
4829     });
4830     return CALLREGCOMP_ENG(eng, pattern, flags);
4831 }
4832 #endif
4833
4834 /* public(ish) entry point for the perl core's own regex compiling code.
4835  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4836  * pattern rather than a list of OPs, and uses the internal engine rather
4837  * than the current one */
4838
4839 REGEXP *
4840 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4841 {
4842     SV *pat = pattern; /* defeat constness! */
4843     PERL_ARGS_ASSERT_RE_COMPILE;
4844     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4845 #ifdef PERL_IN_XSUB_RE
4846                                 &my_reg_engine,
4847 #else
4848                                 &PL_core_reg_engine,
4849 #endif
4850                                 NULL, NULL, rx_flags, 0);
4851 }
4852
4853 /* see if there are any run-time code blocks in the pattern.
4854  * False positives are allowed */
4855
4856 static bool
4857 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4858                     U32 pm_flags, char *pat, STRLEN plen)
4859 {
4860     int n = 0;
4861     STRLEN s;
4862
4863     /* avoid infinitely recursing when we recompile the pattern parcelled up
4864      * as qr'...'. A single constant qr// string can't have have any
4865      * run-time component in it, and thus, no runtime code. (A non-qr
4866      * string, however, can, e.g. $x =~ '(?{})') */
4867     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4868         return 0;
4869
4870     for (s = 0; s < plen; s++) {
4871         if (n < pRExC_state->num_code_blocks
4872             && s == pRExC_state->code_blocks[n].start)
4873         {
4874             s = pRExC_state->code_blocks[n].end;
4875             n++;
4876             continue;
4877         }
4878         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4879          * positives here */
4880         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
4881             (pat[s+2] == '{'
4882                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
4883         )
4884             return 1;
4885     }
4886     return 0;
4887 }
4888
4889 /* Handle run-time code blocks. We will already have compiled any direct
4890  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4891  * copy of it, but with any literal code blocks blanked out and
4892  * appropriate chars escaped; then feed it into
4893  *
4894  *    eval "qr'modified_pattern'"
4895  *
4896  * For example,
4897  *
4898  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4899  *
4900  * becomes
4901  *
4902  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4903  *
4904  * After eval_sv()-ing that, grab any new code blocks from the returned qr
4905  * and merge them with any code blocks of the original regexp.
4906  *
4907  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4908  * instead, just save the qr and return FALSE; this tells our caller that
4909  * the original pattern needs upgrading to utf8.
4910  */
4911
4912 static bool
4913 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4914     char *pat, STRLEN plen)
4915 {
4916     SV *qr;
4917
4918     GET_RE_DEBUG_FLAGS_DECL;
4919
4920     if (pRExC_state->runtime_code_qr) {
4921         /* this is the second time we've been called; this should
4922          * only happen if the main pattern got upgraded to utf8
4923          * during compilation; re-use the qr we compiled first time
4924          * round (which should be utf8 too)
4925          */
4926         qr = pRExC_state->runtime_code_qr;
4927         pRExC_state->runtime_code_qr = NULL;
4928         assert(RExC_utf8 && SvUTF8(qr));
4929     }
4930     else {
4931         int n = 0;
4932         STRLEN s;
4933         char *p, *newpat;
4934         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4935         SV *sv, *qr_ref;
4936         dSP;
4937
4938         /* determine how many extra chars we need for ' and \ escaping */
4939         for (s = 0; s < plen; s++) {
4940             if (pat[s] == '\'' || pat[s] == '\\')
4941                 newlen++;
4942         }
4943
4944         Newx(newpat, newlen, char);
4945         p = newpat;
4946         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4947
4948         for (s = 0; s < plen; s++) {
4949             if (n < pRExC_state->num_code_blocks
4950                 && s == pRExC_state->code_blocks[n].start)
4951             {
4952                 /* blank out literal code block */
4953                 assert(pat[s] == '(');
4954                 while (s <= pRExC_state->code_blocks[n].end) {
4955                     *p++ = '_';
4956                     s++;
4957                 }
4958                 s--;
4959                 n++;
4960                 continue;
4961             }
4962             if (pat[s] == '\'' || pat[s] == '\\')
4963                 *p++ = '\\';
4964             *p++ = pat[s];
4965         }
4966         *p++ = '\'';
4967         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4968             *p++ = 'x';
4969         *p++ = '\0';
4970         DEBUG_COMPILE_r({
4971             PerlIO_printf(Perl_debug_log,
4972                 "%sre-parsing pattern for runtime code:%s %s\n",
4973                 PL_colors[4],PL_colors[5],newpat);
4974         });
4975
4976         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
4977         Safefree(newpat);
4978
4979         ENTER;
4980         SAVETMPS;
4981         save_re_context();
4982         PUSHSTACKi(PERLSI_REQUIRE);
4983         /* this causes the toker to collapse \\ into \ when parsing
4984          * qr''; normally only q'' does this. It also alters hints
4985          * handling */
4986         PL_reg_state.re_reparsing = TRUE;
4987         eval_sv(sv, G_SCALAR);
4988         SvREFCNT_dec_NN(sv);
4989         SPAGAIN;
4990         qr_ref = POPs;
4991         PUTBACK;
4992         {
4993             SV * const errsv = ERRSV;
4994             if (SvTRUE_NN(errsv))
4995             {
4996                 Safefree(pRExC_state->code_blocks);
4997                 /* use croak_sv ? */
4998                 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
4999             }
5000         }
5001         assert(SvROK(qr_ref));
5002         qr = SvRV(qr_ref);
5003         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5004         /* the leaving below frees the tmp qr_ref.
5005          * Give qr a life of its own */
5006         SvREFCNT_inc(qr);
5007         POPSTACK;
5008         FREETMPS;
5009         LEAVE;
5010
5011     }
5012
5013     if (!RExC_utf8 && SvUTF8(qr)) {
5014         /* first time through; the pattern got upgraded; save the
5015          * qr for the next time through */
5016         assert(!pRExC_state->runtime_code_qr);
5017         pRExC_state->runtime_code_qr = qr;
5018         return 0;
5019     }
5020
5021
5022     /* extract any code blocks within the returned qr//  */
5023
5024
5025     /* merge the main (r1) and run-time (r2) code blocks into one */
5026     {
5027         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5028         struct reg_code_block *new_block, *dst;
5029         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5030         int i1 = 0, i2 = 0;
5031
5032         if (!r2->num_code_blocks) /* we guessed wrong */
5033         {
5034             SvREFCNT_dec_NN(qr);
5035             return 1;
5036         }
5037
5038         Newx(new_block,
5039             r1->num_code_blocks + r2->num_code_blocks,
5040             struct reg_code_block);
5041         dst = new_block;
5042
5043         while (    i1 < r1->num_code_blocks
5044                 || i2 < r2->num_code_blocks)
5045         {
5046             struct reg_code_block *src;
5047             bool is_qr = 0;
5048
5049             if (i1 == r1->num_code_blocks) {
5050                 src = &r2->code_blocks[i2++];
5051                 is_qr = 1;
5052             }
5053             else if (i2 == r2->num_code_blocks)
5054                 src = &r1->code_blocks[i1++];
5055             else if (  r1->code_blocks[i1].start
5056                      < r2->code_blocks[i2].start)
5057             {
5058                 src = &r1->code_blocks[i1++];
5059                 assert(src->end < r2->code_blocks[i2].start);
5060             }
5061             else {
5062                 assert(  r1->code_blocks[i1].start
5063                        > r2->code_blocks[i2].start);
5064                 src = &r2->code_blocks[i2++];
5065                 is_qr = 1;
5066                 assert(src->end < r1->code_blocks[i1].start);
5067             }
5068
5069             assert(pat[src->start] == '(');
5070             assert(pat[src->end]   == ')');
5071             dst->start      = src->start;
5072             dst->end        = src->end;
5073             dst->block      = src->block;
5074             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5075                                     : src->src_regex;
5076             dst++;
5077         }
5078         r1->num_code_blocks += r2->num_code_blocks;
5079         Safefree(r1->code_blocks);
5080         r1->code_blocks = new_block;
5081     }
5082
5083     SvREFCNT_dec_NN(qr);
5084     return 1;
5085 }
5086
5087
5088 STATIC bool
5089 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5090 {
5091     /* This is the common code for setting up the floating and fixed length
5092      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5093      * as to whether succeeded or not */
5094
5095     I32 t,ml;
5096
5097     if (! (longest_length
5098            || (eol /* Can't have SEOL and MULTI */
5099                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5100           )
5101             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5102         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5103     {
5104         return FALSE;
5105     }
5106
5107     /* copy the information about the longest from the reg_scan_data
5108         over to the program. */
5109     if (SvUTF8(sv_longest)) {
5110         *rx_utf8 = sv_longest;
5111         *rx_substr = NULL;
5112     } else {
5113         *rx_substr = sv_longest;
5114         *rx_utf8 = NULL;
5115     }
5116     /* end_shift is how many chars that must be matched that
5117         follow this item. We calculate it ahead of time as once the
5118         lookbehind offset is added in we lose the ability to correctly
5119         calculate it.*/
5120     ml = minlen ? *(minlen) : (I32)longest_length;
5121     *rx_end_shift = ml - offset
5122         - longest_length + (SvTAIL(sv_longest) != 0)
5123         + lookbehind;
5124
5125     t = (eol/* Can't have SEOL and MULTI */
5126          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5127     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5128
5129     return TRUE;
5130 }
5131
5132 /*
5133  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5134  * regular expression into internal code.
5135  * The pattern may be passed either as:
5136  *    a list of SVs (patternp plus pat_count)
5137  *    a list of OPs (expr)
5138  * If both are passed, the SV list is used, but the OP list indicates
5139  * which SVs are actually pre-compiled code blocks
5140  *
5141  * The SVs in the list have magic and qr overloading applied to them (and
5142  * the list may be modified in-place with replacement SVs in the latter
5143  * case).
5144  *
5145  * If the pattern hasn't changed from old_re, then old_re will be
5146  * returned.
5147  *
5148  * eng is the current engine. If that engine has an op_comp method, then
5149  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5150  * do the initial concatenation of arguments and pass on to the external
5151  * engine.
5152  *
5153  * If is_bare_re is not null, set it to a boolean indicating whether the
5154  * arg list reduced (after overloading) to a single bare regex which has
5155  * been returned (i.e. /$qr/).
5156  *
5157  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5158  *
5159  * pm_flags contains the PMf_* flags, typically based on those from the
5160  * pm_flags field of the related PMOP. Currently we're only interested in
5161  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5162  *
5163  * We can't allocate space until we know how big the compiled form will be,
5164  * but we can't compile it (and thus know how big it is) until we've got a
5165  * place to put the code.  So we cheat:  we compile it twice, once with code
5166  * generation turned off and size counting turned on, and once "for real".
5167  * This also means that we don't allocate space until we are sure that the
5168  * thing really will compile successfully, and we never have to move the
5169  * code and thus invalidate pointers into it.  (Note that it has to be in
5170  * one piece because free() must be able to free it all.) [NB: not true in perl]
5171  *
5172  * Beware that the optimization-preparation code in here knows about some
5173  * of the structure of the compiled regexp.  [I'll say.]
5174  */
5175
5176 REGEXP *
5177 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5178                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5179                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5180 {
5181     dVAR;
5182     REGEXP *rx;
5183     struct regexp *r;
5184     regexp_internal *ri;
5185     STRLEN plen;
5186     char  * VOL exp;
5187     char* xend;
5188     regnode *scan;
5189     I32 flags;
5190     I32 minlen = 0;
5191     U32 rx_flags;
5192     SV * VOL pat;
5193     SV * VOL code_blocksv = NULL;
5194
5195     /* these are all flags - maybe they should be turned
5196      * into a single int with different bit masks */
5197     I32 sawlookahead = 0;
5198     I32 sawplus = 0;
5199     I32 sawopen = 0;
5200     bool used_setjump = FALSE;
5201     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5202     bool code_is_utf8 = 0;
5203     bool VOL recompile = 0;
5204     bool runtime_code = 0;
5205     U8 jump_ret = 0;
5206     dJMPENV;
5207     scan_data_t data;
5208     RExC_state_t RExC_state;
5209     RExC_state_t * const pRExC_state = &RExC_state;
5210 #ifdef TRIE_STUDY_OPT
5211     int restudied;
5212     RExC_state_t copyRExC_state;
5213 #endif
5214     GET_RE_DEBUG_FLAGS_DECL;
5215
5216     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5217
5218     DEBUG_r(if (!PL_colorset) reginitcolors());
5219
5220 #ifndef PERL_IN_XSUB_RE
5221     /* Initialize these here instead of as-needed, as is quick and avoids
5222      * having to test them each time otherwise */
5223     if (! PL_AboveLatin1) {
5224         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5225         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5226         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5227
5228         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5229                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5230         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5231                                 = _new_invlist_C_array(PosixAlnum_invlist);
5232
5233         PL_L1Posix_ptrs[_CC_ALPHA]
5234                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5235         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5236
5237         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5238         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5239
5240         /* Cased is the same as Alpha in the ASCII range */
5241         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5242         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5243
5244         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5245         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5246
5247         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5248         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5249
5250         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5251         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5252
5253         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5254         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5255
5256         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5257         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5258
5259         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5260         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5261
5262         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5263         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5264         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5265         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5266
5267         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5268         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5269
5270         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5271
5272         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5273         PL_L1Posix_ptrs[_CC_WORDCHAR]
5274                                 = _new_invlist_C_array(L1PosixWord_invlist);
5275
5276         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5277         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5278
5279         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5280     }
5281 #endif
5282
5283     pRExC_state->code_blocks = NULL;
5284     pRExC_state->num_code_blocks = 0;
5285
5286     if (is_bare_re)
5287         *is_bare_re = FALSE;
5288
5289     if (expr && (expr->op_type == OP_LIST ||
5290                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5291
5292         /* is the source UTF8, and how many code blocks are there? */
5293         OP *o;
5294         int ncode = 0;
5295
5296         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5297             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5298                 code_is_utf8 = 1;
5299             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5300                 /* count of DO blocks */
5301                 ncode++;
5302         }
5303         if (ncode) {
5304             pRExC_state->num_code_blocks = ncode;
5305             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5306         }
5307     }
5308
5309     if (pat_count) {
5310         /* handle a list of SVs */
5311
5312         SV **svp;
5313
5314         /* apply magic and RE overloading to each arg */
5315         for (svp = patternp; svp < patternp + pat_count; svp++) {
5316             SV *rx = *svp;
5317             SvGETMAGIC(rx);
5318             if (SvROK(rx) && SvAMAGIC(rx)) {
5319                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5320                 if (sv) {
5321                     if (SvROK(sv))
5322                         sv = SvRV(sv);
5323                     if (SvTYPE(sv) != SVt_REGEXP)
5324                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5325                     *svp = sv;
5326                 }
5327             }
5328         }
5329
5330         if (pat_count > 1) {
5331             /* concat multiple args and find any code block indexes */
5332
5333             OP *o = NULL;
5334             int n = 0;
5335             bool utf8 = 0;
5336             STRLEN orig_patlen = 0;
5337
5338             if (pRExC_state->num_code_blocks) {
5339                 o = cLISTOPx(expr)->op_first;
5340                 assert(   o->op_type == OP_PUSHMARK
5341                        || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5342                        || o->op_type == OP_PADRANGE);
5343                 o = o->op_sibling;
5344             }
5345
5346             pat = newSVpvn("", 0);
5347             SAVEFREESV(pat);
5348
5349             /* determine if the pattern is going to be utf8 (needed
5350              * in advance to align code block indices correctly).
5351              * XXX This could fail to be detected for an arg with
5352              * overloading but not concat overloading; but the main effect
5353              * in this obscure case is to need a 'use re eval' for a
5354              * literal code block */
5355             for (svp = patternp; svp < patternp + pat_count; svp++) {
5356                 if (SvUTF8(*svp))
5357                     utf8 = 1;
5358             }
5359             if (utf8)
5360                 SvUTF8_on(pat);
5361
5362             for (svp = patternp; svp < patternp + pat_count; svp++) {
5363                 SV *sv, *msv = *svp;
5364                 SV *rx;
5365                 bool code = 0;
5366                 /* we make the assumption here that each op in the list of
5367                  * op_siblings maps to one SV pushed onto the stack,
5368                  * except for code blocks, with have both an OP_NULL and
5369                  * and OP_CONST.
5370                  * This allows us to match up the list of SVs against the
5371                  * list of OPs to find the next code block.
5372                  *
5373                  * Note that       PUSHMARK PADSV PADSV ..
5374                  * is optimised to
5375                  *                 PADRANGE NULL  NULL  ..
5376                  * so the alignment still works. */
5377                 if (o) {
5378                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5379                         assert(n < pRExC_state->num_code_blocks);
5380                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5381                         pRExC_state->code_blocks[n].block = o;
5382                         pRExC_state->code_blocks[n].src_regex = NULL;
5383                         n++;
5384                         code = 1;
5385                         o = o->op_sibling; /* skip CONST */
5386                         assert(o);
5387                     }
5388                     o = o->op_sibling;;
5389                 }
5390
5391                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5392                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5393                 {
5394                     sv_setsv(pat, sv);
5395                     /* overloading involved: all bets are off over literal
5396                      * code. Pretend we haven't seen it */
5397                     pRExC_state->num_code_blocks -= n;
5398                     n = 0;
5399                     rx = NULL;
5400
5401                 }
5402                 else  {
5403                     while (SvAMAGIC(msv)
5404                             && (sv = AMG_CALLunary(msv, string_amg))
5405                             && sv != msv
5406                             &&  !(   SvROK(msv)
5407                                   && SvROK(sv)
5408                                   && SvRV(msv) == SvRV(sv))
5409                     ) {
5410                         msv = sv;
5411                         SvGETMAGIC(msv);
5412                     }
5413                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5414                         msv = SvRV(msv);
5415                     orig_patlen = SvCUR(pat);
5416                     sv_catsv_nomg(pat, msv);
5417                     rx = msv;
5418                     if (code)
5419                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5420                 }
5421
5422                 /* extract any code blocks within any embedded qr//'s */
5423                 if (rx && SvTYPE(rx) == SVt_REGEXP
5424                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5425                 {
5426
5427                     RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5428                     if (ri->num_code_blocks) {
5429                         int i;
5430                         /* the presence of an embedded qr// with code means
5431                          * we should always recompile: the text of the
5432                          * qr// may not have changed, but it may be a
5433                          * different closure than last time */
5434                         recompile = 1;
5435                         Renew(pRExC_state->code_blocks,
5436                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5437                             struct reg_code_block);
5438                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5439                         for (i=0; i < ri->num_code_blocks; i++) {
5440                             struct reg_code_block *src, *dst;
5441                             STRLEN offset =  orig_patlen
5442                                 + ReANY((REGEXP *)rx)->pre_prefix;
5443                             assert(n < pRExC_state->num_code_blocks);
5444                             src = &ri->code_blocks[i];
5445                             dst = &pRExC_state->code_blocks[n];
5446                             dst->start      = src->start + offset;
5447                             dst->end        = src->end   + offset;
5448                             dst->block      = src->block;
5449                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5450                                                     src->src_regex
5451                                                         ? src->src_regex
5452                                                         : (REGEXP*)rx);
5453                             n++;
5454                         }
5455                     }
5456                 }
5457             }
5458             SvSETMAGIC(pat);
5459         }
5460         else {
5461             SV *sv;
5462             pat = *patternp;
5463             while (SvAMAGIC(pat)
5464                     && (sv = AMG_CALLunary(pat, string_amg))
5465                     && sv != pat)
5466             {
5467                 pat = sv;
5468                 SvGETMAGIC(pat);
5469             }
5470         }
5471
5472         /* handle bare regex: foo =~ $re */
5473         {
5474             SV *re = pat;
5475             if (SvROK(re))
5476                 re = SvRV(re);
5477             if (SvTYPE(re) == SVt_REGEXP) {
5478                 if (is_bare_re)
5479                     *is_bare_re = TRUE;
5480                 SvREFCNT_inc(re);
5481                 Safefree(pRExC_state->code_blocks);
5482                 return (REGEXP*)re;
5483             }
5484         }
5485     }
5486     else {
5487         /* not a list of SVs, so must be a list of OPs */
5488         assert(expr);
5489         if (expr->op_type == OP_LIST) {
5490             int i = -1;
5491             bool is_code = 0;
5492             OP *o;
5493
5494             pat = newSVpvn("", 0);
5495             SAVEFREESV(pat);
5496             if (code_is_utf8)
5497                 SvUTF8_on(pat);
5498
5499             /* given a list of CONSTs and DO blocks in expr, append all
5500              * the CONSTs to pat, and record the start and end of each
5501              * code block in code_blocks[] (each DO{} op is followed by an
5502              * OP_CONST containing the corresponding literal '(?{...})
5503              * text)
5504              */
5505             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5506                 if (o->op_type == OP_CONST) {
5507                     sv_catsv(pat, cSVOPo_sv);
5508                     if (is_code) {
5509                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5510                         is_code = 0;
5511                     }
5512                 }
5513                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5514                     assert(i+1 < pRExC_state->num_code_blocks);
5515                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5516                     pRExC_state->code_blocks[i].block = o;
5517                     pRExC_state->code_blocks[i].src_regex = NULL;
5518                     is_code = 1;
5519                 }
5520             }
5521         }
5522         else {
5523             assert(expr->op_type == OP_CONST);
5524             pat = cSVOPx_sv(expr);
5525         }
5526     }
5527
5528     exp = SvPV_nomg(pat, plen);
5529
5530     if (!eng->op_comp) {
5531         if ((SvUTF8(pat) && IN_BYTES)
5532                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5533         {
5534             /* make a temporary copy; either to convert to bytes,
5535              * or to avoid repeating get-magic / overloaded stringify */
5536             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5537                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5538         }
5539         Safefree(pRExC_state->code_blocks);
5540         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5541     }
5542
5543     /* ignore the utf8ness if the pattern is 0 length */
5544     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5545     RExC_uni_semantics = 0;
5546     RExC_contains_locale = 0;
5547     pRExC_state->runtime_code_qr = NULL;
5548
5549     /****************** LONG JUMP TARGET HERE***********************/
5550     /* Longjmp back to here if have to switch in midstream to utf8 */
5551     if (! RExC_orig_utf8) {
5552         JMPENV_PUSH(jump_ret);
5553         used_setjump = TRUE;
5554     }
5555
5556     if (jump_ret == 0) {    /* First time through */
5557         xend = exp + plen;
5558
5559         DEBUG_COMPILE_r({
5560             SV *dsv= sv_newmortal();
5561             RE_PV_QUOTED_DECL(s, RExC_utf8,
5562                 dsv, exp, plen, 60);
5563             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5564                            PL_colors[4],PL_colors[5],s);
5565         });
5566     }
5567     else {  /* longjumped back */
5568         U8 *src, *dst;
5569         int n=0;
5570         STRLEN s = 0, d = 0;
5571         bool do_end = 0;
5572
5573         /* If the cause for the longjmp was other than changing to utf8, pop
5574          * our own setjmp, and longjmp to the correct handler */
5575         if (jump_ret != UTF8_LONGJMP) {
5576             JMPENV_POP;
5577             JMPENV_JUMP(jump_ret);
5578         }
5579
5580         GET_RE_DEBUG_FLAGS;
5581
5582         /* It's possible to write a regexp in ascii that represents Unicode
5583         codepoints outside of the byte range, such as via \x{100}. If we
5584         detect such a sequence we have to convert the entire pattern to utf8
5585         and then recompile, as our sizing calculation will have been based
5586         on 1 byte == 1 character, but we will need to use utf8 to encode
5587         at least some part of the pattern, and therefore must convert the whole
5588         thing.
5589         -- dmq */
5590         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5591             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5592
5593         /* upgrade pattern to UTF8, and if there are code blocks,
5594          * recalculate the indices.
5595          * This is essentially an unrolled Perl_bytes_to_utf8() */
5596
5597         src = (U8*)SvPV_nomg(pat, plen);
5598         Newx(dst, plen * 2 + 1, U8);
5599
5600         while (s < plen) {
5601             const UV uv = NATIVE_TO_ASCII(src[s]);
5602             if (UNI_IS_INVARIANT(uv))
5603                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5604             else {
5605                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5606                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5607             }
5608             if (n < pRExC_state->num_code_blocks) {
5609                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5610                     pRExC_state->code_blocks[n].start = d;
5611                     assert(dst[d] == '(');
5612                     do_end = 1;
5613                 }
5614                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5615                     pRExC_state->code_blocks[n].end = d;
5616                     assert(dst[d] == ')');
5617                     do_end = 0;
5618                     n++;
5619                 }
5620             }
5621             s++;
5622             d++;
5623         }
5624         dst[d] = '\0';
5625         plen = d;
5626         exp = (char*) dst;
5627         xend = exp + plen;
5628         SAVEFREEPV(exp);
5629         RExC_orig_utf8 = RExC_utf8 = 1;
5630     }
5631
5632     /* return old regex if pattern hasn't changed */
5633
5634     if (   old_re
5635         && !recompile
5636         && !!RX_UTF8(old_re) == !!RExC_utf8
5637         && RX_PRECOMP(old_re)
5638         && RX_PRELEN(old_re) == plen
5639         && memEQ(RX_PRECOMP(old_re), exp, plen))
5640     {
5641         /* with runtime code, always recompile */
5642         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5643                                             exp, plen);
5644         if (!runtime_code) {
5645             if (used_setjump) {
5646                 JMPENV_POP;
5647             }
5648             Safefree(pRExC_state->code_blocks);
5649             return old_re;
5650         }
5651     }
5652     else if ((pm_flags & PMf_USE_RE_EVAL)
5653                 /* this second condition covers the non-regex literal case,
5654                  * i.e.  $foo =~ '(?{})'. */
5655                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5656                     && (PL_hints & HINT_RE_EVAL))
5657     )
5658         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5659                             exp, plen);
5660
5661 #ifdef TRIE_STUDY_OPT
5662     restudied = 0;
5663 #endif
5664
5665     rx_flags = orig_rx_flags;
5666
5667     if (initial_charset == REGEX_LOCALE_CHARSET) {
5668         RExC_contains_locale = 1;
5669     }
5670     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5671
5672         /* Set to use unicode semantics if the pattern is in utf8 and has the
5673          * 'depends' charset specified, as it means unicode when utf8  */
5674         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5675     }
5676
5677     RExC_precomp = exp;
5678     RExC_flags = rx_flags;
5679     RExC_pm_flags = pm_flags;
5680
5681     if (runtime_code) {
5682         if (TAINTING_get && TAINT_get)
5683             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5684
5685         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5686             /* whoops, we have a non-utf8 pattern, whilst run-time code
5687              * got compiled as utf8. Try again with a utf8 pattern */
5688              JMPENV_JUMP(UTF8_LONGJMP);
5689         }
5690     }
5691     assert(!pRExC_state->runtime_code_qr);
5692
5693     RExC_sawback = 0;
5694
5695     RExC_seen = 0;
5696     RExC_in_lookbehind = 0;
5697     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5698     RExC_extralen = 0;
5699     RExC_override_recoding = 0;
5700     RExC_in_multi_char_class = 0;
5701
5702     /* First pass: determine size, legality. */
5703     RExC_parse = exp;
5704     RExC_start = exp;
5705     RExC_end = xend;
5706     RExC_naughty = 0;
5707     RExC_npar = 1;
5708     RExC_nestroot = 0;
5709     RExC_size = 0L;
5710     RExC_emit = &PL_regdummy;
5711     RExC_whilem_seen = 0;
5712     RExC_open_parens = NULL;
5713     RExC_close_parens = NULL;
5714     RExC_opend = NULL;
5715     RExC_paren_names = NULL;
5716 #ifdef DEBUGGING
5717     RExC_paren_name_list = NULL;
5718 #endif
5719     RExC_recurse = NULL;
5720     RExC_recurse_count = 0;
5721     pRExC_state->code_index = 0;
5722
5723 #if 0 /* REGC() is (currently) a NOP at the first pass.
5724        * Clever compilers notice this and complain. --jhi */
5725     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5726 #endif
5727     DEBUG_PARSE_r(
5728         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5729         RExC_lastnum=0;
5730         RExC_lastparse=NULL;
5731     );
5732     /* reg may croak on us, not giving us a chance to free
5733        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5734        need it to survive as long as the regexp (qr/(?{})/).
5735        We must check that code_blocksv is not already set, because we may
5736        have longjmped back. */
5737     if (pRExC_state->code_blocks && !code_blocksv) {
5738         code_blocksv = newSV_type(SVt_PV);
5739         SAVEFREESV(code_blocksv);
5740         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5741         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5742     }
5743     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5744         RExC_precomp = NULL;
5745         return(NULL);
5746     }
5747     if (code_blocksv)
5748         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5749
5750     /* Here, finished first pass.  Get rid of any added setjmp */
5751     if (used_setjump) {
5752         JMPENV_POP;
5753     }
5754
5755     DEBUG_PARSE_r({
5756         PerlIO_printf(Perl_debug_log,
5757             "Required size %"IVdf" nodes\n"
5758             "Starting second pass (creation)\n",
5759             (IV)RExC_size);
5760         RExC_lastnum=0;
5761         RExC_lastparse=NULL;
5762     });
5763
5764     /* The first pass could have found things that force Unicode semantics */
5765     if ((RExC_utf8 || RExC_uni_semantics)
5766          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5767     {
5768         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5769     }
5770
5771     /* Small enough for pointer-storage convention?
5772        If extralen==0, this means that we will not need long jumps. */
5773     if (RExC_size >= 0x10000L && RExC_extralen)
5774         RExC_size += RExC_extralen;
5775     else
5776         RExC_extralen = 0;
5777     if (RExC_whilem_seen > 15)
5778         RExC_whilem_seen = 15;
5779
5780     /* Allocate space and zero-initialize. Note, the two step process
5781        of zeroing when in debug mode, thus anything assigned has to
5782        happen after that */
5783     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5784     r = ReANY(rx);
5785     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5786          char, regexp_internal);
5787     if ( r == NULL || ri == NULL )
5788         FAIL("Regexp out of space");
5789 #ifdef DEBUGGING
5790     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5791     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5792 #else
5793     /* bulk initialize base fields with 0. */
5794     Zero(ri, sizeof(regexp_internal), char);
5795 #endif
5796
5797     /* non-zero initialization begins here */
5798     RXi_SET( r, ri );
5799     r->engine= eng;
5800     r->extflags = rx_flags;
5801     if (pm_flags & PMf_IS_QR) {
5802         ri->code_blocks = pRExC_state->code_blocks;
5803         ri->num_code_blocks = pRExC_state->num_code_blocks;
5804     }
5805     else
5806     {
5807         int n;
5808         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5809             if (pRExC_state->code_blocks[n].src_regex)
5810                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5811         SAVEFREEPV(pRExC_state->code_blocks);
5812     }
5813
5814     {
5815         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5816         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5817
5818         /* The caret is output if there are any defaults: if not all the STD
5819          * flags are set, or if no character set specifier is needed */
5820         bool has_default =
5821                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5822                     || ! has_charset);
5823         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5824         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5825                             >> RXf_PMf_STD_PMMOD_SHIFT);
5826         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5827         char *p;
5828         /* Allocate for the worst case, which is all the std flags are turned
5829          * on.  If more precision is desired, we could do a population count of
5830          * the flags set.  This could be done with a small lookup table, or by
5831          * shifting, masking and adding, or even, when available, assembly
5832          * language for a machine-language population count.
5833          * We never output a minus, as all those are defaults, so are
5834          * covered by the caret */
5835         const STRLEN wraplen = plen + has_p + has_runon
5836             + has_default       /* If needs a caret */
5837
5838                 /* If needs a character set specifier */
5839             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5840             + (sizeof(STD_PAT_MODS) - 1)
5841             + (sizeof("(?:)") - 1);
5842
5843         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5844         r->xpv_len_u.xpvlenu_pv = p;
5845         if (RExC_utf8)
5846             SvFLAGS(rx) |= SVf_UTF8;
5847         *p++='('; *p++='?';
5848
5849         /* If a default, cover it using the caret */
5850         if (has_default) {
5851             *p++= DEFAULT_PAT_MOD;
5852         }
5853         if (has_charset) {
5854             STRLEN len;
5855             const char* const name = get_regex_charset_name(r->extflags, &len);
5856             Copy(name, p, len, char);
5857             p += len;
5858         }
5859         if (has_p)
5860             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5861         {
5862             char ch;
5863             while((ch = *fptr++)) {
5864                 if(reganch & 1)
5865                     *p++ = ch;
5866                 reganch >>= 1;
5867             }
5868         }
5869
5870         *p++ = ':';
5871         Copy(RExC_precomp, p, plen, char);
5872         assert ((RX_WRAPPED(rx) - p) < 16);
5873         r->pre_prefix = p - RX_WRAPPED(rx);
5874         p += plen;
5875         if (has_runon)
5876             *p++ = '\n';
5877         *p++ = ')';
5878         *p = 0;
5879         SvCUR_set(rx, p - RX_WRAPPED(rx));
5880     }
5881
5882     r->intflags = 0;
5883     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5884
5885     if (RExC_seen & REG_SEEN_RECURSE) {
5886         Newxz(RExC_open_parens, RExC_npar,regnode *);
5887         SAVEFREEPV(RExC_open_parens);
5888         Newxz(RExC_close_parens,RExC_npar,regnode *);
5889         SAVEFREEPV(RExC_close_parens);
5890     }
5891
5892     /* Useful during FAIL. */
5893 #ifdef RE_TRACK_PATTERN_OFFSETS
5894     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5895     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5896                           "%s %"UVuf" bytes for offset annotations.\n",
5897                           ri->u.offsets ? "Got" : "Couldn't get",
5898                           (UV)((2*RExC_size+1) * sizeof(U32))));
5899 #endif
5900     SetProgLen(ri,RExC_size);
5901     RExC_rx_sv = rx;
5902     RExC_rx = r;
5903     RExC_rxi = ri;
5904
5905     /* Second pass: emit code. */
5906     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5907     RExC_pm_flags = pm_flags;
5908     RExC_parse = exp;
5909     RExC_end = xend;
5910     RExC_naughty = 0;
5911     RExC_npar = 1;
5912     RExC_emit_start = ri->program;
5913     RExC_emit = ri->program;
5914     RExC_emit_bound = ri->program + RExC_size + 1;
5915     pRExC_state->code_index = 0;
5916
5917     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5918     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5919         ReREFCNT_dec(rx);
5920         return(NULL);
5921     }
5922     /* XXXX To minimize changes to RE engine we always allocate
5923        3-units-long substrs field. */
5924     Newx(r->substrs, 1, struct reg_substr_data);
5925     if (RExC_recurse_count) {
5926         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5927         SAVEFREEPV(RExC_recurse);
5928     }
5929
5930 reStudy:
5931     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5932     Zero(r->substrs, 1, struct reg_substr_data);
5933
5934 #ifdef TRIE_STUDY_OPT
5935     if (!restudied) {
5936         StructCopy(&zero_scan_data, &data, scan_data_t);
5937         copyRExC_state = RExC_state;
5938     } else {
5939         U32 seen=RExC_seen;
5940         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5941
5942         RExC_state = copyRExC_state;
5943         if (seen & REG_TOP_LEVEL_BRANCHES)
5944             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5945         else
5946             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5947         StructCopy(&zero_scan_data, &data, scan_data_t);
5948     }
5949 #else
5950     StructCopy(&zero_scan_data, &data, scan_data_t);
5951 #endif
5952
5953     /* Dig out information for optimizations. */
5954     r->extflags = RExC_flags; /* was pm_op */
5955     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5956
5957     if (UTF)
5958         SvUTF8_on(rx);  /* Unicode in it? */
5959     ri->regstclass = NULL;
5960     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5961         r->intflags |= PREGf_NAUGHTY;
5962     scan = ri->program + 1;             /* First BRANCH. */
5963
5964     /* testing for BRANCH here tells us whether there is "must appear"
5965        data in the pattern. If there is then we can use it for optimisations */
5966     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5967         I32 fake;
5968         STRLEN longest_float_length, longest_fixed_length;
5969         struct regnode_charclass_class ch_class; /* pointed to by data */
5970         int stclass_flag;
5971         I32 last_close = 0; /* pointed to by data */
5972         regnode *first= scan;
5973         regnode *first_next= regnext(first);
5974         /*
5975          * Skip introductions and multiplicators >= 1
5976          * so that we can extract the 'meat' of the pattern that must
5977          * match in the large if() sequence following.
5978          * NOTE that EXACT is NOT covered here, as it is normally
5979          * picked up by the optimiser separately.
5980          *
5981          * This is unfortunate as the optimiser isnt handling lookahead
5982          * properly currently.
5983          *
5984          */
5985         while ((OP(first) == OPEN && (sawopen = 1)) ||
5986                /* An OR of *one* alternative - should not happen now. */
5987             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5988             /* for now we can't handle lookbehind IFMATCH*/
5989             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5990             (OP(first) == PLUS) ||
5991             (OP(first) == MINMOD) ||
5992                /* An {n,m} with n>0 */
5993             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5994             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5995         {
5996                 /*
5997                  * the only op that could be a regnode is PLUS, all the rest
5998                  * will be regnode_1 or regnode_2.
5999                  *
6000                  */
6001                 if (OP(first) == PLUS)
6002                     sawplus = 1;
6003                 else
6004                     first += regarglen[OP(first)];
6005
6006                 first = NEXTOPER(first);
6007                 first_next= regnext(first);
6008         }
6009
6010         /* Starting-point info. */
6011       again:
6012         DEBUG_PEEP("first:",first,0);
6013         /* Ignore EXACT as we deal with it later. */
6014         if (PL_regkind[OP(first)] == EXACT) {
6015             if (OP(first) == EXACT)
6016                 NOOP;   /* Empty, get anchored substr later. */
6017             else
6018                 ri->regstclass = first;
6019         }
6020 #ifdef TRIE_STCLASS
6021         else if (PL_regkind[OP(first)] == TRIE &&
6022                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6023         {
6024             regnode *trie_op;
6025             /* this can happen only on restudy */
6026             if ( OP(first) == TRIE ) {
6027                 struct regnode_1 *trieop = (struct regnode_1 *)
6028                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6029                 StructCopy(first,trieop,struct regnode_1);
6030                 trie_op=(regnode *)trieop;
6031             } else {
6032                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6033                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6034                 StructCopy(first,trieop,struct regnode_charclass);
6035                 trie_op=(regnode *)trieop;
6036             }
6037             OP(trie_op)+=2;
6038             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6039             ri->regstclass = trie_op;
6040         }
6041 #endif
6042         else if (REGNODE_SIMPLE(OP(first)))
6043             ri->regstclass = first;
6044         else if (PL_regkind[OP(first)] == BOUND ||
6045                  PL_regkind[OP(first)] == NBOUND)
6046             ri->regstclass = first;
6047         else if (PL_regkind[OP(first)] == BOL) {
6048             r->extflags |= (OP(first) == MBOL
6049                            ? RXf_ANCH_MBOL
6050                            : (OP(first) == SBOL
6051                               ? RXf_ANCH_SBOL
6052                               : RXf_ANCH_BOL));
6053             first = NEXTOPER(first);
6054             goto again;
6055         }
6056         else if (OP(first) == GPOS) {
6057             r->extflags |= RXf_ANCH_GPOS;
6058             first = NEXTOPER(first);
6059             goto again;
6060         }
6061         else if ((!sawopen || !RExC_sawback) &&
6062             (OP(first) == STAR &&
6063             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6064             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6065         {
6066             /* turn .* into ^.* with an implied $*=1 */
6067             const int type =
6068                 (OP(NEXTOPER(first)) == REG_ANY)
6069                     ? RXf_ANCH_MBOL
6070                     : RXf_ANCH_SBOL;
6071             r->extflags |= type;
6072             r->intflags |= PREGf_IMPLICIT;
6073             first = NEXTOPER(first);
6074             goto again;
6075         }
6076         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6077             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6078             /* x+ must match at the 1st pos of run of x's */
6079             r->intflags |= PREGf_SKIP;
6080
6081         /* Scan is after the zeroth branch, first is atomic matcher. */
6082 #ifdef TRIE_STUDY_OPT
6083         DEBUG_PARSE_r(
6084             if (!restudied)
6085                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6086                               (IV)(first - scan + 1))
6087         );
6088 #else
6089         DEBUG_PARSE_r(
6090             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6091                 (IV)(first - scan + 1))
6092         );
6093 #endif
6094
6095
6096         /*
6097         * If there's something expensive in the r.e., find the
6098         * longest literal string that must appear and make it the
6099         * regmust.  Resolve ties in favor of later strings, since
6100         * the regstart check works with the beginning of the r.e.
6101         * and avoiding duplication strengthens checking.  Not a
6102         * strong reason, but sufficient in the absence of others.
6103         * [Now we resolve ties in favor of the earlier string if
6104         * it happens that c_offset_min has been invalidated, since the
6105         * earlier string may buy us something the later one won't.]
6106         */
6107
6108         data.longest_fixed = newSVpvs("");
6109         data.longest_float = newSVpvs("");
6110         data.last_found = newSVpvs("");
6111         data.longest = &(data.longest_fixed);
6112         ENTER_with_name("study_chunk");
6113         SAVEFREESV(data.longest_fixed);
6114         SAVEFREESV(data.longest_float);
6115         SAVEFREESV(data.last_found);
6116         first = scan;
6117         if (!ri->regstclass) {
6118             cl_init(pRExC_state, &ch_class);
6119             data.start_class = &ch_class;
6120             stclass_flag = SCF_DO_STCLASS_AND;
6121         } else                          /* XXXX Check for BOUND? */
6122             stclass_flag = 0;
6123         data.last_closep = &last_close;
6124
6125         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6126             &data, -1, NULL, NULL,
6127             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6128
6129
6130         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6131
6132
6133         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6134              && data.last_start_min == 0 && data.last_end > 0
6135              && !RExC_seen_zerolen
6136              && !(RExC_seen & REG_SEEN_VERBARG)
6137              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6138             r->extflags |= RXf_CHECK_ALL;
6139         scan_commit(pRExC_state, &data,&minlen,0);
6140
6141         longest_float_length = CHR_SVLEN(data.longest_float);
6142
6143         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6144                    && data.offset_fixed == data.offset_float_min
6145                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6146             && S_setup_longest (aTHX_ pRExC_state,
6147                                     data.longest_float,
6148                                     &(r->float_utf8),
6149                                     &(r->float_substr),
6150                                     &(r->float_end_shift),
6151                                     data.lookbehind_float,
6152                                     data.offset_float_min,
6153                                     data.minlen_float,
6154                                     longest_float_length,
6155                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6156                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6157         {
6158             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6159             r->float_max_offset = data.offset_float_max;
6160             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6161                 r->float_max_offset -= data.lookbehind_float;
6162             SvREFCNT_inc_simple_void_NN(data.longest_float);
6163         }
6164         else {
6165             r->float_substr = r->float_utf8 = NULL;
6166             longest_float_length = 0;
6167         }
6168
6169         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6170
6171         if (S_setup_longest (aTHX_ pRExC_state,
6172                                 data.longest_fixed,
6173                                 &(r->anchored_utf8),
6174                                 &(r->anchored_substr),
6175                                 &(r->anchored_end_shift),
6176                                 data.lookbehind_fixed,
6177                                 data.offset_fixed,
6178                                 data.minlen_fixed,
6179                                 longest_fixed_length,
6180                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6181                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6182         {
6183             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6184             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6185         }
6186         else {
6187             r->anchored_substr = r->anchored_utf8 = NULL;
6188             longest_fixed_length = 0;
6189         }
6190         LEAVE_with_name("study_chunk");
6191
6192         if (ri->regstclass
6193             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6194             ri->regstclass = NULL;
6195
6196         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6197             && stclass_flag
6198             && ! TEST_SSC_EOS(data.start_class)
6199             && !cl_is_anything(data.start_class))
6200         {
6201             const U32 n = add_data(pRExC_state, 1, "f");
6202             OP(data.start_class) = ANYOF_SYNTHETIC;
6203
6204             Newx(RExC_rxi->data->data[n], 1,
6205                 struct regnode_charclass_class);
6206             StructCopy(data.start_class,
6207                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6208                        struct regnode_charclass_class);
6209             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6210             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6211             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6212                       regprop(r, sv, (regnode*)data.start_class);
6213                       PerlIO_printf(Perl_debug_log,
6214                                     "synthetic stclass \"%s\".\n",
6215                                     SvPVX_const(sv));});
6216         }
6217
6218         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6219         if (longest_fixed_length > longest_float_length) {
6220             r->check_end_shift = r->anchored_end_shift;
6221             r->check_substr = r->anchored_substr;
6222             r->check_utf8 = r->anchored_utf8;
6223             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6224             if (r->extflags & RXf_ANCH_SINGLE)
6225                 r->extflags |= RXf_NOSCAN;
6226         }
6227         else {
6228             r->check_end_shift = r->float_end_shift;
6229             r->check_substr = r->float_substr;
6230             r->check_utf8 = r->float_utf8;
6231             r->check_offset_min = r->float_min_offset;
6232             r->check_offset_max = r->float_max_offset;
6233         }
6234         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6235            This should be changed ASAP!  */
6236         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6237             r->extflags |= RXf_USE_INTUIT;
6238             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6239                 r->extflags |= RXf_INTUIT_TAIL;
6240         }
6241         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6242         if ( (STRLEN)minlen < longest_float_length )
6243             minlen= longest_float_length;
6244         if ( (STRLEN)minlen < longest_fixed_length )
6245             minlen= longest_fixed_length;
6246         */
6247     }
6248     else {
6249         /* Several toplevels. Best we can is to set minlen. */
6250         I32 fake;
6251         struct regnode_charclass_class ch_class;
6252         I32 last_close = 0;
6253
6254         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6255
6256         scan = ri->program + 1;
6257         cl_init(pRExC_state, &ch_class);
6258         data.start_class = &ch_class;
6259         data.last_closep = &last_close;
6260
6261
6262         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6263             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6264
6265         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6266
6267         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6268                 = r->float_substr = r->float_utf8 = NULL;
6269
6270         if (! TEST_SSC_EOS(data.start_class)
6271             && !cl_is_anything(data.start_class))
6272         {
6273             const U32 n = add_data(pRExC_state, 1, "f");
6274             OP(data.start_class) = ANYOF_SYNTHETIC;
6275
6276             Newx(RExC_rxi->data->data[n], 1,
6277                 struct regnode_charclass_class);
6278             StructCopy(data.start_class,
6279                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6280                        struct regnode_charclass_class);
6281             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6282             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6283             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6284                       regprop(r, sv, (regnode*)data.start_class);
6285                       PerlIO_printf(Perl_debug_log,
6286                                     "synthetic stclass \"%s\".\n",
6287                                     SvPVX_const(sv));});
6288         }
6289     }
6290
6291     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6292        the "real" pattern. */
6293     DEBUG_OPTIMISE_r({
6294         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6295                       (IV)minlen, (IV)r->minlen);
6296     });
6297     r->minlenret = minlen;
6298     if (r->minlen < minlen)
6299         r->minlen = minlen;
6300
6301     if (RExC_seen & REG_SEEN_GPOS)
6302         r->extflags |= RXf_GPOS_SEEN;
6303     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6304         r->extflags |= RXf_LOOKBEHIND_SEEN;
6305     if (pRExC_state->num_code_blocks)
6306         r->extflags |= RXf_EVAL_SEEN;
6307     if (RExC_seen & REG_SEEN_CANY)
6308         r->extflags |= RXf_CANY_SEEN;
6309     if (RExC_seen & REG_SEEN_VERBARG)
6310     {
6311         r->intflags |= PREGf_VERBARG_SEEN;
6312         r->extflags |= RXf_MODIFIES_VARS;
6313     }
6314     if (RExC_seen & REG_SEEN_CUTGROUP)
6315         r->intflags |= PREGf_CUTGROUP_SEEN;
6316     if (pm_flags & PMf_USE_RE_EVAL)
6317         r->intflags |= PREGf_USE_RE_EVAL;
6318     if (RExC_paren_names)
6319         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6320     else
6321         RXp_PAREN_NAMES(r) = NULL;
6322
6323 #ifdef STUPID_PATTERN_CHECKS
6324     if (RX_PRELEN(rx) == 0)
6325         r->extflags |= RXf_NULL;
6326     if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6327         r->extflags |= RXf_WHITE;
6328     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6329         r->extflags |= RXf_START_ONLY;
6330 #else
6331     {
6332         regnode *first = ri->program + 1;
6333         U8 fop = OP(first);
6334
6335         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6336             r->extflags |= RXf_NULL;
6337         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6338             r->extflags |= RXf_START_ONLY;
6339         else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6340                              && OP(regnext(first)) == END)
6341             r->extflags |= RXf_WHITE;
6342     }
6343 #endif
6344 #ifdef DEBUGGING
6345     if (RExC_paren_names) {
6346         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6347         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6348     } else
6349 #endif
6350         ri->name_list_idx = 0;
6351
6352     if (RExC_recurse_count) {
6353         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6354             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6355             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6356         }
6357     }
6358     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6359     /* assume we don't need to swap parens around before we match */
6360
6361     DEBUG_DUMP_r({
6362         PerlIO_printf(Perl_debug_log,"Final program:\n");
6363         regdump(r);
6364     });
6365 #ifdef RE_TRACK_PATTERN_OFFSETS
6366     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6367         const U32 len = ri->u.offsets[0];
6368         U32 i;
6369         GET_RE_DEBUG_FLAGS_DECL;
6370         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6371         for (i = 1; i <= len; i++) {
6372             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6373                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6374                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6375             }
6376         PerlIO_printf(Perl_debug_log, "\n");
6377     });
6378 #endif
6379
6380 #ifdef USE_ITHREADS
6381     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6382      * by setting the regexp SV to readonly-only instead. If the
6383      * pattern's been recompiled, the USEDness should remain. */
6384     if (old_re && SvREADONLY(old_re))
6385         SvREADONLY_on(rx);
6386 #endif
6387     return rx;
6388 }
6389
6390
6391 SV*
6392 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6393                     const U32 flags)
6394 {
6395     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6396
6397     PERL_UNUSED_ARG(value);
6398
6399     if (flags & RXapif_FETCH) {
6400         return reg_named_buff_fetch(rx, key, flags);
6401     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6402         Perl_croak_no_modify();
6403         return NULL;
6404     } else if (flags & RXapif_EXISTS) {
6405         return reg_named_buff_exists(rx, key, flags)
6406             ? &PL_sv_yes
6407             : &PL_sv_no;
6408     } else if (flags & RXapif_REGNAMES) {
6409         return reg_named_buff_all(rx, flags);
6410     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6411         return reg_named_buff_scalar(rx, flags);
6412     } else {
6413         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6414         return NULL;
6415     }
6416 }
6417
6418 SV*
6419 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6420                          const U32 flags)
6421 {
6422     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6423     PERL_UNUSED_ARG(lastkey);
6424
6425     if (flags & RXapif_FIRSTKEY)
6426         return reg_named_buff_firstkey(rx, flags);
6427     else if (flags & RXapif_NEXTKEY)
6428         return reg_named_buff_nextkey(rx, flags);
6429     else {
6430         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6431         return NULL;
6432     }
6433 }
6434
6435 SV*
6436 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6437                           const U32 flags)
6438 {
6439     AV *retarray = NULL;
6440     SV *ret;
6441     struct regexp *const rx = ReANY(r);
6442
6443     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6444
6445     if (flags & RXapif_ALL)
6446         retarray=newAV();
6447
6448     if (rx && RXp_PAREN_NAMES(rx)) {
6449         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6450         if (he_str) {
6451             IV i;
6452             SV* sv_dat=HeVAL(he_str);
6453             I32 *nums=(I32*)SvPVX(sv_dat);
6454             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6455                 if ((I32)(rx->nparens) >= nums[i]
6456                     && rx->offs[nums[i]].start != -1
6457                     && rx->offs[nums[i]].end != -1)
6458                 {
6459                     ret = newSVpvs("");
6460                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6461                     if (!retarray)
6462                         return ret;
6463                 } else {
6464                     if (retarray)
6465                         ret = newSVsv(&PL_sv_undef);
6466                 }
6467                 if (retarray)
6468                     av_push(retarray, ret);
6469             }
6470             if (retarray)
6471                 return newRV_noinc(MUTABLE_SV(retarray));
6472         }
6473     }
6474     return NULL;
6475 }
6476
6477 bool
6478 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6479                            const U32 flags)
6480 {
6481     struct regexp *const rx = ReANY(r);
6482
6483     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6484
6485     if (rx && RXp_PAREN_NAMES(rx)) {
6486         if (flags & RXapif_ALL) {
6487             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6488         } else {
6489             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6490             if (sv) {
6491                 SvREFCNT_dec_NN(sv);
6492                 return TRUE;
6493             } else {
6494                 return FALSE;
6495             }
6496         }
6497     } else {
6498         return FALSE;
6499     }
6500 }
6501
6502 SV*
6503 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6504 {
6505     struct regexp *const rx = ReANY(r);
6506
6507     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6508
6509     if ( rx && RXp_PAREN_NAMES(rx) ) {
6510         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6511
6512         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6513     } else {
6514         return FALSE;
6515     }
6516 }
6517
6518 SV*
6519 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6520 {
6521     struct regexp *const rx = ReANY(r);
6522     GET_RE_DEBUG_FLAGS_DECL;
6523
6524     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6525
6526     if (rx && RXp_PAREN_NAMES(rx)) {
6527         HV *hv = RXp_PAREN_NAMES(rx);
6528         HE *temphe;
6529         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6530             IV i;
6531             IV parno = 0;
6532             SV* sv_dat = HeVAL(temphe);
6533             I32 *nums = (I32*)SvPVX(sv_dat);
6534             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6535                 if ((I32)(rx->lastparen) >= nums[i] &&
6536                     rx->offs[nums[i]].start != -1 &&
6537                     rx->offs[nums[i]].end != -1)
6538                 {
6539                     parno = nums[i];
6540                     break;
6541                 }
6542             }
6543             if (parno || flags & RXapif_ALL) {
6544                 return newSVhek(HeKEY_hek(temphe));
6545             }
6546         }
6547     }
6548     return NULL;
6549 }
6550
6551 SV*
6552 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6553 {
6554     SV *ret;
6555     AV *av;
6556     I32 length;
6557     struct regexp *const rx = ReANY(r);
6558
6559     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6560
6561     if (rx && RXp_PAREN_NAMES(rx)) {
6562         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6563             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6564         } else if (flags & RXapif_ONE) {
6565             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6566             av = MUTABLE_AV(SvRV(ret));
6567             length = av_len(av);
6568             SvREFCNT_dec_NN(ret);
6569             return newSViv(length + 1);
6570         } else {
6571             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6572             return NULL;
6573         }
6574     }
6575     return &PL_sv_undef;
6576 }
6577
6578 SV*
6579 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6580 {
6581     struct regexp *const rx = ReANY(r);
6582     AV *av = newAV();
6583
6584     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6585
6586     if (rx && RXp_PAREN_NAMES(rx)) {
6587         HV *hv= RXp_PAREN_NAMES(rx);
6588         HE *temphe;
6589         (void)hv_iterinit(hv);
6590         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6591             IV i;
6592             IV parno = 0;
6593             SV* sv_dat = HeVAL(temphe);
6594             I32 *nums = (I32*)SvPVX(sv_dat);
6595             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6596                 if ((I32)(rx->lastparen) >= nums[i] &&
6597                     rx->offs[nums[i]].start != -1 &&
6598                     rx->offs[nums[i]].end != -1)
6599                 {
6600                     parno = nums[i];
6601                     break;
6602                 }
6603             }
6604             if (parno || flags & RXapif_ALL) {
6605                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6606             }
6607         }
6608     }
6609
6610     return newRV_noinc(MUTABLE_SV(av));
6611 }
6612
6613 void
6614 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6615                              SV * const sv)
6616 {
6617     struct regexp *const rx = ReANY(r);
6618     char *s = NULL;
6619     I32 i = 0;
6620     I32 s1, t1;
6621     I32 n = paren;
6622
6623     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6624
6625     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6626            || n == RX_BUFF_IDX_CARET_FULLMATCH
6627            || n == RX_BUFF_IDX_CARET_POSTMATCH
6628          )
6629          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6630     )
6631         goto ret_undef;
6632
6633     if (!rx->subbeg)
6634         goto ret_undef;
6635
6636     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6637         /* no need to distinguish between them any more */
6638         n = RX_BUFF_IDX_FULLMATCH;
6639
6640     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6641         && rx->offs[0].start != -1)
6642     {
6643         /* $`, ${^PREMATCH} */
6644         i = rx->offs[0].start;
6645         s = rx->subbeg;
6646     }
6647     else
6648     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6649         && rx->offs[0].end != -1)
6650     {
6651         /* $', ${^POSTMATCH} */
6652         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6653         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6654     }
6655     else
6656     if ( 0 <= n && n <= (I32)rx->nparens &&
6657         (s1 = rx->offs[n].start) != -1 &&
6658         (t1 = rx->offs[n].end) != -1)
6659     {
6660         /* $&, ${^MATCH},  $1 ... */
6661         i = t1 - s1;
6662         s = rx->subbeg + s1 - rx->suboffset;
6663     } else {
6664         goto ret_undef;
6665     }
6666
6667     assert(s >= rx->subbeg);
6668     assert(rx->sublen >= (s - rx->subbeg) + i );
6669     if (i >= 0) {
6670 #if NO_TAINT_SUPPORT
6671         sv_setpvn(sv, s, i);
6672 #else
6673         const int oldtainted = TAINT_get;
6674         TAINT_NOT;
6675         sv_setpvn(sv, s, i);
6676         TAINT_set(oldtainted);
6677 #endif
6678         if ( (rx->extflags & RXf_CANY_SEEN)
6679             ? (RXp_MATCH_UTF8(rx)
6680                         && (!i || is_utf8_string((U8*)s, i)))
6681             : (RXp_MATCH_UTF8(rx)) )
6682         {
6683             SvUTF8_on(sv);
6684         }
6685         else
6686             SvUTF8_off(sv);
6687         if (TAINTING_get) {
6688             if (RXp_MATCH_TAINTED(rx)) {
6689                 if (SvTYPE(sv) >= SVt_PVMG) {
6690                     MAGIC* const mg = SvMAGIC(sv);
6691                     MAGIC* mgt;
6692                     TAINT;
6693                     SvMAGIC_set(sv, mg->mg_moremagic);
6694                     SvTAINT(sv);
6695                     if ((mgt = SvMAGIC(sv))) {
6696                         mg->mg_moremagic = mgt;
6697                         SvMAGIC_set(sv, mg);
6698                     }
6699                 } else {
6700                     TAINT;
6701                     SvTAINT(sv);
6702                 }
6703             } else
6704                 SvTAINTED_off(sv);
6705         }
6706     } else {
6707       ret_undef:
6708         sv_setsv(sv,&PL_sv_undef);
6709         return;
6710     }
6711 }
6712
6713 void
6714 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6715                                                          SV const * const value)
6716 {
6717     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6718
6719     PERL_UNUSED_ARG(rx);
6720     PERL_UNUSED_ARG(paren);
6721     PERL_UNUSED_ARG(value);
6722
6723     if (!PL_localizing)
6724         Perl_croak_no_modify();
6725 }
6726
6727 I32
6728 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6729                               const I32 paren)
6730 {
6731     struct regexp *const rx = ReANY(r);
6732     I32 i;
6733     I32 s1, t1;
6734
6735     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6736
6737     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6738     switch (paren) {
6739       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6740          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6741             goto warn_undef;
6742         /*FALLTHROUGH*/
6743
6744       case RX_BUFF_IDX_PREMATCH:       /* $` */
6745         if (rx->offs[0].start != -1) {
6746                         i = rx->offs[0].start;
6747                         if (i > 0) {
6748                                 s1 = 0;
6749                                 t1 = i;
6750                                 goto getlen;
6751                         }
6752             }
6753         return 0;
6754
6755       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6756          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6757             goto warn_undef;
6758       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6759             if (rx->offs[0].end != -1) {
6760                         i = rx->sublen - rx->offs[0].end;
6761                         if (i > 0) {
6762                                 s1 = rx->offs[0].end;
6763                                 t1 = rx->sublen;
6764                                 goto getlen;
6765                         }
6766             }
6767         return 0;
6768
6769       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6770          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6771             goto warn_undef;
6772         /*FALLTHROUGH*/
6773
6774       /* $& / ${^MATCH}, $1, $2, ... */
6775       default:
6776             if (paren <= (I32)rx->nparens &&
6777             (s1 = rx->offs[paren].start) != -1 &&
6778             (t1 = rx->offs[paren].end) != -1)
6779             {
6780             i = t1 - s1;
6781             goto getlen;
6782         } else {
6783           warn_undef:
6784             if (ckWARN(WARN_UNINITIALIZED))
6785                 report_uninit((const SV *)sv);
6786             return 0;
6787         }
6788     }
6789   getlen:
6790     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6791         const char * const s = rx->subbeg - rx->suboffset + s1;
6792         const U8 *ep;
6793         STRLEN el;
6794
6795         i = t1 - s1;
6796         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6797                         i = el;
6798     }
6799     return i;
6800 }
6801
6802 SV*
6803 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6804 {
6805     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6806         PERL_UNUSED_ARG(rx);
6807         if (0)
6808             return NULL;
6809         else
6810             return newSVpvs("Regexp");
6811 }
6812
6813 /* Scans the name of a named buffer from the pattern.
6814  * If flags is REG_RSN_RETURN_NULL returns null.
6815  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6816  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6817  * to the parsed name as looked up in the RExC_paren_names hash.
6818  * If there is an error throws a vFAIL().. type exception.
6819  */
6820
6821 #define REG_RSN_RETURN_NULL    0
6822 #define REG_RSN_RETURN_NAME    1
6823 #define REG_RSN_RETURN_DATA    2
6824
6825 STATIC SV*
6826 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6827 {
6828     char *name_start = RExC_parse;
6829
6830     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6831
6832     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6833          /* skip IDFIRST by using do...while */
6834         if (UTF)
6835             do {
6836                 RExC_parse += UTF8SKIP(RExC_parse);
6837             } while (isWORDCHAR_utf8((U8*)RExC_parse));
6838         else
6839             do {
6840                 RExC_parse++;
6841             } while (isWORDCHAR(*RExC_parse));
6842     } else {
6843         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6844         vFAIL("Group name must start with a non-digit word character");
6845     }
6846     if ( flags ) {
6847         SV* sv_name
6848             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6849                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6850         if ( flags == REG_RSN_RETURN_NAME)
6851             return sv_name;
6852         else if (flags==REG_RSN_RETURN_DATA) {
6853             HE *he_str = NULL;
6854             SV *sv_dat = NULL;
6855             if ( ! sv_name )      /* should not happen*/
6856                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6857             if (RExC_paren_names)
6858                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6859             if ( he_str )
6860                 sv_dat = HeVAL(he_str);
6861             if ( ! sv_dat )
6862                 vFAIL("Reference to nonexistent named group");
6863             return sv_dat;
6864         }
6865         else {
6866             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6867                        (unsigned long) flags);
6868         }
6869         assert(0); /* NOT REACHED */
6870     }
6871     return NULL;
6872 }
6873
6874 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6875     int rem=(int)(RExC_end - RExC_parse);                       \
6876     int cut;                                                    \
6877     int num;                                                    \
6878     int iscut=0;                                                \
6879     if (rem>10) {                                               \
6880         rem=10;                                                 \
6881         iscut=1;                                                \
6882     }                                                           \
6883     cut=10-rem;                                                 \
6884     if (RExC_lastparse!=RExC_parse)                             \
6885         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6886             rem, RExC_parse,                                    \
6887             cut + 4,                                            \
6888             iscut ? "..." : "<"                                 \
6889         );                                                      \
6890     else                                                        \
6891         PerlIO_printf(Perl_debug_log,"%16s","");                \
6892                                                                 \
6893     if (SIZE_ONLY)                                              \
6894        num = RExC_size + 1;                                     \
6895     else                                                        \
6896        num=REG_NODE_NUM(RExC_emit);                             \
6897     if (RExC_lastnum!=num)                                      \
6898        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6899     else                                                        \
6900        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6901     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6902         (int)((depth*2)), "",                                   \
6903         (funcname)                                              \
6904     );                                                          \
6905     RExC_lastnum=num;                                           \
6906     RExC_lastparse=RExC_parse;                                  \
6907 })
6908
6909
6910
6911 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6912     DEBUG_PARSE_MSG((funcname));                            \
6913     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6914 })
6915 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6916     DEBUG_PARSE_MSG((funcname));                            \
6917     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6918 })
6919
6920 /* This section of code defines the inversion list object and its methods.  The
6921  * interfaces are highly subject to change, so as much as possible is static to
6922  * this file.  An inversion list is here implemented as a malloc'd C UV array
6923  * with some added info that is placed as UVs at the beginning in a header
6924  * portion.  An inversion list for Unicode is an array of code points, sorted
6925  * by ordinal number.  The zeroth element is the first code point in the list.
6926  * The 1th element is the first element beyond that not in the list.  In other
6927  * words, the first range is
6928  *  invlist[0]..(invlist[1]-1)
6929  * The other ranges follow.  Thus every element whose index is divisible by two
6930  * marks the beginning of a range that is in the list, and every element not
6931  * divisible by two marks the beginning of a range not in the list.  A single
6932  * element inversion list that contains the single code point N generally
6933  * consists of two elements
6934  *  invlist[0] == N
6935  *  invlist[1] == N+1
6936  * (The exception is when N is the highest representable value on the
6937  * machine, in which case the list containing just it would be a single
6938  * element, itself.  By extension, if the last range in the list extends to
6939  * infinity, then the first element of that range will be in the inversion list
6940  * at a position that is divisible by two, and is the final element in the
6941  * list.)
6942  * Taking the complement (inverting) an inversion list is quite simple, if the
6943  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6944  * This implementation reserves an element at the beginning of each inversion
6945  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
6946  * actual beginning of the list is either that element if 0, or the next one if
6947  * 1.
6948  *
6949  * More about inversion lists can be found in "Unicode Demystified"
6950  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6951  * More will be coming when functionality is added later.
6952  *
6953  * The inversion list data structure is currently implemented as an SV pointing
6954  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6955  * array of UV whose memory management is automatically handled by the existing
6956  * facilities for SV's.
6957  *
6958  * Some of the methods should always be private to the implementation, and some
6959  * should eventually be made public */
6960
6961 /* The header definitions are in F<inline_invlist.c> */
6962 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6963 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6964
6965 #define INVLIST_INITIAL_LEN 10
6966
6967 PERL_STATIC_INLINE UV*
6968 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6969 {
6970     /* Returns a pointer to the first element in the inversion list's array.
6971      * This is called upon initialization of an inversion list.  Where the
6972      * array begins depends on whether the list has the code point U+0000
6973      * in it or not.  The other parameter tells it whether the code that
6974      * follows this call is about to put a 0 in the inversion list or not.
6975      * The first element is either the element with 0, if 0, or the next one,
6976      * if 1 */
6977
6978     UV* zero = get_invlist_zero_addr(invlist);
6979
6980     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6981
6982     /* Must be empty */
6983     assert(! *_get_invlist_len_addr(invlist));
6984
6985     /* 1^1 = 0; 1^0 = 1 */
6986     *zero = 1 ^ will_have_0;
6987     return zero + *zero;
6988 }
6989
6990 PERL_STATIC_INLINE UV*
6991 S_invlist_array(pTHX_ SV* const invlist)
6992 {
6993     /* Returns the pointer to the inversion list's array.  Every time the
6994      * length changes, this needs to be called in case malloc or realloc moved
6995      * it */
6996
6997     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6998
6999     /* Must not be empty.  If these fail, you probably didn't check for <len>
7000      * being non-zero before trying to get the array */
7001     assert(*_get_invlist_len_addr(invlist));
7002     assert(*get_invlist_zero_addr(invlist) == 0
7003            || *get_invlist_zero_addr(invlist) == 1);
7004
7005     /* The array begins either at the element reserved for zero if the
7006      * list contains 0 (that element will be set to 0), or otherwise the next
7007      * element (in which case the reserved element will be set to 1). */
7008     return (UV *) (get_invlist_zero_addr(invlist)
7009                    + *get_invlist_zero_addr(invlist));
7010 }
7011
7012 PERL_STATIC_INLINE void
7013 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7014 {
7015     /* Sets the current number of elements stored in the inversion list */
7016
7017     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7018
7019     *_get_invlist_len_addr(invlist) = len;
7020
7021     assert(len <= SvLEN(invlist));
7022
7023     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7024     /* If the list contains U+0000, that element is part of the header,
7025      * and should not be counted as part of the array.  It will contain
7026      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7027      * subtract:
7028      *  SvCUR_set(invlist,
7029      *            TO_INTERNAL_SIZE(len
7030      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7031      * But, this is only valid if len is not 0.  The consequences of not doing
7032      * this is that the memory allocation code may think that 1 more UV is
7033      * being used than actually is, and so might do an unnecessary grow.  That
7034      * seems worth not bothering to make this the precise amount.
7035      *
7036      * Note that when inverting, SvCUR shouldn't change */
7037 }
7038
7039 PERL_STATIC_INLINE IV*
7040 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7041 {
7042     /* Return the address of the UV that is reserved to hold the cached index
7043      * */
7044
7045     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7046
7047     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7048 }
7049
7050 PERL_STATIC_INLINE IV
7051 S_invlist_previous_index(pTHX_ SV* const invlist)
7052 {
7053     /* Returns cached index of previous search */
7054
7055     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7056
7057     return *get_invlist_previous_index_addr(invlist);
7058 }
7059
7060 PERL_STATIC_INLINE void
7061 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7062 {
7063     /* Caches <index> for later retrieval */
7064
7065     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7066
7067     assert(index == 0 || index < (int) _invlist_len(invlist));
7068
7069     *get_invlist_previous_index_addr(invlist) = index;
7070 }
7071
7072 PERL_STATIC_INLINE UV
7073 S_invlist_max(pTHX_ SV* const invlist)
7074 {
7075     /* Returns the maximum number of elements storable in the inversion list's
7076      * array, without having to realloc() */
7077
7078     PERL_ARGS_ASSERT_INVLIST_MAX;
7079
7080     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7081            ? _invlist_len(invlist)
7082            : FROM_INTERNAL_SIZE(SvLEN(invlist));
7083 }
7084
7085 PERL_STATIC_INLINE UV*
7086 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7087 {
7088     /* Return the address of the UV that is reserved to hold 0 if the inversion
7089      * list contains 0.  This has to be the last element of the heading, as the
7090      * list proper starts with either it if 0, or the next element if not.
7091      * (But we force it to contain either 0 or 1) */
7092
7093     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7094
7095     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7096 }
7097
7098 #ifndef PERL_IN_XSUB_RE
7099 SV*
7100 Perl__new_invlist(pTHX_ IV initial_size)
7101 {
7102
7103     /* Return a pointer to a newly constructed inversion list, with enough
7104      * space to store 'initial_size' elements.  If that number is negative, a
7105      * system default is used instead */
7106
7107     SV* new_list;
7108
7109     if (initial_size < 0) {
7110         initial_size = INVLIST_INITIAL_LEN;
7111     }
7112
7113     /* Allocate the initial space */
7114     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7115     invlist_set_len(new_list, 0);
7116
7117     /* Force iterinit() to be used to get iteration to work */
7118     *get_invlist_iter_addr(new_list) = UV_MAX;
7119
7120     /* This should force a segfault if a method doesn't initialize this
7121      * properly */
7122     *get_invlist_zero_addr(new_list) = UV_MAX;
7123
7124     *get_invlist_previous_index_addr(new_list) = 0;
7125     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7126 #if HEADER_LENGTH != 5
7127 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7128 #endif
7129
7130     return new_list;
7131 }
7132 #endif
7133
7134 STATIC SV*
7135 S__new_invlist_C_array(pTHX_ UV* list)
7136 {
7137     /* Return a pointer to a newly constructed inversion list, initialized to
7138      * point to <list>, which has to be in the exact correct inversion list
7139      * form, including internal fields.  Thus this is a dangerous routine that
7140      * should not be used in the wrong hands */
7141
7142     SV* invlist = newSV_type(SVt_PV);
7143
7144     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7145
7146     SvPV_set(invlist, (char *) list);
7147     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7148                                shouldn't touch it */
7149     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7150
7151     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7152         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7153     }
7154
7155     /* Initialize the iteration pointer.
7156      * XXX This could be done at compile time in charclass_invlists.h, but I
7157      * (khw) am not confident that the suffixes for specifying the C constant
7158      * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7159      * to use 64 bits; might need a Configure probe */
7160     invlist_iterfinish(invlist);
7161
7162     return invlist;
7163 }
7164
7165 STATIC void
7166 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7167 {
7168     /* Grow the maximum size of an inversion list */
7169
7170     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7171
7172     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7173 }
7174
7175 PERL_STATIC_INLINE void
7176 S_invlist_trim(pTHX_ SV* const invlist)
7177 {
7178     PERL_ARGS_ASSERT_INVLIST_TRIM;
7179
7180     /* Change the length of the inversion list to how many entries it currently
7181      * has */
7182
7183     SvPV_shrink_to_cur((SV *) invlist);
7184 }
7185
7186 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7187
7188 STATIC void
7189 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7190 {
7191    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7192     * the end of the inversion list.  The range must be above any existing
7193     * ones. */
7194
7195     UV* array;
7196     UV max = invlist_max(invlist);
7197     UV len = _invlist_len(invlist);
7198
7199     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7200
7201     if (len == 0) { /* Empty lists must be initialized */
7202         array = _invlist_array_init(invlist, start == 0);
7203     }
7204     else {
7205         /* Here, the existing list is non-empty. The current max entry in the
7206          * list is generally the first value not in the set, except when the
7207          * set extends to the end of permissible values, in which case it is
7208          * the first entry in that final set, and so this call is an attempt to
7209          * append out-of-order */
7210
7211         UV final_element = len - 1;
7212         array = invlist_array(invlist);
7213         if (array[final_element] > start
7214             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7215         {
7216             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7217                        array[final_element], start,
7218                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7219         }
7220
7221         /* Here, it is a legal append.  If the new range begins with the first
7222          * value not in the set, it is extending the set, so the new first
7223          * value not in the set is one greater than the newly extended range.
7224          * */
7225         if (array[final_element] == start) {
7226             if (end != UV_MAX) {
7227                 array[final_element] = end + 1;
7228             }
7229             else {
7230                 /* But if the end is the maximum representable on the machine,
7231                  * just let the range that this would extend to have no end */
7232                 invlist_set_len(invlist, len - 1);
7233             }
7234             return;
7235         }
7236     }
7237
7238     /* Here the new range doesn't extend any existing set.  Add it */
7239
7240     len += 2;   /* Includes an element each for the start and end of range */
7241
7242     /* If overflows the existing space, extend, which may cause the array to be
7243      * moved */
7244     if (max < len) {
7245         invlist_extend(invlist, len);
7246         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7247                                            failure in invlist_array() */
7248         array = invlist_array(invlist);
7249     }
7250     else {
7251         invlist_set_len(invlist, len);
7252     }
7253
7254     /* The next item on the list starts the range, the one after that is
7255      * one past the new range.  */
7256     array[len - 2] = start;
7257     if (end != UV_MAX) {
7258         array[len - 1] = end + 1;
7259     }
7260     else {
7261         /* But if the end is the maximum representable on the machine, just let
7262          * the range have no end */
7263         invlist_set_len(invlist, len - 1);
7264     }
7265 }
7266
7267 #ifndef PERL_IN_XSUB_RE
7268
7269 IV
7270 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7271 {
7272     /* Searches the inversion list for the entry that contains the input code
7273      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7274      * return value is the index into the list's array of the range that
7275      * contains <cp> */
7276
7277     IV low = 0;
7278     IV mid;
7279     IV high = _invlist_len(invlist);
7280     const IV highest_element = high - 1;
7281     const UV* array;
7282
7283     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7284
7285     /* If list is empty, return failure. */
7286     if (high == 0) {
7287         return -1;
7288     }
7289
7290     /* (We can't get the array unless we know the list is non-empty) */
7291     array = invlist_array(invlist);
7292
7293     mid = invlist_previous_index(invlist);
7294     assert(mid >=0 && mid <= highest_element);
7295
7296     /* <mid> contains the cache of the result of the previous call to this
7297      * function (0 the first time).  See if this call is for the same result,
7298      * or if it is for mid-1.  This is under the theory that calls to this
7299      * function will often be for related code points that are near each other.
7300      * And benchmarks show that caching gives better results.  We also test
7301      * here if the code point is within the bounds of the list.  These tests
7302      * replace others that would have had to be made anyway to make sure that
7303      * the array bounds were not exceeded, and these give us extra information
7304      * at the same time */
7305     if (cp >= array[mid]) {
7306         if (cp >= array[highest_element]) {
7307             return highest_element;
7308         }
7309
7310         /* Here, array[mid] <= cp < array[highest_element].  This means that
7311          * the final element is not the answer, so can exclude it; it also
7312          * means that <mid> is not the final element, so can refer to 'mid + 1'
7313          * safely */
7314         if (cp < array[mid + 1]) {
7315             return mid;
7316         }
7317         high--;
7318         low = mid + 1;
7319     }
7320     else { /* cp < aray[mid] */
7321         if (cp < array[0]) { /* Fail if outside the array */
7322             return -1;
7323         }
7324         high = mid;
7325         if (cp >= array[mid - 1]) {
7326             goto found_entry;
7327         }
7328     }
7329
7330     /* Binary search.  What we are looking for is <i> such that
7331      *  array[i] <= cp < array[i+1]
7332      * The loop below converges on the i+1.  Note that there may not be an
7333      * (i+1)th element in the array, and things work nonetheless */
7334     while (low < high) {
7335         mid = (low + high) / 2;
7336         assert(mid <= highest_element);
7337         if (array[mid] <= cp) { /* cp >= array[mid] */
7338             low = mid + 1;
7339
7340             /* We could do this extra test to exit the loop early.
7341             if (cp < array[low]) {
7342                 return mid;
7343             }
7344             */
7345         }
7346         else { /* cp < array[mid] */
7347             high = mid;
7348         }
7349     }
7350
7351   found_entry:
7352     high--;
7353     invlist_set_previous_index(invlist, high);
7354     return high;
7355 }
7356
7357 void
7358 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7359 {
7360     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7361      * but is used when the swash has an inversion list.  This makes this much
7362      * faster, as it uses a binary search instead of a linear one.  This is
7363      * intimately tied to that function, and perhaps should be in utf8.c,
7364      * except it is intimately tied to inversion lists as well.  It assumes
7365      * that <swatch> is all 0's on input */
7366
7367     UV current = start;
7368     const IV len = _invlist_len(invlist);
7369     IV i;
7370     const UV * array;
7371
7372     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7373
7374     if (len == 0) { /* Empty inversion list */
7375         return;
7376     }
7377
7378     array = invlist_array(invlist);
7379
7380     /* Find which element it is */
7381     i = _invlist_search(invlist, start);
7382
7383     /* We populate from <start> to <end> */
7384     while (current < end) {
7385         UV upper;
7386
7387         /* The inversion list gives the results for every possible code point
7388          * after the first one in the list.  Only those ranges whose index is
7389          * even are ones that the inversion list matches.  For the odd ones,
7390          * and if the initial code point is not in the list, we have to skip
7391          * forward to the next element */
7392         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7393             i++;
7394             if (i >= len) { /* Finished if beyond the end of the array */
7395                 return;
7396             }
7397             current = array[i];
7398             if (current >= end) {   /* Finished if beyond the end of what we
7399                                        are populating */
7400                 if (LIKELY(end < UV_MAX)) {
7401                     return;
7402                 }
7403
7404                 /* We get here when the upper bound is the maximum
7405                  * representable on the machine, and we are looking for just
7406                  * that code point.  Have to special case it */
7407                 i = len;
7408                 goto join_end_of_list;
7409             }
7410         }
7411         assert(current >= start);
7412
7413         /* The current range ends one below the next one, except don't go past
7414          * <end> */
7415         i++;
7416         upper = (i < len && array[i] < end) ? array[i] : end;
7417
7418         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7419          * for each code point in it */
7420         for (; current < upper; current++) {
7421             const STRLEN offset = (STRLEN)(current - start);
7422             swatch[offset >> 3] |= 1 << (offset & 7);
7423         }
7424
7425     join_end_of_list:
7426
7427         /* Quit if at the end of the list */
7428         if (i >= len) {
7429
7430             /* But first, have to deal with the highest possible code point on
7431              * the platform.  The previous code assumes that <end> is one
7432              * beyond where we want to populate, but that is impossible at the
7433              * platform's infinity, so have to handle it specially */
7434             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7435             {
7436                 const STRLEN offset = (STRLEN)(end - start);
7437                 swatch[offset >> 3] |= 1 << (offset & 7);
7438             }
7439             return;
7440         }
7441
7442         /* Advance to the next range, which will be for code points not in the
7443          * inversion list */
7444         current = array[i];
7445     }
7446
7447     return;
7448 }
7449
7450 void
7451 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7452 {
7453     /* Take the union of two inversion lists and point <output> to it.  *output
7454      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7455      * the reference count to that list will be decremented.  The first list,
7456      * <a>, may be NULL, in which case a copy of the second list is returned.
7457      * If <complement_b> is TRUE, the union is taken of the complement
7458      * (inversion) of <b> instead of b itself.
7459      *
7460      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7461      * Richard Gillam, published by Addison-Wesley, and explained at some
7462      * length there.  The preface says to incorporate its examples into your
7463      * code at your own risk.
7464      *
7465      * The algorithm is like a merge sort.
7466      *
7467      * XXX A potential performance improvement is to keep track as we go along
7468      * if only one of the inputs contributes to the result, meaning the other
7469      * is a subset of that one.  In that case, we can skip the final copy and
7470      * return the larger of the input lists, but then outside code might need
7471      * to keep track of whether to free the input list or not */
7472
7473     UV* array_a;    /* a's array */
7474     UV* array_b;
7475     UV len_a;       /* length of a's array */
7476     UV len_b;
7477
7478     SV* u;                      /* the resulting union */
7479     UV* array_u;
7480     UV len_u;
7481
7482     UV i_a = 0;             /* current index into a's array */
7483     UV i_b = 0;
7484     UV i_u = 0;
7485
7486     /* running count, as explained in the algorithm source book; items are
7487      * stopped accumulating and are output when the count changes to/from 0.
7488      * The count is incremented when we start a range that's in the set, and
7489      * decremented when we start a range that's not in the set.  So its range
7490      * is 0 to 2.  Only when the count is zero is something not in the set.
7491      */
7492     UV count = 0;
7493
7494     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7495     assert(a != b);
7496
7497     /* If either one is empty, the union is the other one */
7498     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7499         if (*output == a) {
7500             if (a != NULL) {
7501                 SvREFCNT_dec_NN(a);
7502             }
7503         }
7504         if (*output != b) {
7505             *output = invlist_clone(b);
7506             if (complement_b) {
7507                 _invlist_invert(*output);
7508             }
7509         } /* else *output already = b; */
7510         return;
7511     }
7512     else if ((len_b = _invlist_len(b)) == 0) {
7513         if (*output == b) {
7514             SvREFCNT_dec_NN(b);
7515         }
7516
7517         /* The complement of an empty list is a list that has everything in it,
7518          * so the union with <a> includes everything too */
7519         if (complement_b) {
7520             if (a == *output) {
7521                 SvREFCNT_dec_NN(a);
7522             }
7523             *output = _new_invlist(1);
7524             _append_range_to_invlist(*output, 0, UV_MAX);
7525         }
7526         else if (*output != a) {
7527             *output = invlist_clone(a);
7528         }
7529         /* else *output already = a; */
7530         return;
7531     }
7532
7533     /* Here both lists exist and are non-empty */
7534     array_a = invlist_array(a);
7535     array_b = invlist_array(b);
7536
7537     /* If are to take the union of 'a' with the complement of b, set it
7538      * up so are looking at b's complement. */
7539     if (complement_b) {
7540
7541         /* To complement, we invert: if the first element is 0, remove it.  To
7542          * do this, we just pretend the array starts one later, and clear the
7543          * flag as we don't have to do anything else later */
7544         if (array_b[0] == 0) {
7545             array_b++;
7546             len_b--;
7547             complement_b = FALSE;
7548         }
7549         else {
7550
7551             /* But if the first element is not zero, we unshift a 0 before the
7552              * array.  The data structure reserves a space for that 0 (which
7553              * should be a '1' right now), so physical shifting is unneeded,
7554              * but temporarily change that element to 0.  Before exiting the
7555              * routine, we must restore the element to '1' */
7556             array_b--;
7557             len_b++;
7558             array_b[0] = 0;
7559         }
7560     }
7561
7562     /* Size the union for the worst case: that the sets are completely
7563      * disjoint */
7564     u = _new_invlist(len_a + len_b);
7565
7566     /* Will contain U+0000 if either component does */
7567     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7568                                       || (len_b > 0 && array_b[0] == 0));
7569
7570     /* Go through each list item by item, stopping when exhausted one of
7571      * them */
7572     while (i_a < len_a && i_b < len_b) {
7573         UV cp;      /* The element to potentially add to the union's array */
7574         bool cp_in_set;   /* is it in the the input list's set or not */
7575
7576         /* We need to take one or the other of the two inputs for the union.
7577          * Since we are merging two sorted lists, we take the smaller of the
7578          * next items.  In case of a tie, we take the one that is in its set
7579          * first.  If we took one not in the set first, it would decrement the
7580          * count, possibly to 0 which would cause it to be output as ending the
7581          * range, and the next time through we would take the same number, and
7582          * output it again as beginning the next range.  By doing it the
7583          * opposite way, there is no possibility that the count will be
7584          * momentarily decremented to 0, and thus the two adjoining ranges will
7585          * be seamlessly merged.  (In a tie and both are in the set or both not
7586          * in the set, it doesn't matter which we take first.) */
7587         if (array_a[i_a] < array_b[i_b]
7588             || (array_a[i_a] == array_b[i_b]
7589                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7590         {
7591             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7592             cp= array_a[i_a++];
7593         }
7594         else {
7595             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7596             cp= array_b[i_b++];
7597         }
7598
7599         /* Here, have chosen which of the two inputs to look at.  Only output
7600          * if the running count changes to/from 0, which marks the
7601          * beginning/end of a range in that's in the set */
7602         if (cp_in_set) {
7603             if (count == 0) {
7604                 array_u[i_u++] = cp;
7605             }
7606             count++;
7607         }
7608         else {
7609             count--;
7610             if (count == 0) {
7611                 array_u[i_u++] = cp;
7612             }
7613         }
7614     }
7615
7616     /* Here, we are finished going through at least one of the lists, which
7617      * means there is something remaining in at most one.  We check if the list
7618      * that hasn't been exhausted is positioned such that we are in the middle
7619      * of a range in its set or not.  (i_a and i_b point to the element beyond
7620      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7621      * is potentially more to output.
7622      * There are four cases:
7623      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7624      *     in the union is entirely from the non-exhausted set.
7625      *  2) Both were in their sets, count is 2.  Nothing further should
7626      *     be output, as everything that remains will be in the exhausted
7627      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7628      *     that
7629      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7630      *     Nothing further should be output because the union includes
7631      *     everything from the exhausted set.  Not decrementing ensures that.
7632      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7633      *     decrementing to 0 insures that we look at the remainder of the
7634      *     non-exhausted set */
7635     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7636         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7637     {
7638         count--;
7639     }
7640
7641     /* The final length is what we've output so far, plus what else is about to
7642      * be output.  (If 'count' is non-zero, then the input list we exhausted
7643      * has everything remaining up to the machine's limit in its set, and hence
7644      * in the union, so there will be no further output. */
7645     len_u = i_u;
7646     if (count == 0) {
7647         /* At most one of the subexpressions will be non-zero */
7648         len_u += (len_a - i_a) + (len_b - i_b);
7649     }
7650
7651     /* Set result to final length, which can change the pointer to array_u, so
7652      * re-find it */
7653     if (len_u != _invlist_len(u)) {
7654         invlist_set_len(u, len_u);
7655         invlist_trim(u);
7656         array_u = invlist_array(u);
7657     }
7658
7659     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7660      * the other) ended with everything above it not in its set.  That means
7661      * that the remaining part of the union is precisely the same as the
7662      * non-exhausted list, so can just copy it unchanged.  (If both list were
7663      * exhausted at the same time, then the operations below will be both 0.)
7664      */
7665     if (count == 0) {
7666         IV copy_count; /* At most one will have a non-zero copy count */
7667         if ((copy_count = len_a - i_a) > 0) {
7668             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7669         }
7670         else if ((copy_count = len_b - i_b) > 0) {
7671             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7672         }
7673     }
7674
7675     /* If we've changed b, restore it */
7676     if (complement_b) {
7677         array_b[0] = 1;
7678     }
7679
7680     /*  We may be removing a reference to one of the inputs */
7681     if (a == *output || b == *output) {
7682         assert(! invlist_is_iterating(*output));
7683         SvREFCNT_dec_NN(*output);
7684     }
7685
7686     *output = u;
7687     return;
7688 }
7689
7690 void
7691 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7692 {
7693     /* Take the intersection of two inversion lists and point <i> to it.  *i
7694      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7695      * the reference count to that list will be decremented.
7696      * If <complement_b> is TRUE, the result will be the intersection of <a>
7697      * and the complement (or inversion) of <b> instead of <b> directly.
7698      *
7699      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7700      * Richard Gillam, published by Addison-Wesley, and explained at some
7701      * length there.  The preface says to incorporate its examples into your
7702      * code at your own risk.  In fact, it had bugs
7703      *
7704      * The algorithm is like a merge sort, and is essentially the same as the
7705      * union above
7706      */
7707
7708     UV* array_a;                /* a's array */
7709     UV* array_b;
7710     UV len_a;   /* length of a's array */
7711     UV len_b;
7712
7713     SV* r;                   /* the resulting intersection */
7714     UV* array_r;
7715     UV len_r;
7716
7717     UV i_a = 0;             /* current index into a's array */
7718     UV i_b = 0;
7719     UV i_r = 0;
7720
7721     /* running count, as explained in the algorithm source book; items are
7722      * stopped accumulating and are output when the count changes to/from 2.
7723      * The count is incremented when we start a range that's in the set, and
7724      * decremented when we start a range that's not in the set.  So its range
7725      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7726      */
7727     UV count = 0;
7728
7729     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7730     assert(a != b);
7731
7732     /* Special case if either one is empty */
7733     len_a = _invlist_len(a);
7734     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7735
7736         if (len_a != 0 && complement_b) {
7737
7738             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7739              * be empty.  Here, also we are using 'b's complement, which hence
7740              * must be every possible code point.  Thus the intersection is
7741              * simply 'a'. */
7742             if (*i != a) {
7743                 *i = invlist_clone(a);
7744
7745                 if (*i == b) {
7746                     SvREFCNT_dec_NN(b);
7747                 }
7748             }
7749             /* else *i is already 'a' */
7750             return;
7751         }
7752
7753         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7754          * intersection must be empty */
7755         if (*i == a) {
7756             SvREFCNT_dec_NN(a);
7757         }
7758         else if (*i == b) {
7759             SvREFCNT_dec_NN(b);
7760         }
7761         *i = _new_invlist(0);
7762         return;
7763     }
7764
7765     /* Here both lists exist and are non-empty */
7766     array_a = invlist_array(a);
7767     array_b = invlist_array(b);
7768
7769     /* If are to take the intersection of 'a' with the complement of b, set it
7770      * up so are looking at b's complement. */
7771     if (complement_b) {
7772
7773         /* To complement, we invert: if the first element is 0, remove it.  To
7774          * do this, we just pretend the array starts one later, and clear the
7775          * flag as we don't have to do anything else later */
7776         if (array_b[0] == 0) {
7777             array_b++;
7778             len_b--;
7779             complement_b = FALSE;
7780         }
7781         else {
7782
7783             /* But if the first element is not zero, we unshift a 0 before the
7784              * array.  The data structure reserves a space for that 0 (which
7785              * should be a '1' right now), so physical shifting is unneeded,
7786              * but temporarily change that element to 0.  Before exiting the
7787              * routine, we must restore the element to '1' */
7788             array_b--;
7789             len_b++;
7790             array_b[0] = 0;
7791         }
7792     }
7793
7794     /* Size the intersection for the worst case: that the intersection ends up
7795      * fragmenting everything to be completely disjoint */
7796     r= _new_invlist(len_a + len_b);
7797
7798     /* Will contain U+0000 iff both components do */
7799     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7800                                      && len_b > 0 && array_b[0] == 0);
7801
7802     /* Go through each list item by item, stopping when exhausted one of
7803      * them */
7804     while (i_a < len_a && i_b < len_b) {
7805         UV cp;      /* The element to potentially add to the intersection's
7806                        array */
7807         bool cp_in_set; /* Is it in the input list's set or not */
7808
7809         /* We need to take one or the other of the two inputs for the
7810          * intersection.  Since we are merging two sorted lists, we take the
7811          * smaller of the next items.  In case of a tie, we take the one that
7812          * is not in its set first (a difference from the union algorithm).  If
7813          * we took one in the set first, it would increment the count, possibly
7814          * to 2 which would cause it to be output as starting a range in the
7815          * intersection, and the next time through we would take that same
7816          * number, and output it again as ending the set.  By doing it the
7817          * opposite of this, there is no possibility that the count will be
7818          * momentarily incremented to 2.  (In a tie and both are in the set or
7819          * both not in the set, it doesn't matter which we take first.) */
7820         if (array_a[i_a] < array_b[i_b]
7821             || (array_a[i_a] == array_b[i_b]
7822                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7823         {
7824             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7825             cp= array_a[i_a++];
7826         }
7827         else {
7828             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7829             cp= array_b[i_b++];
7830         }
7831
7832         /* Here, have chosen which of the two inputs to look at.  Only output
7833          * if the running count changes to/from 2, which marks the
7834          * beginning/end of a range that's in the intersection */
7835         if (cp_in_set) {
7836             count++;
7837             if (count == 2) {
7838                 array_r[i_r++] = cp;
7839             }
7840         }
7841         else {
7842             if (count == 2) {
7843                 array_r[i_r++] = cp;
7844             }
7845             count--;
7846         }
7847     }
7848
7849     /* Here, we are finished going through at least one of the lists, which
7850      * means there is something remaining in at most one.  We check if the list
7851      * that has been exhausted is positioned such that we are in the middle
7852      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7853      * the ones we care about.)  There are four cases:
7854      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7855      *     nothing left in the intersection.
7856      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7857      *     above 2.  What should be output is exactly that which is in the
7858      *     non-exhausted set, as everything it has is also in the intersection
7859      *     set, and everything it doesn't have can't be in the intersection
7860      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7861      *     gets incremented to 2.  Like the previous case, the intersection is
7862      *     everything that remains in the non-exhausted set.
7863      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7864      *     remains 1.  And the intersection has nothing more. */
7865     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7866         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7867     {
7868         count++;
7869     }
7870
7871     /* The final length is what we've output so far plus what else is in the
7872      * intersection.  At most one of the subexpressions below will be non-zero */
7873     len_r = i_r;
7874     if (count >= 2) {
7875         len_r += (len_a - i_a) + (len_b - i_b);
7876     }
7877
7878     /* Set result to final length, which can change the pointer to array_r, so
7879      * re-find it */
7880     if (len_r != _invlist_len(r)) {
7881         invlist_set_len(r, len_r);
7882         invlist_trim(r);
7883         array_r = invlist_array(r);
7884     }
7885
7886     /* Finish outputting any remaining */
7887     if (count >= 2) { /* At most one will have a non-zero copy count */
7888         IV copy_count;
7889         if ((copy_count = len_a - i_a) > 0) {
7890             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7891         }
7892         else if ((copy_count = len_b - i_b) > 0) {
7893             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7894         }
7895     }
7896
7897     /* If we've changed b, restore it */
7898     if (complement_b) {
7899         array_b[0] = 1;
7900     }
7901
7902     /*  We may be removing a reference to one of the inputs */
7903     if (a == *i || b == *i) {
7904         assert(! invlist_is_iterating(*i));
7905         SvREFCNT_dec_NN(*i);
7906     }
7907
7908     *i = r;
7909     return;
7910 }
7911
7912 SV*
7913 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7914 {
7915     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7916      * set.  A pointer to the inversion list is returned.  This may actually be
7917      * a new list, in which case the passed in one has been destroyed.  The
7918      * passed in inversion list can be NULL, in which case a new one is created
7919      * with just the one range in it */
7920
7921     SV* range_invlist;
7922     UV len;
7923
7924     if (invlist == NULL) {
7925         invlist = _new_invlist(2);
7926         len = 0;
7927     }
7928     else {
7929         len = _invlist_len(invlist);
7930     }
7931
7932     /* If comes after the final entry actually in the list, can just append it
7933      * to the end, */
7934     if (len == 0
7935         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
7936             && start >= invlist_array(invlist)[len - 1]))
7937     {
7938         _append_range_to_invlist(invlist, start, end);
7939         return invlist;
7940     }
7941
7942     /* Here, can't just append things, create and return a new inversion list
7943      * which is the union of this range and the existing inversion list */
7944     range_invlist = _new_invlist(2);
7945     _append_range_to_invlist(range_invlist, start, end);
7946
7947     _invlist_union(invlist, range_invlist, &invlist);
7948
7949     /* The temporary can be freed */
7950     SvREFCNT_dec_NN(range_invlist);
7951
7952     return invlist;
7953 }
7954
7955 #endif
7956
7957 PERL_STATIC_INLINE SV*
7958 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7959     return _add_range_to_invlist(invlist, cp, cp);
7960 }
7961
7962 #ifndef PERL_IN_XSUB_RE
7963 void
7964 Perl__invlist_invert(pTHX_ SV* const invlist)
7965 {
7966     /* Complement the input inversion list.  This adds a 0 if the list didn't
7967      * have a zero; removes it otherwise.  As described above, the data
7968      * structure is set up so that this is very efficient */
7969
7970     UV* len_pos = _get_invlist_len_addr(invlist);
7971
7972     PERL_ARGS_ASSERT__INVLIST_INVERT;
7973
7974     assert(! invlist_is_iterating(invlist));
7975
7976     /* The inverse of matching nothing is matching everything */
7977     if (*len_pos == 0) {
7978         _append_range_to_invlist(invlist, 0, UV_MAX);
7979         return;
7980     }
7981
7982     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7983      * zero element was a 0, so it is being removed, so the length decrements
7984      * by 1; and vice-versa.  SvCUR is unaffected */
7985     if (*get_invlist_zero_addr(invlist) ^= 1) {
7986         (*len_pos)--;
7987     }
7988     else {
7989         (*len_pos)++;
7990     }
7991 }
7992
7993 void
7994 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7995 {
7996     /* Complement the input inversion list (which must be a Unicode property,
7997      * all of which don't match above the Unicode maximum code point.)  And
7998      * Perl has chosen to not have the inversion match above that either.  This
7999      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8000      */
8001
8002     UV len;
8003     UV* array;
8004
8005     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8006
8007     _invlist_invert(invlist);
8008
8009     len = _invlist_len(invlist);
8010
8011     if (len != 0) { /* If empty do nothing */
8012         array = invlist_array(invlist);
8013         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8014             /* Add 0x110000.  First, grow if necessary */
8015             len++;
8016             if (invlist_max(invlist) < len) {
8017                 invlist_extend(invlist, len);
8018                 array = invlist_array(invlist);
8019             }
8020             invlist_set_len(invlist, len);
8021             array[len - 1] = PERL_UNICODE_MAX + 1;
8022         }
8023         else {  /* Remove the 0x110000 */
8024             invlist_set_len(invlist, len - 1);
8025         }
8026     }
8027
8028     return;
8029 }
8030 #endif
8031
8032 PERL_STATIC_INLINE SV*
8033 S_invlist_clone(pTHX_ SV* const invlist)
8034 {
8035
8036     /* Return a new inversion list that is a copy of the input one, which is
8037      * unchanged */
8038
8039     /* Need to allocate extra space to accommodate Perl's addition of a
8040      * trailing NUL to SvPV's, since it thinks they are always strings */
8041     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8042     STRLEN length = SvCUR(invlist);
8043
8044     PERL_ARGS_ASSERT_INVLIST_CLONE;
8045
8046     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8047     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8048
8049     return new_invlist;
8050 }
8051
8052 PERL_STATIC_INLINE UV*
8053 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8054 {
8055     /* Return the address of the UV that contains the current iteration
8056      * position */
8057
8058     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8059
8060     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8061 }
8062
8063 PERL_STATIC_INLINE UV*
8064 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8065 {
8066     /* Return the address of the UV that contains the version id. */
8067
8068     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8069
8070     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8071 }
8072
8073 PERL_STATIC_INLINE void
8074 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8075 {
8076     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8077
8078     *get_invlist_iter_addr(invlist) = 0;
8079 }
8080
8081 PERL_STATIC_INLINE void
8082 S_invlist_iterfinish(pTHX_ SV* invlist)
8083 {
8084     /* Terminate iterator for invlist.  This is to catch development errors.
8085      * Any iteration that is interrupted before completed should call this
8086      * function.  Functions that add code points anywhere else but to the end
8087      * of an inversion list assert that they are not in the middle of an
8088      * iteration.  If they were, the addition would make the iteration
8089      * problematical: if the iteration hadn't reached the place where things
8090      * were being added, it would be ok */
8091
8092     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8093
8094     *get_invlist_iter_addr(invlist) = UV_MAX;
8095 }
8096
8097 STATIC bool
8098 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8099 {
8100     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8101      * This call sets in <*start> and <*end>, the next range in <invlist>.
8102      * Returns <TRUE> if successful and the next call will return the next
8103      * range; <FALSE> if was already at the end of the list.  If the latter,
8104      * <*start> and <*end> are unchanged, and the next call to this function
8105      * will start over at the beginning of the list */
8106
8107     UV* pos = get_invlist_iter_addr(invlist);
8108     UV len = _invlist_len(invlist);
8109     UV *array;
8110
8111     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8112
8113     if (*pos >= len) {
8114         *pos = UV_MAX;  /* Force iterinit() to be required next time */
8115         return FALSE;
8116     }
8117
8118     array = invlist_array(invlist);
8119
8120     *start = array[(*pos)++];
8121
8122     if (*pos >= len) {
8123         *end = UV_MAX;
8124     }
8125     else {
8126         *end = array[(*pos)++] - 1;
8127     }
8128
8129     return TRUE;
8130 }
8131
8132 PERL_STATIC_INLINE bool
8133 S_invlist_is_iterating(pTHX_ SV* const invlist)
8134 {
8135     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8136
8137     return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8138 }
8139
8140 PERL_STATIC_INLINE UV
8141 S_invlist_highest(pTHX_ SV* const invlist)
8142 {
8143     /* Returns the highest code point that matches an inversion list.  This API
8144      * has an ambiguity, as it returns 0 under either the highest is actually
8145      * 0, or if the list is empty.  If this distinction matters to you, check
8146      * for emptiness before calling this function */
8147
8148     UV len = _invlist_len(invlist);
8149     UV *array;
8150
8151     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8152
8153     if (len == 0) {
8154         return 0;
8155     }
8156
8157     array = invlist_array(invlist);
8158
8159     /* The last element in the array in the inversion list always starts a
8160      * range that goes to infinity.  That range may be for code points that are
8161      * matched in the inversion list, or it may be for ones that aren't
8162      * matched.  In the latter case, the highest code point in the set is one
8163      * less than the beginning of this range; otherwise it is the final element
8164      * of this range: infinity */
8165     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8166            ? UV_MAX
8167            : array[len - 1] - 1;
8168 }
8169
8170 #ifndef PERL_IN_XSUB_RE
8171 SV *
8172 Perl__invlist_contents(pTHX_ SV* const invlist)
8173 {
8174     /* Get the contents of an inversion list into a string SV so that they can
8175      * be printed out.  It uses the format traditionally done for debug tracing
8176      */
8177
8178     UV start, end;
8179     SV* output = newSVpvs("\n");
8180
8181     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8182
8183     assert(! invlist_is_iterating(invlist));
8184
8185     invlist_iterinit(invlist);
8186     while (invlist_iternext(invlist, &start, &end)) {
8187         if (end == UV_MAX) {
8188             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8189         }
8190         else if (end != start) {
8191             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8192                     start,       end);
8193         }
8194         else {
8195             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8196         }
8197     }
8198
8199     return output;
8200 }
8201 #endif
8202
8203 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8204 void
8205 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8206 {
8207     /* Dumps out the ranges in an inversion list.  The string 'header'
8208      * if present is output on a line before the first range */
8209
8210     UV start, end;
8211
8212     PERL_ARGS_ASSERT__INVLIST_DUMP;
8213
8214     if (header && strlen(header)) {
8215         PerlIO_printf(Perl_debug_log, "%s\n", header);
8216     }
8217     if (invlist_is_iterating(invlist)) {
8218         PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8219         return;
8220     }
8221
8222     invlist_iterinit(invlist);
8223     while (invlist_iternext(invlist, &start, &end)) {
8224         if (end == UV_MAX) {
8225             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8226         }
8227         else if (end != start) {
8228             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8229                                                  start,         end);
8230         }
8231         else {
8232             PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8233         }
8234     }
8235 }
8236 #endif
8237
8238 #if 0
8239 bool
8240 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8241 {
8242     /* Return a boolean as to if the two passed in inversion lists are
8243      * identical.  The final argument, if TRUE, says to take the complement of
8244      * the second inversion list before doing the comparison */
8245
8246     UV* array_a = invlist_array(a);
8247     UV* array_b = invlist_array(b);
8248     UV len_a = _invlist_len(a);
8249     UV len_b = _invlist_len(b);
8250
8251     UV i = 0;               /* current index into the arrays */
8252     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8253
8254     PERL_ARGS_ASSERT__INVLISTEQ;
8255
8256     /* If are to compare 'a' with the complement of b, set it
8257      * up so are looking at b's complement. */
8258     if (complement_b) {
8259
8260         /* The complement of nothing is everything, so <a> would have to have
8261          * just one element, starting at zero (ending at infinity) */
8262         if (len_b == 0) {
8263             return (len_a == 1 && array_a[0] == 0);
8264         }
8265         else if (array_b[0] == 0) {
8266
8267             /* Otherwise, to complement, we invert.  Here, the first element is
8268              * 0, just remove it.  To do this, we just pretend the array starts
8269              * one later, and clear the flag as we don't have to do anything
8270              * else later */
8271
8272             array_b++;
8273             len_b--;
8274             complement_b = FALSE;
8275         }
8276         else {
8277
8278             /* But if the first element is not zero, we unshift a 0 before the
8279              * array.  The data structure reserves a space for that 0 (which
8280              * should be a '1' right now), so physical shifting is unneeded,
8281              * but temporarily change that element to 0.  Before exiting the
8282              * routine, we must restore the element to '1' */
8283             array_b--;
8284             len_b++;
8285             array_b[0] = 0;
8286         }
8287     }
8288
8289     /* Make sure that the lengths are the same, as well as the final element
8290      * before looping through the remainder.  (Thus we test the length, final,
8291      * and first elements right off the bat) */
8292     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8293         retval = FALSE;
8294     }
8295     else for (i = 0; i < len_a - 1; i++) {
8296         if (array_a[i] != array_b[i]) {
8297             retval = FALSE;
8298             break;
8299         }
8300     }
8301
8302     if (complement_b) {
8303         array_b[0] = 1;
8304     }
8305     return retval;
8306 }
8307 #endif
8308
8309 #undef HEADER_LENGTH
8310 #undef INVLIST_INITIAL_LENGTH
8311 #undef TO_INTERNAL_SIZE
8312 #undef FROM_INTERNAL_SIZE
8313 #undef INVLIST_LEN_OFFSET
8314 #undef INVLIST_ZERO_OFFSET
8315 #undef INVLIST_ITER_OFFSET
8316 #undef INVLIST_VERSION_ID
8317 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8318
8319 /* End of inversion list object */
8320
8321 STATIC void
8322 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8323 {
8324     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8325      * constructs, and updates RExC_flags with them.  On input, RExC_parse
8326      * should point to the first flag; it is updated on output to point to the
8327      * final ')' or ':'.  There needs to be at least one flag, or this will
8328      * abort */
8329
8330     /* for (?g), (?gc), and (?o) warnings; warning
8331        about (?c) will warn about (?g) -- japhy    */
8332
8333 #define WASTED_O  0x01
8334 #define WASTED_G  0x02
8335 #define WASTED_C  0x04
8336 #define WASTED_GC (0x02|0x04)
8337     I32 wastedflags = 0x00;
8338     U32 posflags = 0, negflags = 0;
8339     U32 *flagsp = &posflags;
8340     char has_charset_modifier = '\0';
8341     regex_charset cs;
8342     bool has_use_defaults = FALSE;
8343     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8344
8345     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8346
8347     /* '^' as an initial flag sets certain defaults */
8348     if (UCHARAT(RExC_parse) == '^') {
8349         RExC_parse++;
8350         has_use_defaults = TRUE;
8351         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8352         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8353                                         ? REGEX_UNICODE_CHARSET
8354                                         : REGEX_DEPENDS_CHARSET);
8355     }
8356
8357     cs = get_regex_charset(RExC_flags);
8358     if (cs == REGEX_DEPENDS_CHARSET
8359         && (RExC_utf8 || RExC_uni_semantics))
8360     {
8361         cs = REGEX_UNICODE_CHARSET;
8362     }
8363
8364     while (*RExC_parse) {
8365         /* && strchr("iogcmsx", *RExC_parse) */
8366         /* (?g), (?gc) and (?o) are useless here
8367            and must be globally applied -- japhy */
8368         switch (*RExC_parse) {
8369
8370             /* Code for the imsx flags */
8371             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8372
8373             case LOCALE_PAT_MOD:
8374                 if (has_charset_modifier) {
8375                     goto excess_modifier;
8376                 }
8377                 else if (flagsp == &negflags) {
8378                     goto neg_modifier;
8379                 }
8380                 cs = REGEX_LOCALE_CHARSET;
8381                 has_charset_modifier = LOCALE_PAT_MOD;
8382                 RExC_contains_locale = 1;
8383                 break;
8384             case UNICODE_PAT_MOD:
8385                 if (has_charset_modifier) {
8386                     goto excess_modifier;
8387                 }
8388                 else if (flagsp == &negflags) {
8389                     goto neg_modifier;
8390                 }
8391                 cs = REGEX_UNICODE_CHARSET;
8392                 has_charset_modifier = UNICODE_PAT_MOD;
8393                 break;
8394             case ASCII_RESTRICT_PAT_MOD:
8395                 if (flagsp == &negflags) {
8396                     goto neg_modifier;
8397                 }
8398                 if (has_charset_modifier) {
8399                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8400                         goto excess_modifier;
8401                     }
8402                     /* Doubled modifier implies more restricted */
8403                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8404                 }
8405                 else {
8406                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
8407                 }
8408                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8409                 break;
8410             case DEPENDS_PAT_MOD:
8411                 if (has_use_defaults) {
8412                     goto fail_modifiers;
8413                 }
8414                 else if (flagsp == &negflags) {
8415                     goto neg_modifier;
8416                 }
8417                 else if (has_charset_modifier) {
8418                     goto excess_modifier;
8419                 }
8420
8421                 /* The dual charset means unicode semantics if the
8422                  * pattern (or target, not known until runtime) are
8423                  * utf8, or something in the pattern indicates unicode
8424                  * semantics */
8425                 cs = (RExC_utf8 || RExC_uni_semantics)
8426                      ? REGEX_UNICODE_CHARSET
8427                      : REGEX_DEPENDS_CHARSET;
8428                 has_charset_modifier = DEPENDS_PAT_MOD;
8429                 break;
8430             excess_modifier:
8431                 RExC_parse++;
8432                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8433                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8434                 }
8435                 else if (has_charset_modifier == *(RExC_parse - 1)) {
8436                     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8437                 }
8438                 else {
8439                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8440                 }
8441                 /*NOTREACHED*/
8442             neg_modifier:
8443                 RExC_parse++;
8444                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8445                 /*NOTREACHED*/
8446             case ONCE_PAT_MOD: /* 'o' */
8447             case GLOBAL_PAT_MOD: /* 'g' */
8448                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8449                     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8450                     if (! (wastedflags & wflagbit) ) {
8451                         wastedflags |= wflagbit;
8452                         vWARN5(
8453                             RExC_parse + 1,
8454                             "Useless (%s%c) - %suse /%c modifier",
8455                             flagsp == &negflags ? "?-" : "?",
8456                             *RExC_parse,
8457                             flagsp == &negflags ? "don't " : "",
8458                             *RExC_parse
8459                         );
8460                     }
8461                 }
8462                 break;
8463
8464             case CONTINUE_PAT_MOD: /* 'c' */
8465                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8466                     if (! (wastedflags & WASTED_C) ) {
8467                         wastedflags |= WASTED_GC;
8468                         vWARN3(
8469                             RExC_parse + 1,
8470                             "Useless (%sc) - %suse /gc modifier",
8471                             flagsp == &negflags ? "?-" : "?",
8472                             flagsp == &negflags ? "don't " : ""
8473                         );
8474                     }
8475                 }
8476                 break;
8477             case KEEPCOPY_PAT_MOD: /* 'p' */
8478                 if (flagsp == &negflags) {
8479                     if (SIZE_ONLY)
8480                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8481                 } else {
8482                     *flagsp |= RXf_PMf_KEEPCOPY;
8483                 }
8484                 break;
8485             case '-':
8486                 /* A flag is a default iff it is following a minus, so
8487                  * if there is a minus, it means will be trying to
8488                  * re-specify a default which is an error */
8489                 if (has_use_defaults || flagsp == &negflags) {
8490     fail_modifiers:
8491                     RExC_parse++;
8492                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8493                     /*NOTREACHED*/
8494                 }
8495                 flagsp = &negflags;
8496                 wastedflags = 0;  /* reset so (?g-c) warns twice */
8497                 break;
8498             case ':':
8499             case ')':
8500                 RExC_flags |= posflags;
8501                 RExC_flags &= ~negflags;
8502                 set_regex_charset(&RExC_flags, cs);
8503                 return;
8504                 /*NOTREACHED*/
8505             default:
8506                 RExC_parse++;
8507                 vFAIL3("Sequence (%.*s...) not recognized",
8508                        RExC_parse-seqstart, seqstart);
8509                 /*NOTREACHED*/
8510         }
8511
8512         ++RExC_parse;
8513     }
8514 }
8515
8516 /*
8517  - reg - regular expression, i.e. main body or parenthesized thing
8518  *
8519  * Caller must absorb opening parenthesis.
8520  *
8521  * Combining parenthesis handling with the base level of regular expression
8522  * is a trifle forced, but the need to tie the tails of the branches to what
8523  * follows makes it hard to avoid.
8524  */
8525 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8526 #ifdef DEBUGGING
8527 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8528 #else
8529 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8530 #endif
8531
8532 STATIC regnode *
8533 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8534     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8535 {
8536     dVAR;
8537     regnode *ret;               /* Will be the head of the group. */
8538     regnode *br;
8539     regnode *lastbr;
8540     regnode *ender = NULL;
8541     I32 parno = 0;
8542     I32 flags;
8543     U32 oregflags = RExC_flags;
8544     bool have_branch = 0;
8545     bool is_open = 0;
8546     I32 freeze_paren = 0;
8547     I32 after_freeze = 0;
8548
8549     char * parse_start = RExC_parse; /* MJD */
8550     char * const oregcomp_parse = RExC_parse;
8551
8552     GET_RE_DEBUG_FLAGS_DECL;
8553
8554     PERL_ARGS_ASSERT_REG;
8555     DEBUG_PARSE("reg ");
8556
8557     *flagp = 0;                         /* Tentatively. */
8558
8559
8560     /* Make an OPEN node, if parenthesized. */
8561     if (paren) {
8562         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8563             char *start_verb = RExC_parse;
8564             STRLEN verb_len = 0;
8565             char *start_arg = NULL;
8566             unsigned char op = 0;
8567             int argok = 1;
8568             int internal_argval = 0; /* internal_argval is only useful if !argok */
8569             while ( *RExC_parse && *RExC_parse != ')' ) {
8570                 if ( *RExC_parse == ':' ) {
8571                     start_arg = RExC_parse + 1;
8572                     break;
8573                 }
8574                 RExC_parse++;
8575             }
8576             ++start_verb;
8577             verb_len = RExC_parse - start_verb;
8578             if ( start_arg ) {
8579                 RExC_parse++;
8580                 while ( *RExC_parse && *RExC_parse != ')' )
8581                     RExC_parse++;
8582                 if ( *RExC_parse != ')' )
8583                     vFAIL("Unterminated verb pattern argument");
8584                 if ( RExC_parse == start_arg )
8585                     start_arg = NULL;
8586             } else {
8587                 if ( *RExC_parse != ')' )
8588                     vFAIL("Unterminated verb pattern");
8589             }
8590
8591             switch ( *start_verb ) {
8592             case 'A':  /* (*ACCEPT) */
8593                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8594                     op = ACCEPT;
8595                     internal_argval = RExC_nestroot;
8596                 }
8597                 break;
8598             case 'C':  /* (*COMMIT) */
8599                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8600                     op = COMMIT;
8601                 break;
8602             case 'F':  /* (*FAIL) */
8603                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8604                     op = OPFAIL;
8605                     argok = 0;
8606                 }
8607                 break;
8608             case ':':  /* (*:NAME) */
8609             case 'M':  /* (*MARK:NAME) */
8610                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8611                     op = MARKPOINT;
8612                     argok = -1;
8613                 }
8614                 break;
8615             case 'P':  /* (*PRUNE) */
8616                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8617                     op = PRUNE;
8618                 break;
8619             case 'S':   /* (*SKIP) */
8620                 if ( memEQs(start_verb,verb_len,"SKIP") )
8621                     op = SKIP;
8622                 break;
8623             case 'T':  /* (*THEN) */
8624                 /* [19:06] <TimToady> :: is then */
8625                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8626                     op = CUTGROUP;
8627                     RExC_seen |= REG_SEEN_CUTGROUP;
8628                 }
8629                 break;
8630             }
8631             if ( ! op ) {
8632                 RExC_parse++;
8633                 vFAIL3("Unknown verb pattern '%.*s'",
8634                     verb_len, start_verb);
8635             }
8636             if ( argok ) {
8637                 if ( start_arg && internal_argval ) {
8638                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8639                         verb_len, start_verb);
8640                 } else if ( argok < 0 && !start_arg ) {
8641                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8642                         verb_len, start_verb);
8643                 } else {
8644                     ret = reganode(pRExC_state, op, internal_argval);
8645                     if ( ! internal_argval && ! SIZE_ONLY ) {
8646                         if (start_arg) {
8647                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8648                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8649                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8650                             ret->flags = 0;
8651                         } else {
8652                             ret->flags = 1;
8653                         }
8654                     }
8655                 }
8656                 if (!internal_argval)
8657                     RExC_seen |= REG_SEEN_VERBARG;
8658             } else if ( start_arg ) {
8659                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8660                         verb_len, start_verb);
8661             } else {
8662                 ret = reg_node(pRExC_state, op);
8663             }
8664             nextchar(pRExC_state);
8665             return ret;
8666         } else
8667         if (*RExC_parse == '?') { /* (?...) */
8668             bool is_logical = 0;
8669             const char * const seqstart = RExC_parse;
8670
8671             RExC_parse++;
8672             paren = *RExC_parse++;
8673             ret = NULL;                 /* For look-ahead/behind. */
8674             switch (paren) {
8675
8676             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8677                 paren = *RExC_parse++;
8678                 if ( paren == '<')         /* (?P<...>) named capture */
8679                     goto named_capture;
8680                 else if (paren == '>') {   /* (?P>name) named recursion */
8681                     goto named_recursion;
8682                 }
8683                 else if (paren == '=') {   /* (?P=...)  named backref */
8684                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8685                        you change this make sure you change that */
8686                     char* name_start = RExC_parse;
8687                     U32 num = 0;
8688                     SV *sv_dat = reg_scan_name(pRExC_state,
8689                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8690                     if (RExC_parse == name_start || *RExC_parse != ')')
8691                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8692
8693                     if (!SIZE_ONLY) {
8694                         num = add_data( pRExC_state, 1, "S" );
8695                         RExC_rxi->data->data[num]=(void*)sv_dat;
8696                         SvREFCNT_inc_simple_void(sv_dat);
8697                     }
8698                     RExC_sawback = 1;
8699                     ret = reganode(pRExC_state,
8700                                    ((! FOLD)
8701                                      ? NREF
8702                                      : (ASCII_FOLD_RESTRICTED)
8703                                        ? NREFFA
8704                                        : (AT_LEAST_UNI_SEMANTICS)
8705                                          ? NREFFU
8706                                          : (LOC)
8707                                            ? NREFFL
8708                                            : NREFF),
8709                                     num);
8710                     *flagp |= HASWIDTH;
8711
8712                     Set_Node_Offset(ret, parse_start+1);
8713                     Set_Node_Cur_Length(ret); /* MJD */
8714
8715                     nextchar(pRExC_state);
8716                     return ret;
8717                 }
8718                 RExC_parse++;
8719                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8720                 /*NOTREACHED*/
8721             case '<':           /* (?<...) */
8722                 if (*RExC_parse == '!')
8723                     paren = ',';
8724                 else if (*RExC_parse != '=')
8725               named_capture:
8726                 {               /* (?<...>) */
8727                     char *name_start;
8728                     SV *svname;
8729                     paren= '>';
8730             case '\'':          /* (?'...') */
8731                     name_start= RExC_parse;
8732                     svname = reg_scan_name(pRExC_state,
8733                         SIZE_ONLY ?  /* reverse test from the others */
8734                         REG_RSN_RETURN_NAME :
8735                         REG_RSN_RETURN_NULL);
8736                     if (RExC_parse == name_start) {
8737                         RExC_parse++;
8738                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8739                         /*NOTREACHED*/
8740                     }
8741                     if (*RExC_parse != paren)
8742                         vFAIL2("Sequence (?%c... not terminated",
8743                             paren=='>' ? '<' : paren);
8744                     if (SIZE_ONLY) {
8745                         HE *he_str;
8746                         SV *sv_dat = NULL;
8747                         if (!svname) /* shouldn't happen */
8748                             Perl_croak(aTHX_
8749                                 "panic: reg_scan_name returned NULL");
8750                         if (!RExC_paren_names) {
8751                             RExC_paren_names= newHV();
8752                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8753 #ifdef DEBUGGING
8754                             RExC_paren_name_list= newAV();
8755                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8756 #endif
8757                         }
8758                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8759                         if ( he_str )
8760                             sv_dat = HeVAL(he_str);
8761                         if ( ! sv_dat ) {
8762                             /* croak baby croak */
8763                             Perl_croak(aTHX_
8764                                 "panic: paren_name hash element allocation failed");
8765                         } else if ( SvPOK(sv_dat) ) {
8766                             /* (?|...) can mean we have dupes so scan to check
8767                                its already been stored. Maybe a flag indicating
8768                                we are inside such a construct would be useful,
8769                                but the arrays are likely to be quite small, so
8770                                for now we punt -- dmq */
8771                             IV count = SvIV(sv_dat);
8772                             I32 *pv = (I32*)SvPVX(sv_dat);
8773                             IV i;
8774                             for ( i = 0 ; i < count ; i++ ) {
8775                                 if ( pv[i] == RExC_npar ) {
8776                                     count = 0;
8777                                     break;
8778                                 }
8779                             }
8780                             if ( count ) {
8781                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8782                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8783                                 pv[count] = RExC_npar;
8784                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8785                             }
8786                         } else {
8787                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8788                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8789                             SvIOK_on(sv_dat);
8790                             SvIV_set(sv_dat, 1);
8791                         }
8792 #ifdef DEBUGGING
8793                         /* Yes this does cause a memory leak in debugging Perls */
8794                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8795                             SvREFCNT_dec_NN(svname);
8796 #endif
8797
8798                         /*sv_dump(sv_dat);*/
8799                     }
8800                     nextchar(pRExC_state);
8801                     paren = 1;
8802                     goto capturing_parens;
8803                 }
8804                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8805                 RExC_in_lookbehind++;
8806                 RExC_parse++;
8807             case '=':           /* (?=...) */
8808                 RExC_seen_zerolen++;
8809                 break;
8810             case '!':           /* (?!...) */
8811                 RExC_seen_zerolen++;
8812                 if (*RExC_parse == ')') {
8813                     ret=reg_node(pRExC_state, OPFAIL);
8814                     nextchar(pRExC_state);
8815                     return ret;
8816                 }
8817                 break;
8818             case '|':           /* (?|...) */
8819                 /* branch reset, behave like a (?:...) except that
8820                    buffers in alternations share the same numbers */
8821                 paren = ':';
8822                 after_freeze = freeze_paren = RExC_npar;
8823                 break;
8824             case ':':           /* (?:...) */
8825             case '>':           /* (?>...) */
8826                 break;
8827             case '$':           /* (?$...) */
8828             case '@':           /* (?@...) */
8829                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8830                 break;
8831             case '#':           /* (?#...) */
8832                 while (*RExC_parse && *RExC_parse != ')')
8833                     RExC_parse++;
8834                 if (*RExC_parse != ')')
8835                     FAIL("Sequence (?#... not terminated");
8836                 nextchar(pRExC_state);
8837                 *flagp = TRYAGAIN;
8838                 return NULL;
8839             case '0' :           /* (?0) */
8840             case 'R' :           /* (?R) */
8841                 if (*RExC_parse != ')')
8842                     FAIL("Sequence (?R) not terminated");
8843                 ret = reg_node(pRExC_state, GOSTART);
8844                 *flagp |= POSTPONED;
8845                 nextchar(pRExC_state);
8846                 return ret;
8847                 /*notreached*/
8848             { /* named and numeric backreferences */
8849                 I32 num;
8850             case '&':            /* (?&NAME) */
8851                 parse_start = RExC_parse - 1;
8852               named_recursion:
8853                 {
8854                     SV *sv_dat = reg_scan_name(pRExC_state,
8855                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8856                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8857                 }
8858                 goto gen_recurse_regop;
8859                 assert(0); /* NOT REACHED */
8860             case '+':
8861                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8862                     RExC_parse++;
8863                     vFAIL("Illegal pattern");
8864                 }
8865                 goto parse_recursion;
8866                 /* NOT REACHED*/
8867             case '-': /* (?-1) */
8868                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8869                     RExC_parse--; /* rewind to let it be handled later */
8870                     goto parse_flags;
8871                 }
8872                 /*FALLTHROUGH */
8873             case '1': case '2': case '3': case '4': /* (?1) */
8874             case '5': case '6': case '7': case '8': case '9':
8875                 RExC_parse--;
8876               parse_recursion:
8877                 num = atoi(RExC_parse);
8878                 parse_start = RExC_parse - 1; /* MJD */
8879                 if (*RExC_parse == '-')
8880                     RExC_parse++;
8881                 while (isDIGIT(*RExC_parse))
8882                         RExC_parse++;
8883                 if (*RExC_parse!=')')
8884                     vFAIL("Expecting close bracket");
8885
8886               gen_recurse_regop:
8887                 if ( paren == '-' ) {
8888                     /*
8889                     Diagram of capture buffer numbering.
8890                     Top line is the normal capture buffer numbers
8891                     Bottom line is the negative indexing as from
8892                     the X (the (?-2))
8893
8894                     +   1 2    3 4 5 X          6 7
8895                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8896                     -   5 4    3 2 1 X          x x
8897
8898                     */
8899                     num = RExC_npar + num;
8900                     if (num < 1)  {
8901                         RExC_parse++;
8902                         vFAIL("Reference to nonexistent group");
8903                     }
8904                 } else if ( paren == '+' ) {
8905                     num = RExC_npar + num - 1;
8906                 }
8907
8908                 ret = reganode(pRExC_state, GOSUB, num);
8909                 if (!SIZE_ONLY) {
8910                     if (num > (I32)RExC_rx->nparens) {
8911                         RExC_parse++;
8912                         vFAIL("Reference to nonexistent group");
8913                     }
8914                     ARG2L_SET( ret, RExC_recurse_count++);
8915                     RExC_emit++;
8916                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8917                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8918                 } else {
8919                     RExC_size++;
8920                 }
8921                 RExC_seen |= REG_SEEN_RECURSE;
8922                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8923                 Set_Node_Offset(ret, parse_start); /* MJD */
8924
8925                 *flagp |= POSTPONED;
8926                 nextchar(pRExC_state);
8927                 return ret;
8928             } /* named and numeric backreferences */
8929             assert(0); /* NOT REACHED */
8930
8931             case '?':           /* (??...) */
8932                 is_logical = 1;
8933                 if (*RExC_parse != '{') {
8934                     RExC_parse++;
8935                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8936                     /*NOTREACHED*/
8937                 }
8938                 *flagp |= POSTPONED;
8939                 paren = *RExC_parse++;
8940                 /* FALL THROUGH */
8941             case '{':           /* (?{...}) */
8942             {
8943                 U32 n = 0;
8944                 struct reg_code_block *cb;
8945
8946                 RExC_seen_zerolen++;
8947
8948                 if (   !pRExC_state->num_code_blocks
8949                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8950                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8951                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8952                             - RExC_start)
8953                 ) {
8954                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8955                         FAIL("panic: Sequence (?{...}): no code block found\n");
8956                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8957                 }
8958                 /* this is a pre-compiled code block (?{...}) */
8959                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8960                 RExC_parse = RExC_start + cb->end;
8961                 if (!SIZE_ONLY) {
8962                     OP *o = cb->block;
8963                     if (cb->src_regex) {
8964                         n = add_data(pRExC_state, 2, "rl");
8965                         RExC_rxi->data->data[n] =
8966                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8967                         RExC_rxi->data->data[n+1] = (void*)o;
8968                     }
8969                     else {
8970                         n = add_data(pRExC_state, 1,
8971                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8972                         RExC_rxi->data->data[n] = (void*)o;
8973                     }
8974                 }
8975                 pRExC_state->code_index++;
8976                 nextchar(pRExC_state);
8977
8978                 if (is_logical) {
8979                     regnode *eval;
8980                     ret = reg_node(pRExC_state, LOGICAL);
8981                     eval = reganode(pRExC_state, EVAL, n);
8982                     if (!SIZE_ONLY) {
8983                         ret->flags = 2;
8984                         /* for later propagation into (??{}) return value */
8985                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8986                     }
8987                     REGTAIL(pRExC_state, ret, eval);
8988                     /* deal with the length of this later - MJD */
8989                     return ret;
8990                 }
8991                 ret = reganode(pRExC_state, EVAL, n);
8992                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8993                 Set_Node_Offset(ret, parse_start);
8994                 return ret;
8995             }
8996             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8997             {
8998                 int is_define= 0;
8999                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9000                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9001                         || RExC_parse[1] == '<'
9002                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9003                         I32 flag;
9004
9005                         ret = reg_node(pRExC_state, LOGICAL);
9006                         if (!SIZE_ONLY)
9007                             ret->flags = 1;
9008                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
9009                         goto insert_if;
9010                     }
9011                 }
9012                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9013                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9014                 {
9015                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9016                     char *name_start= RExC_parse++;
9017                     U32 num = 0;
9018                     SV *sv_dat=reg_scan_name(pRExC_state,
9019                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9020                     if (RExC_parse == name_start || *RExC_parse != ch)
9021                         vFAIL2("Sequence (?(%c... not terminated",
9022                             (ch == '>' ? '<' : ch));
9023                     RExC_parse++;
9024                     if (!SIZE_ONLY) {
9025                         num = add_data( pRExC_state, 1, "S" );
9026                         RExC_rxi->data->data[num]=(void*)sv_dat;
9027                         SvREFCNT_inc_simple_void(sv_dat);
9028                     }
9029                     ret = reganode(pRExC_state,NGROUPP,num);
9030                     goto insert_if_check_paren;
9031                 }
9032                 else if (RExC_parse[0] == 'D' &&
9033                          RExC_parse[1] == 'E' &&
9034                          RExC_parse[2] == 'F' &&
9035                          RExC_parse[3] == 'I' &&
9036                          RExC_parse[4] == 'N' &&
9037                          RExC_parse[5] == 'E')
9038                 {
9039                     ret = reganode(pRExC_state,DEFINEP,0);
9040                     RExC_parse +=6 ;
9041                     is_define = 1;
9042                     goto insert_if_check_paren;
9043                 }
9044                 else if (RExC_parse[0] == 'R') {
9045                     RExC_parse++;
9046                     parno = 0;
9047                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9048                         parno = atoi(RExC_parse++);
9049                         while (isDIGIT(*RExC_parse))
9050                             RExC_parse++;
9051                     } else if (RExC_parse[0] == '&') {
9052                         SV *sv_dat;
9053                         RExC_parse++;
9054                         sv_dat = reg_scan_name(pRExC_state,
9055                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9056                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9057                     }
9058                     ret = reganode(pRExC_state,INSUBP,parno);
9059                     goto insert_if_check_paren;
9060                 }
9061                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9062                     /* (?(1)...) */
9063                     char c;
9064                     parno = atoi(RExC_parse++);
9065
9066                     while (isDIGIT(*RExC_parse))
9067                         RExC_parse++;
9068                     ret = reganode(pRExC_state, GROUPP, parno);
9069
9070                  insert_if_check_paren:
9071                     if ((c = *nextchar(pRExC_state)) != ')')
9072                         vFAIL("Switch condition not recognized");
9073                   insert_if:
9074                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9075                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9076                     if (br == NULL)
9077                         br = reganode(pRExC_state, LONGJMP, 0);
9078                     else
9079                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9080                     c = *nextchar(pRExC_state);
9081                     if (flags&HASWIDTH)
9082                         *flagp |= HASWIDTH;
9083                     if (c == '|') {
9084                         if (is_define)
9085                             vFAIL("(?(DEFINE)....) does not allow branches");
9086                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9087                         regbranch(pRExC_state, &flags, 1,depth+1);
9088                         REGTAIL(pRExC_state, ret, lastbr);
9089                         if (flags&HASWIDTH)
9090                             *flagp |= HASWIDTH;
9091                         c = *nextchar(pRExC_state);
9092                     }
9093                     else
9094                         lastbr = NULL;
9095                     if (c != ')')
9096                         vFAIL("Switch (?(condition)... contains too many branches");
9097                     ender = reg_node(pRExC_state, TAIL);
9098                     REGTAIL(pRExC_state, br, ender);
9099                     if (lastbr) {
9100                         REGTAIL(pRExC_state, lastbr, ender);
9101                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9102                     }
9103                     else
9104                         REGTAIL(pRExC_state, ret, ender);
9105                     RExC_size++; /* XXX WHY do we need this?!!
9106                                     For large programs it seems to be required
9107                                     but I can't figure out why. -- dmq*/
9108                     return ret;
9109                 }
9110                 else {
9111                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9112                 }
9113             }
9114             case '[':           /* (?[ ... ]) */
9115                 return handle_sets(pRExC_state, flagp, depth, oregcomp_parse);
9116             case 0:
9117                 RExC_parse--; /* for vFAIL to print correctly */
9118                 vFAIL("Sequence (? incomplete");
9119                 break;
9120             default: /* e.g., (?i) */
9121                 --RExC_parse;
9122               parse_flags:
9123                 parse_lparen_question_flags(pRExC_state);
9124                 if (UCHARAT(RExC_parse) == ':') {
9125                     paren = ':';
9126                 }
9127                 nextchar(pRExC_state);
9128                 if (paren != ':') {
9129                     *flagp = TRYAGAIN;
9130                     return NULL;
9131                 } else {
9132                     ret = NULL;
9133                     goto parse_rest;
9134                 }
9135                 break;
9136             } /* end switch */
9137         }
9138         else {                  /* (...) */
9139           capturing_parens:
9140             parno = RExC_npar;
9141             RExC_npar++;
9142
9143             ret = reganode(pRExC_state, OPEN, parno);
9144             if (!SIZE_ONLY ){
9145                 if (!RExC_nestroot)
9146                     RExC_nestroot = parno;
9147                 if (RExC_seen & REG_SEEN_RECURSE
9148                     && !RExC_open_parens[parno-1])
9149                 {
9150                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9151                         "Setting open paren #%"IVdf" to %d\n",
9152                         (IV)parno, REG_NODE_NUM(ret)));
9153                     RExC_open_parens[parno-1]= ret;
9154                 }
9155             }
9156             Set_Node_Length(ret, 1); /* MJD */
9157             Set_Node_Offset(ret, RExC_parse); /* MJD */
9158             is_open = 1;
9159         }
9160     }
9161     else                        /* ! paren */
9162         ret = NULL;
9163
9164    parse_rest:
9165     /* Pick up the branches, linking them together. */
9166     parse_start = RExC_parse;   /* MJD */
9167     br = regbranch(pRExC_state, &flags, 1,depth+1);
9168
9169     /*     branch_len = (paren != 0); */
9170
9171     if (br == NULL)
9172         return(NULL);
9173     if (*RExC_parse == '|') {
9174         if (!SIZE_ONLY && RExC_extralen) {
9175             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9176         }
9177         else {                  /* MJD */
9178             reginsert(pRExC_state, BRANCH, br, depth+1);
9179             Set_Node_Length(br, paren != 0);
9180             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9181         }
9182         have_branch = 1;
9183         if (SIZE_ONLY)
9184             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9185     }
9186     else if (paren == ':') {
9187         *flagp |= flags&SIMPLE;
9188     }
9189     if (is_open) {                              /* Starts with OPEN. */
9190         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9191     }
9192     else if (paren != '?')              /* Not Conditional */
9193         ret = br;
9194     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9195     lastbr = br;
9196     while (*RExC_parse == '|') {
9197         if (!SIZE_ONLY && RExC_extralen) {
9198             ender = reganode(pRExC_state, LONGJMP,0);
9199             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9200         }
9201         if (SIZE_ONLY)
9202             RExC_extralen += 2;         /* Account for LONGJMP. */
9203         nextchar(pRExC_state);
9204         if (freeze_paren) {
9205             if (RExC_npar > after_freeze)
9206                 after_freeze = RExC_npar;
9207             RExC_npar = freeze_paren;
9208         }
9209         br = regbranch(pRExC_state, &flags, 0, depth+1);
9210
9211         if (br == NULL)
9212             return(NULL);
9213         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9214         lastbr = br;
9215         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9216     }
9217
9218     if (have_branch || paren != ':') {
9219         /* Make a closing node, and hook it on the end. */
9220         switch (paren) {
9221         case ':':
9222             ender = reg_node(pRExC_state, TAIL);
9223             break;
9224         case 1:
9225             ender = reganode(pRExC_state, CLOSE, parno);
9226             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9227                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9228                         "Setting close paren #%"IVdf" to %d\n",
9229                         (IV)parno, REG_NODE_NUM(ender)));
9230                 RExC_close_parens[parno-1]= ender;
9231                 if (RExC_nestroot == parno)
9232                     RExC_nestroot = 0;
9233             }
9234             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9235             Set_Node_Length(ender,1); /* MJD */
9236             break;
9237         case '<':
9238         case ',':
9239         case '=':
9240         case '!':
9241             *flagp &= ~HASWIDTH;
9242             /* FALL THROUGH */
9243         case '>':
9244             ender = reg_node(pRExC_state, SUCCEED);
9245             break;
9246         case 0:
9247             ender = reg_node(pRExC_state, END);
9248             if (!SIZE_ONLY) {
9249                 assert(!RExC_opend); /* there can only be one! */
9250                 RExC_opend = ender;
9251             }
9252             break;
9253         }
9254         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9255             SV * const mysv_val1=sv_newmortal();
9256             SV * const mysv_val2=sv_newmortal();
9257             DEBUG_PARSE_MSG("lsbr");
9258             regprop(RExC_rx, mysv_val1, lastbr);
9259             regprop(RExC_rx, mysv_val2, ender);
9260             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9261                           SvPV_nolen_const(mysv_val1),
9262                           (IV)REG_NODE_NUM(lastbr),
9263                           SvPV_nolen_const(mysv_val2),
9264                           (IV)REG_NODE_NUM(ender),
9265                           (IV)(ender - lastbr)
9266             );
9267         });
9268         REGTAIL(pRExC_state, lastbr, ender);
9269
9270         if (have_branch && !SIZE_ONLY) {
9271             char is_nothing= 1;
9272             if (depth==1)
9273                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9274
9275             /* Hook the tails of the branches to the closing node. */
9276             for (br = ret; br; br = regnext(br)) {
9277                 const U8 op = PL_regkind[OP(br)];
9278                 if (op == BRANCH) {
9279                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9280                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9281                         is_nothing= 0;
9282                 }
9283                 else if (op == BRANCHJ) {
9284                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9285                     /* for now we always disable this optimisation * /
9286                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9287                     */
9288                         is_nothing= 0;
9289                 }
9290             }
9291             if (is_nothing) {
9292                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9293                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9294                     SV * const mysv_val1=sv_newmortal();
9295                     SV * const mysv_val2=sv_newmortal();
9296                     DEBUG_PARSE_MSG("NADA");
9297                     regprop(RExC_rx, mysv_val1, ret);
9298                     regprop(RExC_rx, mysv_val2, ender);
9299                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9300                                   SvPV_nolen_const(mysv_val1),
9301                                   (IV)REG_NODE_NUM(ret),
9302                                   SvPV_nolen_const(mysv_val2),
9303                                   (IV)REG_NODE_NUM(ender),
9304                                   (IV)(ender - ret)
9305                     );
9306                 });
9307                 OP(br)= NOTHING;
9308                 if (OP(ender) == TAIL) {
9309                     NEXT_OFF(br)= 0;
9310                     RExC_emit= br + 1;
9311                 } else {
9312                     regnode *opt;
9313                     for ( opt= br + 1; opt < ender ; opt++ )
9314                         OP(opt)= OPTIMIZED;
9315                     NEXT_OFF(br)= ender - br;
9316                 }
9317             }
9318         }
9319     }
9320
9321     {
9322         const char *p;
9323         static const char parens[] = "=!<,>";
9324
9325         if (paren && (p = strchr(parens, paren))) {
9326             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9327             int flag = (p - parens) > 1;
9328
9329             if (paren == '>')
9330                 node = SUSPEND, flag = 0;
9331             reginsert(pRExC_state, node,ret, depth+1);
9332             Set_Node_Cur_Length(ret);
9333             Set_Node_Offset(ret, parse_start + 1);
9334             ret->flags = flag;
9335             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9336         }
9337     }
9338
9339     /* Check for proper termination. */
9340     if (paren) {
9341         RExC_flags = oregflags;
9342         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9343             RExC_parse = oregcomp_parse;
9344             vFAIL("Unmatched (");
9345         }
9346     }
9347     else if (!paren && RExC_parse < RExC_end) {
9348         if (*RExC_parse == ')') {
9349             RExC_parse++;
9350             vFAIL("Unmatched )");
9351         }
9352         else
9353             FAIL("Junk on end of regexp");      /* "Can't happen". */
9354         assert(0); /* NOTREACHED */
9355     }
9356
9357     if (RExC_in_lookbehind) {
9358         RExC_in_lookbehind--;
9359     }
9360     if (after_freeze > RExC_npar)
9361         RExC_npar = after_freeze;
9362     return(ret);
9363 }
9364
9365 /*
9366  - regbranch - one alternative of an | operator
9367  *
9368  * Implements the concatenation operator.
9369  */
9370 STATIC regnode *
9371 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9372 {
9373     dVAR;
9374     regnode *ret;
9375     regnode *chain = NULL;
9376     regnode *latest;
9377     I32 flags = 0, c = 0;
9378     GET_RE_DEBUG_FLAGS_DECL;
9379
9380     PERL_ARGS_ASSERT_REGBRANCH;
9381
9382     DEBUG_PARSE("brnc");
9383
9384     if (first)
9385         ret = NULL;
9386     else {
9387         if (!SIZE_ONLY && RExC_extralen)
9388             ret = reganode(pRExC_state, BRANCHJ,0);
9389         else {
9390             ret = reg_node(pRExC_state, BRANCH);
9391             Set_Node_Length(ret, 1);
9392         }
9393     }
9394
9395     if (!first && SIZE_ONLY)
9396         RExC_extralen += 1;                     /* BRANCHJ */
9397
9398     *flagp = WORST;                     /* Tentatively. */
9399
9400     RExC_parse--;
9401     nextchar(pRExC_state);
9402     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9403         flags &= ~TRYAGAIN;
9404         latest = regpiece(pRExC_state, &flags,depth+1);
9405         if (latest == NULL) {
9406             if (flags & TRYAGAIN)
9407                 continue;
9408             return(NULL);
9409         }
9410         else if (ret == NULL)
9411             ret = latest;
9412         *flagp |= flags&(HASWIDTH|POSTPONED);
9413         if (chain == NULL)      /* First piece. */
9414             *flagp |= flags&SPSTART;
9415         else {
9416             RExC_naughty++;
9417             REGTAIL(pRExC_state, chain, latest);
9418         }
9419         chain = latest;
9420         c++;
9421     }
9422     if (chain == NULL) {        /* Loop ran zero times. */
9423         chain = reg_node(pRExC_state, NOTHING);
9424         if (ret == NULL)
9425             ret = chain;
9426     }
9427     if (c == 1) {
9428         *flagp |= flags&SIMPLE;
9429     }
9430
9431     return ret;
9432 }
9433
9434 /*
9435  - regpiece - something followed by possible [*+?]
9436  *
9437  * Note that the branching code sequences used for ? and the general cases
9438  * of * and + are somewhat optimized:  they use the same NOTHING node as
9439  * both the endmarker for their branch list and the body of the last branch.
9440  * It might seem that this node could be dispensed with entirely, but the
9441  * endmarker role is not redundant.
9442  */
9443 STATIC regnode *
9444 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9445 {
9446     dVAR;
9447     regnode *ret;
9448     char op;
9449     char *next;
9450     I32 flags;
9451     const char * const origparse = RExC_parse;
9452     I32 min;
9453     I32 max = REG_INFTY;
9454 #ifdef RE_TRACK_PATTERN_OFFSETS
9455     char *parse_start;
9456 #endif
9457     const char *maxpos = NULL;
9458
9459     /* Save the original in case we change the emitted regop to a FAIL. */
9460     regnode * const orig_emit = RExC_emit;
9461
9462     GET_RE_DEBUG_FLAGS_DECL;
9463
9464     PERL_ARGS_ASSERT_REGPIECE;
9465
9466     DEBUG_PARSE("piec");
9467
9468     ret = regatom(pRExC_state, &flags,depth+1);
9469     if (ret == NULL) {
9470         if (flags & TRYAGAIN)
9471             *flagp |= TRYAGAIN;
9472         return(NULL);
9473     }
9474
9475     op = *RExC_parse;
9476
9477     if (op == '{' && regcurly(RExC_parse, FALSE)) {
9478         maxpos = NULL;
9479 #ifdef RE_TRACK_PATTERN_OFFSETS
9480         parse_start = RExC_parse; /* MJD */
9481 #endif
9482         next = RExC_parse + 1;
9483         while (isDIGIT(*next) || *next == ',') {
9484             if (*next == ',') {
9485                 if (maxpos)
9486                     break;
9487                 else
9488                     maxpos = next;
9489             }
9490             next++;
9491         }
9492         if (*next == '}') {             /* got one */
9493             if (!maxpos)
9494                 maxpos = next;
9495             RExC_parse++;
9496             min = atoi(RExC_parse);
9497             if (*maxpos == ',')
9498                 maxpos++;
9499             else
9500                 maxpos = RExC_parse;
9501             max = atoi(maxpos);
9502             if (!max && *maxpos != '0')
9503                 max = REG_INFTY;                /* meaning "infinity" */
9504             else if (max >= REG_INFTY)
9505                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9506             RExC_parse = next;
9507             nextchar(pRExC_state);
9508             if (max < min) {    /* If can't match, warn and optimize to fail
9509                                    unconditionally */
9510                 if (SIZE_ONLY) {
9511                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9512
9513                     /* We can't back off the size because we have to reserve
9514                      * enough space for all the things we are about to throw
9515                      * away, but we can shrink it by the ammount we are about
9516                      * to re-use here */
9517                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9518                 }
9519                 else {
9520                     RExC_emit = orig_emit;
9521                 }
9522                 ret = reg_node(pRExC_state, OPFAIL);
9523                 return ret;
9524             }
9525             else if (max == 0) {    /* replace {0} with a nothing node */
9526                 if (SIZE_ONLY) {
9527                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9528                 }
9529                 else {
9530                     RExC_emit = orig_emit;
9531                 }
9532                 ret = reg_node(pRExC_state, NOTHING);
9533                 return ret;
9534             }
9535
9536         do_curly:
9537             if ((flags&SIMPLE)) {
9538                 RExC_naughty += 2 + RExC_naughty / 2;
9539                 reginsert(pRExC_state, CURLY, ret, depth+1);
9540                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9541                 Set_Node_Cur_Length(ret);
9542             }
9543             else {
9544                 regnode * const w = reg_node(pRExC_state, WHILEM);
9545
9546                 w->flags = 0;
9547                 REGTAIL(pRExC_state, ret, w);
9548                 if (!SIZE_ONLY && RExC_extralen) {
9549                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9550                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9551                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9552                 }
9553                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9554                                 /* MJD hk */
9555                 Set_Node_Offset(ret, parse_start+1);
9556                 Set_Node_Length(ret,
9557                                 op == '{' ? (RExC_parse - parse_start) : 1);
9558
9559                 if (!SIZE_ONLY && RExC_extralen)
9560                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9561                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9562                 if (SIZE_ONLY)
9563                     RExC_whilem_seen++, RExC_extralen += 3;
9564                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9565             }
9566             ret->flags = 0;
9567
9568             if (min > 0)
9569                 *flagp = WORST;
9570             if (max > 0)
9571                 *flagp |= HASWIDTH;
9572             if (!SIZE_ONLY) {
9573                 ARG1_SET(ret, (U16)min);
9574                 ARG2_SET(ret, (U16)max);
9575             }
9576
9577             goto nest_check;
9578         }
9579     }
9580
9581     if (!ISMULT1(op)) {
9582         *flagp = flags;
9583         return(ret);
9584     }
9585
9586 #if 0                           /* Now runtime fix should be reliable. */
9587
9588     /* if this is reinstated, don't forget to put this back into perldiag:
9589
9590             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9591
9592            (F) The part of the regexp subject to either the * or + quantifier
9593            could match an empty string. The {#} shows in the regular
9594            expression about where the problem was discovered.
9595
9596     */
9597
9598     if (!(flags&HASWIDTH) && op != '?')
9599       vFAIL("Regexp *+ operand could be empty");
9600 #endif
9601
9602 #ifdef RE_TRACK_PATTERN_OFFSETS
9603     parse_start = RExC_parse;
9604 #endif
9605     nextchar(pRExC_state);
9606
9607     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9608
9609     if (op == '*' && (flags&SIMPLE)) {
9610         reginsert(pRExC_state, STAR, ret, depth+1);
9611         ret->flags = 0;
9612         RExC_naughty += 4;
9613     }
9614     else if (op == '*') {
9615         min = 0;
9616         goto do_curly;
9617     }
9618     else if (op == '+' && (flags&SIMPLE)) {
9619         reginsert(pRExC_state, PLUS, ret, depth+1);
9620         ret->flags = 0;
9621         RExC_naughty += 3;
9622     }
9623     else if (op == '+') {
9624         min = 1;
9625         goto do_curly;
9626     }
9627     else if (op == '?') {
9628         min = 0; max = 1;
9629         goto do_curly;
9630     }
9631   nest_check:
9632     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9633         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9634         ckWARN3reg(RExC_parse,
9635                    "%.*s matches null string many times",
9636                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9637                    origparse);
9638         (void)ReREFCNT_inc(RExC_rx_sv);
9639     }
9640
9641     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9642         nextchar(pRExC_state);
9643         reginsert(pRExC_state, MINMOD, ret, depth+1);
9644         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9645     }
9646 #ifndef REG_ALLOW_MINMOD_SUSPEND
9647     else
9648 #endif
9649     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9650         regnode *ender;
9651         nextchar(pRExC_state);
9652         ender = reg_node(pRExC_state, SUCCEED);
9653         REGTAIL(pRExC_state, ret, ender);
9654         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9655         ret->flags = 0;
9656         ender = reg_node(pRExC_state, TAIL);
9657         REGTAIL(pRExC_state, ret, ender);
9658         /*ret= ender;*/
9659     }
9660
9661     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9662         RExC_parse++;
9663         vFAIL("Nested quantifiers");
9664     }
9665
9666     return(ret);
9667 }
9668
9669 STATIC bool
9670 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9671         const bool strict   /* Apply stricter parsing rules? */
9672     )
9673 {
9674
9675  /* This is expected to be called by a parser routine that has recognized '\N'
9676    and needs to handle the rest. RExC_parse is expected to point at the first
9677    char following the N at the time of the call.  On successful return,
9678    RExC_parse has been updated to point to just after the sequence identified
9679    by this routine, and <*flagp> has been updated.
9680
9681    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9682    character class.
9683
9684    \N may begin either a named sequence, or if outside a character class, mean
9685    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9686    attempted to decide which, and in the case of a named sequence, converted it
9687    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9688    where c1... are the characters in the sequence.  For single-quoted regexes,
9689    the tokenizer passes the \N sequence through unchanged; this code will not
9690    attempt to determine this nor expand those, instead raising a syntax error.
9691    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9692    or there is no '}', it signals that this \N occurrence means to match a
9693    non-newline.
9694
9695    Only the \N{U+...} form should occur in a character class, for the same
9696    reason that '.' inside a character class means to just match a period: it
9697    just doesn't make sense.
9698
9699    The function raises an error (via vFAIL), and doesn't return for various
9700    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9701    success; it returns FALSE otherwise.
9702
9703    If <valuep> is non-null, it means the caller can accept an input sequence
9704    consisting of a just a single code point; <*valuep> is set to that value
9705    if the input is such.
9706
9707    If <node_p> is non-null it signifies that the caller can accept any other
9708    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9709    is set as follows:
9710     1) \N means not-a-NL: points to a newly created REG_ANY node;
9711     2) \N{}:              points to a new NOTHING node;
9712     3) otherwise:         points to a new EXACT node containing the resolved
9713                           string.
9714    Note that FALSE is returned for single code point sequences if <valuep> is
9715    null.
9716  */
9717
9718     char * endbrace;    /* '}' following the name */
9719     char* p;
9720     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9721                            stream */
9722     bool has_multiple_chars; /* true if the input stream contains a sequence of
9723                                 more than one character */
9724
9725     GET_RE_DEBUG_FLAGS_DECL;
9726
9727     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9728
9729     GET_RE_DEBUG_FLAGS;
9730
9731     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9732
9733     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9734      * modifier.  The other meaning does not */
9735     p = (RExC_flags & RXf_PMf_EXTENDED)
9736         ? regwhite( pRExC_state, RExC_parse )
9737         : RExC_parse;
9738
9739     /* Disambiguate between \N meaning a named character versus \N meaning
9740      * [^\n].  The former is assumed when it can't be the latter. */
9741     if (*p != '{' || regcurly(p, FALSE)) {
9742         RExC_parse = p;
9743         if (! node_p) {
9744             /* no bare \N in a charclass */
9745             if (in_char_class) {
9746                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9747             }
9748             return FALSE;
9749         }
9750         nextchar(pRExC_state);
9751         *node_p = reg_node(pRExC_state, REG_ANY);
9752         *flagp |= HASWIDTH|SIMPLE;
9753         RExC_naughty++;
9754         RExC_parse--;
9755         Set_Node_Length(*node_p, 1); /* MJD */
9756         return TRUE;
9757     }
9758
9759     /* Here, we have decided it should be a named character or sequence */
9760
9761     /* The test above made sure that the next real character is a '{', but
9762      * under the /x modifier, it could be separated by space (or a comment and
9763      * \n) and this is not allowed (for consistency with \x{...} and the
9764      * tokenizer handling of \N{NAME}). */
9765     if (*RExC_parse != '{') {
9766         vFAIL("Missing braces on \\N{}");
9767     }
9768
9769     RExC_parse++;       /* Skip past the '{' */
9770
9771     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9772         || ! (endbrace == RExC_parse            /* nothing between the {} */
9773               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9774                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9775     {
9776         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9777         vFAIL("\\N{NAME} must be resolved by the lexer");
9778     }
9779
9780     if (endbrace == RExC_parse) {   /* empty: \N{} */
9781         bool ret = TRUE;
9782         if (node_p) {
9783             *node_p = reg_node(pRExC_state,NOTHING);
9784         }
9785         else if (in_char_class) {
9786             if (SIZE_ONLY && in_char_class) {
9787                 if (strict) {
9788                     RExC_parse++;   /* Position after the "}" */
9789                     vFAIL("Zero length \\N{}");
9790                 }
9791                 else {
9792                     ckWARNreg(RExC_parse,
9793                               "Ignoring zero length \\N{} in character class");
9794                 }
9795             }
9796             ret = FALSE;
9797         }
9798         else {
9799             return FALSE;
9800         }
9801         nextchar(pRExC_state);
9802         return ret;
9803     }
9804
9805     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9806     RExC_parse += 2;    /* Skip past the 'U+' */
9807
9808     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9809
9810     /* Code points are separated by dots.  If none, there is only one code
9811      * point, and is terminated by the brace */
9812     has_multiple_chars = (endchar < endbrace);
9813
9814     if (valuep && (! has_multiple_chars || in_char_class)) {
9815         /* We only pay attention to the first char of
9816         multichar strings being returned in char classes. I kinda wonder
9817         if this makes sense as it does change the behaviour
9818         from earlier versions, OTOH that behaviour was broken
9819         as well. XXX Solution is to recharacterize as
9820         [rest-of-class]|multi1|multi2... */
9821
9822         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9823         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9824             | PERL_SCAN_DISALLOW_PREFIX
9825             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9826
9827         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9828
9829         /* The tokenizer should have guaranteed validity, but it's possible to
9830          * bypass it by using single quoting, so check */
9831         if (length_of_hex == 0
9832             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9833         {
9834             RExC_parse += length_of_hex;        /* Includes all the valid */
9835             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9836                             ? UTF8SKIP(RExC_parse)
9837                             : 1;
9838             /* Guard against malformed utf8 */
9839             if (RExC_parse >= endchar) {
9840                 RExC_parse = endchar;
9841             }
9842             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9843         }
9844
9845         if (in_char_class && has_multiple_chars) {
9846             if (strict) {
9847                 RExC_parse = endbrace;
9848                 vFAIL("\\N{} in character class restricted to one character");
9849             }
9850             else {
9851                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9852             }
9853         }
9854
9855         RExC_parse = endbrace + 1;
9856     }
9857     else if (! node_p || ! has_multiple_chars) {
9858
9859         /* Here, the input is legal, but not according to the caller's
9860          * options.  We fail without advancing the parse, so that the
9861          * caller can try again */
9862         RExC_parse = p;
9863         return FALSE;
9864     }
9865     else {
9866
9867         /* What is done here is to convert this to a sub-pattern of the form
9868          * (?:\x{char1}\x{char2}...)
9869          * and then call reg recursively.  That way, it retains its atomicness,
9870          * while not having to worry about special handling that some code
9871          * points may have.  toke.c has converted the original Unicode values
9872          * to native, so that we can just pass on the hex values unchanged.  We
9873          * do have to set a flag to keep recoding from happening in the
9874          * recursion */
9875
9876         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9877         STRLEN len;
9878         char *orig_end = RExC_end;
9879         I32 flags;
9880
9881         while (RExC_parse < endbrace) {
9882
9883             /* Convert to notation the rest of the code understands */
9884             sv_catpv(substitute_parse, "\\x{");
9885             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9886             sv_catpv(substitute_parse, "}");
9887
9888             /* Point to the beginning of the next character in the sequence. */
9889             RExC_parse = endchar + 1;
9890             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9891         }
9892         sv_catpv(substitute_parse, ")");
9893
9894         RExC_parse = SvPV(substitute_parse, len);
9895
9896         /* Don't allow empty number */
9897         if (len < 8) {
9898             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9899         }
9900         RExC_end = RExC_parse + len;
9901
9902         /* The values are Unicode, and therefore not subject to recoding */
9903         RExC_override_recoding = 1;
9904
9905         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9906         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9907
9908         RExC_parse = endbrace;
9909         RExC_end = orig_end;
9910         RExC_override_recoding = 0;
9911
9912         nextchar(pRExC_state);
9913     }
9914
9915     return TRUE;
9916 }
9917
9918
9919 /*
9920  * reg_recode
9921  *
9922  * It returns the code point in utf8 for the value in *encp.
9923  *    value: a code value in the source encoding
9924  *    encp:  a pointer to an Encode object
9925  *
9926  * If the result from Encode is not a single character,
9927  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9928  */
9929 STATIC UV
9930 S_reg_recode(pTHX_ const char value, SV **encp)
9931 {
9932     STRLEN numlen = 1;
9933     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9934     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9935     const STRLEN newlen = SvCUR(sv);
9936     UV uv = UNICODE_REPLACEMENT;
9937
9938     PERL_ARGS_ASSERT_REG_RECODE;
9939
9940     if (newlen)
9941         uv = SvUTF8(sv)
9942              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9943              : *(U8*)s;
9944
9945     if (!newlen || numlen != newlen) {
9946         uv = UNICODE_REPLACEMENT;
9947         *encp = NULL;
9948     }
9949     return uv;
9950 }
9951
9952 PERL_STATIC_INLINE U8
9953 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9954 {
9955     U8 op;
9956
9957     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9958
9959     if (! FOLD) {
9960         return EXACT;
9961     }
9962
9963     op = get_regex_charset(RExC_flags);
9964     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9965         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9966                  been, so there is no hole */
9967     }
9968
9969     return op + EXACTF;
9970 }
9971
9972 PERL_STATIC_INLINE void
9973 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9974 {
9975     /* This knows the details about sizing an EXACTish node, setting flags for
9976      * it (by setting <*flagp>, and potentially populating it with a single
9977      * character.
9978      *
9979      * If <len> (the length in bytes) is non-zero, this function assumes that
9980      * the node has already been populated, and just does the sizing.  In this
9981      * case <code_point> should be the final code point that has already been
9982      * placed into the node.  This value will be ignored except that under some
9983      * circumstances <*flagp> is set based on it.
9984      *
9985      * If <len> is zero, the function assumes that the node is to contain only
9986      * the single character given by <code_point> and calculates what <len>
9987      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9988      * additionally will populate the node's STRING with <code_point>, if <len>
9989      * is 0.  In both cases <*flagp> is appropriately set
9990      *
9991      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9992      * folded (the latter only when the rules indicate it can match 'ss') */
9993
9994     bool len_passed_in = cBOOL(len != 0);
9995     U8 character[UTF8_MAXBYTES_CASE+1];
9996
9997     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9998
9999     if (! len_passed_in) {
10000         if (UTF) {
10001             if (FOLD) {
10002                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10003             }
10004             else {
10005                 uvchr_to_utf8( character, code_point);
10006                 len = UTF8SKIP(character);
10007             }
10008         }
10009         else if (! FOLD
10010                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10011                  || ASCII_FOLD_RESTRICTED
10012                  || ! AT_LEAST_UNI_SEMANTICS)
10013         {
10014             *character = (U8) code_point;
10015             len = 1;
10016         }
10017         else {
10018             *character = 's';
10019             *(character + 1) = 's';
10020             len = 2;
10021         }
10022     }
10023
10024     if (SIZE_ONLY) {
10025         RExC_size += STR_SZ(len);
10026     }
10027     else {
10028         RExC_emit += STR_SZ(len);
10029         STR_LEN(node) = len;
10030         if (! len_passed_in) {
10031             Copy((char *) character, STRING(node), len, char);
10032         }
10033     }
10034
10035     *flagp |= HASWIDTH;
10036
10037     /* A single character node is SIMPLE, except for the special-cased SHARP S
10038      * under /di. */
10039     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10040         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10041             || ! FOLD || ! DEPENDS_SEMANTICS))
10042     {
10043         *flagp |= SIMPLE;
10044     }
10045 }
10046
10047 /*
10048  - regatom - the lowest level
10049
10050    Try to identify anything special at the start of the pattern. If there
10051    is, then handle it as required. This may involve generating a single regop,
10052    such as for an assertion; or it may involve recursing, such as to
10053    handle a () structure.
10054
10055    If the string doesn't start with something special then we gobble up
10056    as much literal text as we can.
10057
10058    Once we have been able to handle whatever type of thing started the
10059    sequence, we return.
10060
10061    Note: we have to be careful with escapes, as they can be both literal
10062    and special, and in the case of \10 and friends, context determines which.
10063
10064    A summary of the code structure is:
10065
10066    switch (first_byte) {
10067         cases for each special:
10068             handle this special;
10069             break;
10070         case '\\':
10071             switch (2nd byte) {
10072                 cases for each unambiguous special:
10073                     handle this special;
10074                     break;
10075                 cases for each ambigous special/literal:
10076                     disambiguate;
10077                     if (special)  handle here
10078                     else goto defchar;
10079                 default: // unambiguously literal:
10080                     goto defchar;
10081             }
10082         default:  // is a literal char
10083             // FALL THROUGH
10084         defchar:
10085             create EXACTish node for literal;
10086             while (more input and node isn't full) {
10087                 switch (input_byte) {
10088                    cases for each special;
10089                        make sure parse pointer is set so that the next call to
10090                            regatom will see this special first
10091                        goto loopdone; // EXACTish node terminated by prev. char
10092                    default:
10093                        append char to EXACTISH node;
10094                 }
10095                 get next input byte;
10096             }
10097         loopdone:
10098    }
10099    return the generated node;
10100
10101    Specifically there are two separate switches for handling
10102    escape sequences, with the one for handling literal escapes requiring
10103    a dummy entry for all of the special escapes that are actually handled
10104    by the other.
10105 */
10106
10107 STATIC regnode *
10108 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10109 {
10110     dVAR;
10111     regnode *ret = NULL;
10112     I32 flags = 0;
10113     char *parse_start = RExC_parse;
10114     U8 op;
10115     int invert = 0;
10116
10117     GET_RE_DEBUG_FLAGS_DECL;
10118
10119     *flagp = WORST;             /* Tentatively. */
10120
10121     DEBUG_PARSE("atom");
10122
10123     PERL_ARGS_ASSERT_REGATOM;
10124
10125 tryagain:
10126     switch ((U8)*RExC_parse) {
10127     case '^':
10128         RExC_seen_zerolen++;
10129         nextchar(pRExC_state);
10130         if (RExC_flags & RXf_PMf_MULTILINE)
10131             ret = reg_node(pRExC_state, MBOL);
10132         else if (RExC_flags & RXf_PMf_SINGLELINE)
10133             ret = reg_node(pRExC_state, SBOL);
10134         else
10135             ret = reg_node(pRExC_state, BOL);
10136         Set_Node_Length(ret, 1); /* MJD */
10137         break;
10138     case '$':
10139         nextchar(pRExC_state);
10140         if (*RExC_parse)
10141             RExC_seen_zerolen++;
10142         if (RExC_flags & RXf_PMf_MULTILINE)
10143             ret = reg_node(pRExC_state, MEOL);
10144         else if (RExC_flags & RXf_PMf_SINGLELINE)
10145             ret = reg_node(pRExC_state, SEOL);
10146         else
10147             ret = reg_node(pRExC_state, EOL);
10148         Set_Node_Length(ret, 1); /* MJD */
10149         break;
10150     case '.':
10151         nextchar(pRExC_state);
10152         if (RExC_flags & RXf_PMf_SINGLELINE)
10153             ret = reg_node(pRExC_state, SANY);
10154         else
10155             ret = reg_node(pRExC_state, REG_ANY);
10156         *flagp |= HASWIDTH|SIMPLE;
10157         RExC_naughty++;
10158         Set_Node_Length(ret, 1); /* MJD */
10159         break;
10160     case '[':
10161     {
10162         char * const oregcomp_parse = ++RExC_parse;
10163         ret = regclass(pRExC_state, flagp,depth+1,
10164                        FALSE, /* means parse the whole char class */
10165                        TRUE, /* allow multi-char folds */
10166                        FALSE, /* don't silence non-portable warnings. */
10167                        NULL);
10168         if (*RExC_parse != ']') {
10169             RExC_parse = oregcomp_parse;
10170             vFAIL("Unmatched [");
10171         }
10172         nextchar(pRExC_state);
10173         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10174         break;
10175     }
10176     case '(':
10177         nextchar(pRExC_state);
10178         ret = reg(pRExC_state, 1, &flags,depth+1);
10179         if (ret == NULL) {
10180                 if (flags & TRYAGAIN) {
10181                     if (RExC_parse == RExC_end) {
10182                          /* Make parent create an empty node if needed. */
10183                         *flagp |= TRYAGAIN;
10184                         return(NULL);
10185                     }
10186                     goto tryagain;
10187                 }
10188                 return(NULL);
10189         }
10190         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10191         break;
10192     case '|':
10193     case ')':
10194         if (flags & TRYAGAIN) {
10195             *flagp |= TRYAGAIN;
10196             return NULL;
10197         }
10198         vFAIL("Internal urp");
10199                                 /* Supposed to be caught earlier. */
10200         break;
10201     case '{':
10202         if (!regcurly(RExC_parse, FALSE)) {
10203             RExC_parse++;
10204             goto defchar;
10205         }
10206         /* FALL THROUGH */
10207     case '?':
10208     case '+':
10209     case '*':
10210         RExC_parse++;
10211         vFAIL("Quantifier follows nothing");
10212         break;
10213     case '\\':
10214         /* Special Escapes
10215
10216            This switch handles escape sequences that resolve to some kind
10217            of special regop and not to literal text. Escape sequnces that
10218            resolve to literal text are handled below in the switch marked
10219            "Literal Escapes".
10220
10221            Every entry in this switch *must* have a corresponding entry
10222            in the literal escape switch. However, the opposite is not
10223            required, as the default for this switch is to jump to the
10224            literal text handling code.
10225         */
10226         switch ((U8)*++RExC_parse) {
10227             U8 arg;
10228         /* Special Escapes */
10229         case 'A':
10230             RExC_seen_zerolen++;
10231             ret = reg_node(pRExC_state, SBOL);
10232             *flagp |= SIMPLE;
10233             goto finish_meta_pat;
10234         case 'G':
10235             ret = reg_node(pRExC_state, GPOS);
10236             RExC_seen |= REG_SEEN_GPOS;
10237             *flagp |= SIMPLE;
10238             goto finish_meta_pat;
10239         case 'K':
10240             RExC_seen_zerolen++;
10241             ret = reg_node(pRExC_state, KEEPS);
10242             *flagp |= SIMPLE;
10243             /* XXX:dmq : disabling in-place substitution seems to
10244              * be necessary here to avoid cases of memory corruption, as
10245              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10246              */
10247             RExC_seen |= REG_SEEN_LOOKBEHIND;
10248             goto finish_meta_pat;
10249         case 'Z':
10250             ret = reg_node(pRExC_state, SEOL);
10251             *flagp |= SIMPLE;
10252             RExC_seen_zerolen++;                /* Do not optimize RE away */
10253             goto finish_meta_pat;
10254         case 'z':
10255             ret = reg_node(pRExC_state, EOS);
10256             *flagp |= SIMPLE;
10257             RExC_seen_zerolen++;                /* Do not optimize RE away */
10258             goto finish_meta_pat;
10259         case 'C':
10260             ret = reg_node(pRExC_state, CANY);
10261             RExC_seen |= REG_SEEN_CANY;
10262             *flagp |= HASWIDTH|SIMPLE;
10263             goto finish_meta_pat;
10264         case 'X':
10265             ret = reg_node(pRExC_state, CLUMP);
10266             *flagp |= HASWIDTH;
10267             goto finish_meta_pat;
10268
10269         case 'W':
10270             invert = 1;
10271             /* FALLTHROUGH */
10272         case 'w':
10273             arg = ANYOF_WORDCHAR;
10274             goto join_posix;
10275
10276         case 'b':
10277             RExC_seen_zerolen++;
10278             RExC_seen |= REG_SEEN_LOOKBEHIND;
10279             op = BOUND + get_regex_charset(RExC_flags);
10280             if (op > BOUNDA) {  /* /aa is same as /a */
10281                 op = BOUNDA;
10282             }
10283             ret = reg_node(pRExC_state, op);
10284             FLAGS(ret) = get_regex_charset(RExC_flags);
10285             *flagp |= SIMPLE;
10286             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10287                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10288             }
10289             goto finish_meta_pat;
10290         case 'B':
10291             RExC_seen_zerolen++;
10292             RExC_seen |= REG_SEEN_LOOKBEHIND;
10293             op = NBOUND + get_regex_charset(RExC_flags);
10294             if (op > NBOUNDA) { /* /aa is same as /a */
10295                 op = NBOUNDA;
10296             }
10297             ret = reg_node(pRExC_state, op);
10298             FLAGS(ret) = get_regex_charset(RExC_flags);
10299             *flagp |= SIMPLE;
10300             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10301                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10302             }
10303             goto finish_meta_pat;
10304
10305         case 'D':
10306             invert = 1;
10307             /* FALLTHROUGH */
10308         case 'd':
10309             arg = ANYOF_DIGIT;
10310             goto join_posix;
10311
10312         case 'R':
10313             ret = reg_node(pRExC_state, LNBREAK);
10314             *flagp |= HASWIDTH|SIMPLE;
10315             goto finish_meta_pat;
10316
10317         case 'H':
10318             invert = 1;
10319             /* FALLTHROUGH */
10320         case 'h':
10321             arg = ANYOF_BLANK;
10322             op = POSIXU;
10323             goto join_posix_op_known;
10324
10325         case 'V':
10326             invert = 1;
10327             /* FALLTHROUGH */
10328         case 'v':
10329             arg = ANYOF_VERTWS;
10330             op = POSIXU;
10331             goto join_posix_op_known;
10332
10333         case 'S':
10334             invert = 1;
10335             /* FALLTHROUGH */
10336         case 's':
10337             arg = ANYOF_SPACE;
10338
10339         join_posix:
10340
10341             op = POSIXD + get_regex_charset(RExC_flags);
10342             if (op > POSIXA) {  /* /aa is same as /a */
10343                 op = POSIXA;
10344             }
10345
10346         join_posix_op_known:
10347
10348             if (invert) {
10349                 op += NPOSIXD - POSIXD;
10350             }
10351
10352             ret = reg_node(pRExC_state, op);
10353             if (! SIZE_ONLY) {
10354                 FLAGS(ret) = namedclass_to_classnum(arg);
10355             }
10356
10357             *flagp |= HASWIDTH|SIMPLE;
10358             /* FALL THROUGH */
10359
10360          finish_meta_pat:
10361             nextchar(pRExC_state);
10362             Set_Node_Length(ret, 2); /* MJD */
10363             break;
10364         case 'p':
10365         case 'P':
10366             {
10367 #ifdef DEBUGGING
10368                 char* parse_start = RExC_parse - 2;
10369 #endif
10370
10371                 RExC_parse--;
10372
10373                 ret = regclass(pRExC_state, flagp,depth+1,
10374                                TRUE, /* means just parse this element */
10375                                FALSE, /* don't allow multi-char folds */
10376                                FALSE, /* don't silence non-portable warnings.
10377                                          It would be a bug if these returned
10378                                          non-portables */
10379                                NULL);
10380
10381                 RExC_parse--;
10382
10383                 Set_Node_Offset(ret, parse_start + 2);
10384                 Set_Node_Cur_Length(ret);
10385                 nextchar(pRExC_state);
10386             }
10387             break;
10388         case 'N':
10389             /* Handle \N and \N{NAME} with multiple code points here and not
10390              * below because it can be multicharacter. join_exact() will join
10391              * them up later on.  Also this makes sure that things like
10392              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10393              * The options to the grok function call causes it to fail if the
10394              * sequence is just a single code point.  We then go treat it as
10395              * just another character in the current EXACT node, and hence it
10396              * gets uniform treatment with all the other characters.  The
10397              * special treatment for quantifiers is not needed for such single
10398              * character sequences */
10399             ++RExC_parse;
10400             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10401                                 FALSE /* not strict */ )) {
10402                 RExC_parse--;
10403                 goto defchar;
10404             }
10405             break;
10406         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10407         parse_named_seq:
10408         {
10409             char ch= RExC_parse[1];
10410             if (ch != '<' && ch != '\'' && ch != '{') {
10411                 RExC_parse++;
10412                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10413             } else {
10414                 /* this pretty much dupes the code for (?P=...) in reg(), if
10415                    you change this make sure you change that */
10416                 char* name_start = (RExC_parse += 2);
10417                 U32 num = 0;
10418                 SV *sv_dat = reg_scan_name(pRExC_state,
10419                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10420                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10421                 if (RExC_parse == name_start || *RExC_parse != ch)
10422                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10423
10424                 if (!SIZE_ONLY) {
10425                     num = add_data( pRExC_state, 1, "S" );
10426                     RExC_rxi->data->data[num]=(void*)sv_dat;
10427                     SvREFCNT_inc_simple_void(sv_dat);
10428                 }
10429
10430                 RExC_sawback = 1;
10431                 ret = reganode(pRExC_state,
10432                                ((! FOLD)
10433                                  ? NREF
10434                                  : (ASCII_FOLD_RESTRICTED)
10435                                    ? NREFFA
10436                                    : (AT_LEAST_UNI_SEMANTICS)
10437                                      ? NREFFU
10438                                      : (LOC)
10439                                        ? NREFFL
10440                                        : NREFF),
10441                                 num);
10442                 *flagp |= HASWIDTH;
10443
10444                 /* override incorrect value set in reganode MJD */
10445                 Set_Node_Offset(ret, parse_start+1);
10446                 Set_Node_Cur_Length(ret); /* MJD */
10447                 nextchar(pRExC_state);
10448
10449             }
10450             break;
10451         }
10452         case 'g':
10453         case '1': case '2': case '3': case '4':
10454         case '5': case '6': case '7': case '8': case '9':
10455             {
10456                 I32 num;
10457                 bool isg = *RExC_parse == 'g';
10458                 bool isrel = 0;
10459                 bool hasbrace = 0;
10460                 if (isg) {
10461                     RExC_parse++;
10462                     if (*RExC_parse == '{') {
10463                         RExC_parse++;
10464                         hasbrace = 1;
10465                     }
10466                     if (*RExC_parse == '-') {
10467                         RExC_parse++;
10468                         isrel = 1;
10469                     }
10470                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10471                         if (isrel) RExC_parse--;
10472                         RExC_parse -= 2;
10473                         goto parse_named_seq;
10474                 }   }
10475                 num = atoi(RExC_parse);
10476                 if (isg && num == 0)
10477                     vFAIL("Reference to invalid group 0");
10478                 if (isrel) {
10479                     num = RExC_npar - num;
10480                     if (num < 1)
10481                         vFAIL("Reference to nonexistent or unclosed group");
10482                 }
10483                 if (!isg && num > 9 && num >= RExC_npar)
10484                     /* Probably a character specified in octal, e.g. \35 */
10485                     goto defchar;
10486                 else {
10487                     char * const parse_start = RExC_parse - 1; /* MJD */
10488                     while (isDIGIT(*RExC_parse))
10489                         RExC_parse++;
10490                     if (parse_start == RExC_parse - 1)
10491                         vFAIL("Unterminated \\g... pattern");
10492                     if (hasbrace) {
10493                         if (*RExC_parse != '}')
10494                             vFAIL("Unterminated \\g{...} pattern");
10495                         RExC_parse++;
10496                     }
10497                     if (!SIZE_ONLY) {
10498                         if (num > (I32)RExC_rx->nparens)
10499                             vFAIL("Reference to nonexistent group");
10500                     }
10501                     RExC_sawback = 1;
10502                     ret = reganode(pRExC_state,
10503                                    ((! FOLD)
10504                                      ? REF
10505                                      : (ASCII_FOLD_RESTRICTED)
10506                                        ? REFFA
10507                                        : (AT_LEAST_UNI_SEMANTICS)
10508                                          ? REFFU
10509                                          : (LOC)
10510                                            ? REFFL
10511                                            : REFF),
10512                                     num);
10513                     *flagp |= HASWIDTH;
10514
10515                     /* override incorrect value set in reganode MJD */
10516                     Set_Node_Offset(ret, parse_start+1);
10517                     Set_Node_Cur_Length(ret); /* MJD */
10518                     RExC_parse--;
10519                     nextchar(pRExC_state);
10520                 }
10521             }
10522             break;
10523         case '\0':
10524             if (RExC_parse >= RExC_end)
10525                 FAIL("Trailing \\");
10526             /* FALL THROUGH */
10527         default:
10528             /* Do not generate "unrecognized" warnings here, we fall
10529                back into the quick-grab loop below */
10530             parse_start--;
10531             goto defchar;
10532         }
10533         break;
10534
10535     case '#':
10536         if (RExC_flags & RXf_PMf_EXTENDED) {
10537             if ( reg_skipcomment( pRExC_state ) )
10538                 goto tryagain;
10539         }
10540         /* FALL THROUGH */
10541
10542     default:
10543
10544             parse_start = RExC_parse - 1;
10545
10546             RExC_parse++;
10547
10548         defchar: {
10549             STRLEN len = 0;
10550             UV ender;
10551             char *p;
10552             char *s;
10553 #define MAX_NODE_STRING_SIZE 127
10554             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10555             char *s0;
10556             U8 upper_parse = MAX_NODE_STRING_SIZE;
10557             STRLEN foldlen;
10558             U8 node_type;
10559             bool next_is_quantifier;
10560             char * oldp = NULL;
10561
10562             /* If a folding node contains only code points that don't
10563              * participate in folds, it can be changed into an EXACT node,
10564              * which allows the optimizer more things to look for */
10565             bool maybe_exact;
10566
10567             ender = 0;
10568             node_type = compute_EXACTish(pRExC_state);
10569             ret = reg_node(pRExC_state, node_type);
10570
10571             /* In pass1, folded, we use a temporary buffer instead of the
10572              * actual node, as the node doesn't exist yet */
10573             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10574
10575             s0 = s;
10576
10577         reparse:
10578
10579             /* We do the EXACTFish to EXACT node only if folding, and not if in
10580              * locale, as whether a character folds or not isn't known until
10581              * runtime */
10582             maybe_exact = FOLD && ! LOC;
10583
10584             /* XXX The node can hold up to 255 bytes, yet this only goes to
10585              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10586              * 255 allows us to not have to worry about overflow due to
10587              * converting to utf8 and fold expansion, but that value is
10588              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10589              * split up by this limit into a single one using the real max of
10590              * 255.  Even at 127, this breaks under rare circumstances.  If
10591              * folding, we do not want to split a node at a character that is a
10592              * non-final in a multi-char fold, as an input string could just
10593              * happen to want to match across the node boundary.  The join
10594              * would solve that problem if the join actually happens.  But a
10595              * series of more than two nodes in a row each of 127 would cause
10596              * the first join to succeed to get to 254, but then there wouldn't
10597              * be room for the next one, which could at be one of those split
10598              * multi-char folds.  I don't know of any fool-proof solution.  One
10599              * could back off to end with only a code point that isn't such a
10600              * non-final, but it is possible for there not to be any in the
10601              * entire node. */
10602             for (p = RExC_parse - 1;
10603                  len < upper_parse && p < RExC_end;
10604                  len++)
10605             {
10606                 oldp = p;
10607
10608                 if (RExC_flags & RXf_PMf_EXTENDED)
10609                     p = regwhite( pRExC_state, p );
10610                 switch ((U8)*p) {
10611                 case '^':
10612                 case '$':
10613                 case '.':
10614                 case '[':
10615                 case '(':
10616                 case ')':
10617                 case '|':
10618                     goto loopdone;
10619                 case '\\':
10620                     /* Literal Escapes Switch
10621
10622                        This switch is meant to handle escape sequences that
10623                        resolve to a literal character.
10624
10625                        Every escape sequence that represents something
10626                        else, like an assertion or a char class, is handled
10627                        in the switch marked 'Special Escapes' above in this
10628                        routine, but also has an entry here as anything that
10629                        isn't explicitly mentioned here will be treated as
10630                        an unescaped equivalent literal.
10631                     */
10632
10633                     switch ((U8)*++p) {
10634                     /* These are all the special escapes. */
10635                     case 'A':             /* Start assertion */
10636                     case 'b': case 'B':   /* Word-boundary assertion*/
10637                     case 'C':             /* Single char !DANGEROUS! */
10638                     case 'd': case 'D':   /* digit class */
10639                     case 'g': case 'G':   /* generic-backref, pos assertion */
10640                     case 'h': case 'H':   /* HORIZWS */
10641                     case 'k': case 'K':   /* named backref, keep marker */
10642                     case 'p': case 'P':   /* Unicode property */
10643                               case 'R':   /* LNBREAK */
10644                     case 's': case 'S':   /* space class */
10645                     case 'v': case 'V':   /* VERTWS */
10646                     case 'w': case 'W':   /* word class */
10647                     case 'X':             /* eXtended Unicode "combining character sequence" */
10648                     case 'z': case 'Z':   /* End of line/string assertion */
10649                         --p;
10650                         goto loopdone;
10651
10652                     /* Anything after here is an escape that resolves to a
10653                        literal. (Except digits, which may or may not)
10654                      */
10655                     case 'n':
10656                         ender = '\n';
10657                         p++;
10658                         break;
10659                     case 'N': /* Handle a single-code point named character. */
10660                         /* The options cause it to fail if a multiple code
10661                          * point sequence.  Handle those in the switch() above
10662                          * */
10663                         RExC_parse = p + 1;
10664                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10665                                             flagp, depth, FALSE,
10666                                             FALSE /* not strict */ ))
10667                         {
10668                             RExC_parse = p = oldp;
10669                             goto loopdone;
10670                         }
10671                         p = RExC_parse;
10672                         if (ender > 0xff) {
10673                             REQUIRE_UTF8;
10674                         }
10675                         break;
10676                     case 'r':
10677                         ender = '\r';
10678                         p++;
10679                         break;
10680                     case 't':
10681                         ender = '\t';
10682                         p++;
10683                         break;
10684                     case 'f':
10685                         ender = '\f';
10686                         p++;
10687                         break;
10688                     case 'e':
10689                           ender = ASCII_TO_NATIVE('\033');
10690                         p++;
10691                         break;
10692                     case 'a':
10693                           ender = ASCII_TO_NATIVE('\007');
10694                         p++;
10695                         break;
10696                     case 'o':
10697                         {
10698                             UV result;
10699                             const char* error_msg;
10700
10701                             bool valid = grok_bslash_o(&p,
10702                                                        &result,
10703                                                        &error_msg,
10704                                                        TRUE, /* out warnings */
10705                                                        FALSE, /* not strict */
10706                                                        TRUE, /* Output warnings
10707                                                                 for non-
10708                                                                 portables */
10709                                                        UTF);
10710                             if (! valid) {
10711                                 RExC_parse = p; /* going to die anyway; point
10712                                                    to exact spot of failure */
10713                                 vFAIL(error_msg);
10714                             }
10715                             ender = result;
10716                             if (PL_encoding && ender < 0x100) {
10717                                 goto recode_encoding;
10718                             }
10719                             if (ender > 0xff) {
10720                                 REQUIRE_UTF8;
10721                             }
10722                             break;
10723                         }
10724                     case 'x':
10725                         {
10726                             UV result = UV_MAX; /* initialize to erroneous
10727                                                    value */
10728                             const char* error_msg;
10729
10730                             bool valid = grok_bslash_x(&p,
10731                                                        &result,
10732                                                        &error_msg,
10733                                                        TRUE, /* out warnings */
10734                                                        FALSE, /* not strict */
10735                                                        TRUE, /* Output warnings
10736                                                                 for non-
10737                                                                 portables */
10738                                                        UTF);
10739                             if (! valid) {
10740                                 RExC_parse = p; /* going to die anyway; point
10741                                                    to exact spot of failure */
10742                                 vFAIL(error_msg);
10743                             }
10744                             ender = result;
10745
10746                             if (PL_encoding && ender < 0x100) {
10747                                 goto recode_encoding;
10748                             }
10749                             if (ender > 0xff) {
10750                                 REQUIRE_UTF8;
10751                             }
10752                             break;
10753                         }
10754                     case 'c':
10755                         p++;
10756                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10757                         break;
10758                     case '0': case '1': case '2': case '3':case '4':
10759                     case '5': case '6': case '7':
10760                         if (*p == '0' ||
10761                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10762                         {
10763                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10764                             STRLEN numlen = 3;
10765                             ender = grok_oct(p, &numlen, &flags, NULL);
10766                             if (ender > 0xff) {
10767                                 REQUIRE_UTF8;
10768                             }
10769                             p += numlen;
10770                             if (SIZE_ONLY   /* like \08, \178 */
10771                                 && numlen < 3
10772                                 && p < RExC_end
10773                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10774                             {
10775                                 reg_warn_non_literal_string(
10776                                          p + 1,
10777                                          form_short_octal_warning(p, numlen));
10778                             }
10779                         }
10780                         else {  /* Not to be treated as an octal constant, go
10781                                    find backref */
10782                             --p;
10783                             goto loopdone;
10784                         }
10785                         if (PL_encoding && ender < 0x100)
10786                             goto recode_encoding;
10787                         break;
10788                     recode_encoding:
10789                         if (! RExC_override_recoding) {
10790                             SV* enc = PL_encoding;
10791                             ender = reg_recode((const char)(U8)ender, &enc);
10792                             if (!enc && SIZE_ONLY)
10793                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10794                             REQUIRE_UTF8;
10795                         }
10796                         break;
10797                     case '\0':
10798                         if (p >= RExC_end)
10799                             FAIL("Trailing \\");
10800                         /* FALL THROUGH */
10801                     default:
10802                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10803                             /* Include any { following the alpha to emphasize
10804                              * that it could be part of an escape at some point
10805                              * in the future */
10806                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10807                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10808                         }
10809                         goto normal_default;
10810                     } /* End of switch on '\' */
10811                     break;
10812                 default:    /* A literal character */
10813
10814                     if (! SIZE_ONLY
10815                         && RExC_flags & RXf_PMf_EXTENDED
10816                         && ckWARN(WARN_DEPRECATED)
10817                         && is_PATWS_non_low(p, UTF))
10818                     {
10819                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
10820                                 "Escape literal pattern white space under /x");
10821                     }
10822
10823                   normal_default:
10824                     if (UTF8_IS_START(*p) && UTF) {
10825                         STRLEN numlen;
10826                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10827                                                &numlen, UTF8_ALLOW_DEFAULT);
10828                         p += numlen;
10829                     }
10830                     else
10831                         ender = (U8) *p++;
10832                     break;
10833                 } /* End of switch on the literal */
10834
10835                 /* Here, have looked at the literal character and <ender>
10836                  * contains its ordinal, <p> points to the character after it
10837                  */
10838
10839                 if ( RExC_flags & RXf_PMf_EXTENDED)
10840                     p = regwhite( pRExC_state, p );
10841
10842                 /* If the next thing is a quantifier, it applies to this
10843                  * character only, which means that this character has to be in
10844                  * its own node and can't just be appended to the string in an
10845                  * existing node, so if there are already other characters in
10846                  * the node, close the node with just them, and set up to do
10847                  * this character again next time through, when it will be the
10848                  * only thing in its new node */
10849                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10850                 {
10851                     p = oldp;
10852                     goto loopdone;
10853                 }
10854
10855                 if (FOLD) {
10856                     if (UTF
10857                             /* See comments for join_exact() as to why we fold
10858                              * this non-UTF at compile time */
10859                         || (node_type == EXACTFU
10860                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10861                     {
10862
10863
10864                         /* Prime the casefolded buffer.  Locale rules, which
10865                          * apply only to code points < 256, aren't known until
10866                          * execution, so for them, just output the original
10867                          * character using utf8.  If we start to fold non-UTF
10868                          * patterns, be sure to update join_exact() */
10869                         if (LOC && ender < 256) {
10870                             if (UNI_IS_INVARIANT(ender)) {
10871                                 *s = (U8) ender;
10872                                 foldlen = 1;
10873                             } else {
10874                                 *s = UTF8_TWO_BYTE_HI(ender);
10875                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10876                                 foldlen = 2;
10877                             }
10878                         }
10879                         else {
10880                             UV folded = _to_uni_fold_flags(
10881                                            ender,
10882                                            (U8 *) s,
10883                                            &foldlen,
10884                                            FOLD_FLAGS_FULL
10885                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10886                                                     : (ASCII_FOLD_RESTRICTED)
10887                                                       ? FOLD_FLAGS_NOMIX_ASCII
10888                                                       : 0)
10889                                             );
10890
10891                             /* If this node only contains non-folding code
10892                              * points so far, see if this new one is also
10893                              * non-folding */
10894                             if (maybe_exact) {
10895                                 if (folded != ender) {
10896                                     maybe_exact = FALSE;
10897                                 }
10898                                 else {
10899                                     /* Here the fold is the original; we have
10900                                      * to check further to see if anything
10901                                      * folds to it */
10902                                     if (! PL_utf8_foldable) {
10903                                         SV* swash = swash_init("utf8",
10904                                                            "_Perl_Any_Folds",
10905                                                            &PL_sv_undef, 1, 0);
10906                                         PL_utf8_foldable =
10907                                                     _get_swash_invlist(swash);
10908                                         SvREFCNT_dec_NN(swash);
10909                                     }
10910                                     if (_invlist_contains_cp(PL_utf8_foldable,
10911                                                              ender))
10912                                     {
10913                                         maybe_exact = FALSE;
10914                                     }
10915                                 }
10916                             }
10917                             ender = folded;
10918                         }
10919                         s += foldlen;
10920
10921                         /* The loop increments <len> each time, as all but this
10922                          * path (and the one just below for UTF) through it add
10923                          * a single byte to the EXACTish node.  But this one
10924                          * has changed len to be the correct final value, so
10925                          * subtract one to cancel out the increment that
10926                          * follows */
10927                         len += foldlen - 1;
10928                     }
10929                     else {
10930                         *(s++) = (char) ender;
10931                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10932                     }
10933                 }
10934                 else if (UTF) {
10935                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10936                     if (unilen > 0) {
10937                        s   += unilen;
10938                        len += unilen;
10939                     }
10940
10941                     /* See comment just above for - 1 */
10942                     len--;
10943                 }
10944                 else {
10945                     REGC((char)ender, s++);
10946                 }
10947
10948                 if (next_is_quantifier) {
10949
10950                     /* Here, the next input is a quantifier, and to get here,
10951                      * the current character is the only one in the node.
10952                      * Also, here <len> doesn't include the final byte for this
10953                      * character */
10954                     len++;
10955                     goto loopdone;
10956                 }
10957
10958             } /* End of loop through literal characters */
10959
10960             /* Here we have either exhausted the input or ran out of room in
10961              * the node.  (If we encountered a character that can't be in the
10962              * node, transfer is made directly to <loopdone>, and so we
10963              * wouldn't have fallen off the end of the loop.)  In the latter
10964              * case, we artificially have to split the node into two, because
10965              * we just don't have enough space to hold everything.  This
10966              * creates a problem if the final character participates in a
10967              * multi-character fold in the non-final position, as a match that
10968              * should have occurred won't, due to the way nodes are matched,
10969              * and our artificial boundary.  So back off until we find a non-
10970              * problematic character -- one that isn't at the beginning or
10971              * middle of such a fold.  (Either it doesn't participate in any
10972              * folds, or appears only in the final position of all the folds it
10973              * does participate in.)  A better solution with far fewer false
10974              * positives, and that would fill the nodes more completely, would
10975              * be to actually have available all the multi-character folds to
10976              * test against, and to back-off only far enough to be sure that
10977              * this node isn't ending with a partial one.  <upper_parse> is set
10978              * further below (if we need to reparse the node) to include just
10979              * up through that final non-problematic character that this code
10980              * identifies, so when it is set to less than the full node, we can
10981              * skip the rest of this */
10982             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10983
10984                 const STRLEN full_len = len;
10985
10986                 assert(len >= MAX_NODE_STRING_SIZE);
10987
10988                 /* Here, <s> points to the final byte of the final character.
10989                  * Look backwards through the string until find a non-
10990                  * problematic character */
10991
10992                 if (! UTF) {
10993
10994                     /* These two have no multi-char folds to non-UTF characters
10995                      */
10996                     if (ASCII_FOLD_RESTRICTED || LOC) {
10997                         goto loopdone;
10998                     }
10999
11000                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11001                     len = s - s0 + 1;
11002                 }
11003                 else {
11004                     if (!  PL_NonL1NonFinalFold) {
11005                         PL_NonL1NonFinalFold = _new_invlist_C_array(
11006                                         NonL1_Perl_Non_Final_Folds_invlist);
11007                     }
11008
11009                     /* Point to the first byte of the final character */
11010                     s = (char *) utf8_hop((U8 *) s, -1);
11011
11012                     while (s >= s0) {   /* Search backwards until find
11013                                            non-problematic char */
11014                         if (UTF8_IS_INVARIANT(*s)) {
11015
11016                             /* There are no ascii characters that participate
11017                              * in multi-char folds under /aa.  In EBCDIC, the
11018                              * non-ascii invariants are all control characters,
11019                              * so don't ever participate in any folds. */
11020                             if (ASCII_FOLD_RESTRICTED
11021                                 || ! IS_NON_FINAL_FOLD(*s))
11022                             {
11023                                 break;
11024                             }
11025                         }
11026                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11027
11028                             /* No Latin1 characters participate in multi-char
11029                              * folds under /l */
11030                             if (LOC
11031                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11032                                                                 *s, *(s+1))))
11033                             {
11034                                 break;
11035                             }
11036                         }
11037                         else if (! _invlist_contains_cp(
11038                                         PL_NonL1NonFinalFold,
11039                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11040                         {
11041                             break;
11042                         }
11043
11044                         /* Here, the current character is problematic in that
11045                          * it does occur in the non-final position of some
11046                          * fold, so try the character before it, but have to
11047                          * special case the very first byte in the string, so
11048                          * we don't read outside the string */
11049                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11050                     } /* End of loop backwards through the string */
11051
11052                     /* If there were only problematic characters in the string,
11053                      * <s> will point to before s0, in which case the length
11054                      * should be 0, otherwise include the length of the
11055                      * non-problematic character just found */
11056                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11057                 }
11058
11059                 /* Here, have found the final character, if any, that is
11060                  * non-problematic as far as ending the node without splitting
11061                  * it across a potential multi-char fold.  <len> contains the
11062                  * number of bytes in the node up-to and including that
11063                  * character, or is 0 if there is no such character, meaning
11064                  * the whole node contains only problematic characters.  In
11065                  * this case, give up and just take the node as-is.  We can't
11066                  * do any better */
11067                 if (len == 0) {
11068                     len = full_len;
11069                 } else {
11070
11071                     /* Here, the node does contain some characters that aren't
11072                      * problematic.  If one such is the final character in the
11073                      * node, we are done */
11074                     if (len == full_len) {
11075                         goto loopdone;
11076                     }
11077                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11078
11079                         /* If the final character is problematic, but the
11080                          * penultimate is not, back-off that last character to
11081                          * later start a new node with it */
11082                         p = oldp;
11083                         goto loopdone;
11084                     }
11085
11086                     /* Here, the final non-problematic character is earlier
11087                      * in the input than the penultimate character.  What we do
11088                      * is reparse from the beginning, going up only as far as
11089                      * this final ok one, thus guaranteeing that the node ends
11090                      * in an acceptable character.  The reason we reparse is
11091                      * that we know how far in the character is, but we don't
11092                      * know how to correlate its position with the input parse.
11093                      * An alternate implementation would be to build that
11094                      * correlation as we go along during the original parse,
11095                      * but that would entail extra work for every node, whereas
11096                      * this code gets executed only when the string is too
11097                      * large for the node, and the final two characters are
11098                      * problematic, an infrequent occurrence.  Yet another
11099                      * possible strategy would be to save the tail of the
11100                      * string, and the next time regatom is called, initialize
11101                      * with that.  The problem with this is that unless you
11102                      * back off one more character, you won't be guaranteed
11103                      * regatom will get called again, unless regbranch,
11104                      * regpiece ... are also changed.  If you do back off that
11105                      * extra character, so that there is input guaranteed to
11106                      * force calling regatom, you can't handle the case where
11107                      * just the first character in the node is acceptable.  I
11108                      * (khw) decided to try this method which doesn't have that
11109                      * pitfall; if performance issues are found, we can do a
11110                      * combination of the current approach plus that one */
11111                     upper_parse = len;
11112                     len = 0;
11113                     s = s0;
11114                     goto reparse;
11115                 }
11116             }   /* End of verifying node ends with an appropriate char */
11117
11118         loopdone:   /* Jumped to when encounters something that shouldn't be in
11119                        the node */
11120
11121             /* If 'maybe_exact' is still set here, means there are no
11122              * code points in the node that participate in folds */
11123             if (FOLD && maybe_exact) {
11124                 OP(ret) = EXACT;
11125             }
11126
11127             /* I (khw) don't know if you can get here with zero length, but the
11128              * old code handled this situation by creating a zero-length EXACT
11129              * node.  Might as well be NOTHING instead */
11130             if (len == 0) {
11131                 OP(ret) = NOTHING;
11132             }
11133             else{
11134                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11135             }
11136
11137             RExC_parse = p - 1;
11138             Set_Node_Cur_Length(ret); /* MJD */
11139             nextchar(pRExC_state);
11140             {
11141                 /* len is STRLEN which is unsigned, need to copy to signed */
11142                 IV iv = len;
11143                 if (iv < 0)
11144                     vFAIL("Internal disaster");
11145             }
11146
11147         } /* End of label 'defchar:' */
11148         break;
11149     } /* End of giant switch on input character */
11150
11151     return(ret);
11152 }
11153
11154 STATIC char *
11155 S_regwhite( RExC_state_t *pRExC_state, char *p )
11156 {
11157     const char *e = RExC_end;
11158
11159     PERL_ARGS_ASSERT_REGWHITE;
11160
11161     while (p < e) {
11162         if (isSPACE(*p))
11163             ++p;
11164         else if (*p == '#') {
11165             bool ended = 0;
11166             do {
11167                 if (*p++ == '\n') {
11168                     ended = 1;
11169                     break;
11170                 }
11171             } while (p < e);
11172             if (!ended)
11173                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11174         }
11175         else
11176             break;
11177     }
11178     return p;
11179 }
11180
11181 STATIC char *
11182 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11183 {
11184     /* Returns the next non-pattern-white space, non-comment character (the
11185      * latter only if 'recognize_comment is true) in the string p, which is
11186      * ended by RExC_end.  If there is no line break ending a comment,
11187      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11188     const char *e = RExC_end;
11189
11190     PERL_ARGS_ASSERT_REGPATWS;
11191
11192     while (p < e) {
11193         STRLEN len;
11194         if ((len = is_PATWS_safe(p, e, UTF))) {
11195             p += len;
11196         }
11197         else if (recognize_comment && *p == '#') {
11198             bool ended = 0;
11199             do {
11200                 p++;
11201                 if (is_LNBREAK_safe(p, e, UTF)) {
11202                     ended = 1;
11203                     break;
11204                 }
11205             } while (p < e);
11206             if (!ended)
11207                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11208         }
11209         else
11210             break;
11211     }
11212     return p;
11213 }
11214
11215 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11216    Character classes ([:foo:]) can also be negated ([:^foo:]).
11217    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11218    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11219    but trigger failures because they are currently unimplemented. */
11220
11221 #define POSIXCC_DONE(c)   ((c) == ':')
11222 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11223 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11224
11225 PERL_STATIC_INLINE I32
11226 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me,
11227                     const bool strict)
11228 {
11229     dVAR;
11230     I32 namedclass = OOB_NAMEDCLASS;
11231
11232     PERL_ARGS_ASSERT_REGPPOSIXCC;
11233
11234     if (value == '[' && RExC_parse + 1 < RExC_end &&
11235         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11236         POSIXCC(UCHARAT(RExC_parse)))
11237     {
11238         const char c = UCHARAT(RExC_parse);
11239         char* const s = RExC_parse++;
11240
11241         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11242             RExC_parse++;
11243         if (RExC_parse == RExC_end) {
11244             if (strict) {
11245
11246                 /* Try to give a better location for the error (than the end of
11247                  * the string) by looking for the matching ']' */
11248                 RExC_parse = s;
11249                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11250                     RExC_parse++;
11251                 }
11252                 vFAIL2("Unmatched '%c' in POSIX class", c);
11253             }
11254             /* Grandfather lone [:, [=, [. */
11255             RExC_parse = s;
11256         }
11257         else {
11258             const char* const t = RExC_parse++; /* skip over the c */
11259             assert(*t == c);
11260
11261             if (UCHARAT(RExC_parse) == ']') {
11262                 const char *posixcc = s + 1;
11263                 RExC_parse++; /* skip over the ending ] */
11264
11265                 if (*s == ':') {
11266                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11267                     const I32 skip = t - posixcc;
11268
11269                     /* Initially switch on the length of the name.  */
11270                     switch (skip) {
11271                     case 4:
11272                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11273                                                           this is the Perl \w
11274                                                         */
11275                             namedclass = ANYOF_WORDCHAR;
11276                         break;
11277                     case 5:
11278                         /* Names all of length 5.  */
11279                         /* alnum alpha ascii blank cntrl digit graph lower
11280                            print punct space upper  */
11281                         /* Offset 4 gives the best switch position.  */
11282                         switch (posixcc[4]) {
11283                         case 'a':
11284                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11285                                 namedclass = ANYOF_ALPHA;
11286                             break;
11287                         case 'e':
11288                             if (memEQ(posixcc, "spac", 4)) /* space */
11289                                 namedclass = ANYOF_PSXSPC;
11290                             break;
11291                         case 'h':
11292                             if (memEQ(posixcc, "grap", 4)) /* graph */
11293                                 namedclass = ANYOF_GRAPH;
11294                             break;
11295                         case 'i':
11296                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11297                                 namedclass = ANYOF_ASCII;
11298                             break;
11299                         case 'k':
11300                             if (memEQ(posixcc, "blan", 4)) /* blank */
11301                                 namedclass = ANYOF_BLANK;
11302                             break;
11303                         case 'l':
11304                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11305                                 namedclass = ANYOF_CNTRL;
11306                             break;
11307                         case 'm':
11308                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11309                                 namedclass = ANYOF_ALPHANUMERIC;
11310                             break;
11311                         case 'r':
11312                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11313                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11314                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11315                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11316                             break;
11317                         case 't':
11318                             if (memEQ(posixcc, "digi", 4)) /* digit */
11319                                 namedclass = ANYOF_DIGIT;
11320                             else if (memEQ(posixcc, "prin", 4)) /* print */
11321                                 namedclass = ANYOF_PRINT;
11322                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11323                                 namedclass = ANYOF_PUNCT;
11324                             break;
11325                         }
11326                         break;
11327                     case 6:
11328                         if (memEQ(posixcc, "xdigit", 6))
11329                             namedclass = ANYOF_XDIGIT;
11330                         break;
11331                     }
11332
11333                     if (namedclass == OOB_NAMEDCLASS)
11334                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11335                                       t - s - 1, s + 1);
11336
11337                     /* The #defines are structured so each complement is +1 to
11338                      * the normal one */
11339                     if (complement) {
11340                         namedclass++;
11341                     }
11342                     assert (posixcc[skip] == ':');
11343                     assert (posixcc[skip+1] == ']');
11344                 } else if (!SIZE_ONLY) {
11345                     /* [[=foo=]] and [[.foo.]] are still future. */
11346
11347                     /* adjust RExC_parse so the warning shows after
11348                        the class closes */
11349                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11350                         RExC_parse++;
11351                     SvREFCNT_dec(free_me);
11352                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11353                 }
11354             } else {
11355                 /* Maternal grandfather:
11356                  * "[:" ending in ":" but not in ":]" */
11357                 if (strict) {
11358                     vFAIL("Unmatched '[' in POSIX class");
11359                 }
11360
11361                 /* Grandfather lone [:, [=, [. */
11362                 RExC_parse = s;
11363             }
11364         }
11365     }
11366
11367     return namedclass;
11368 }
11369
11370 STATIC bool
11371 S_could_it_be_POSIX(pTHX_ RExC_state_t *pRExC_state)
11372 {
11373     /* This applies some heuristics at the current parse position (which should
11374      * be at a '[') to see if what follows might be intended to be a [:posix:]
11375      * class.  It returns true if it really is a posix class, of course, but it
11376      * also can return true if it thinks that what was intended was a posix
11377      * class that didn't quite make it.
11378      *
11379      * It will return true for
11380      *      [:alphanumerics:
11381      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11382      *                         ')' indicating the end of the (?[
11383      *      [:any garbage including %^&$ punctuation:]
11384      *
11385      * This is designed to be called only from S_handle_sets; it could be
11386      * easily adapted to be called from the spot at the beginning of regclass()
11387      * that checks to see in a normal bracketed class if the surrounding []
11388      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11389      * change long-standing behavior, so I (khw) didn't do that */
11390     char* p = RExC_parse + 1;
11391     char first_char = *p;
11392
11393     PERL_ARGS_ASSERT_COULD_IT_BE_POSIX;
11394
11395     assert(*(p - 1) == '[');
11396
11397     if (! POSIXCC(first_char)) {
11398         return FALSE;
11399     }
11400
11401     p++;
11402     while (p < RExC_end && isWORDCHAR(*p)) p++;
11403
11404     if (p >= RExC_end) {
11405         return FALSE;
11406     }
11407
11408     if (p - RExC_parse > 2    /* Got at least 1 word character */
11409         && (*p == first_char
11410             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11411     {
11412         return TRUE;
11413     }
11414
11415     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11416
11417     return (p
11418             && p - RExC_parse > 2 /* [:] evaluates to colon;
11419                                       [::] is a bad posix class. */
11420             && first_char == *(p - 1));
11421 }
11422
11423 STATIC regnode *
11424 S_handle_sets(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11425                    char * const oregcomp_parse)
11426 {
11427     /* Handle the (?[...]) construct to do set operations */
11428
11429     U8 curchar;
11430     UV start, end;      /* End points of code point ranges */
11431     SV* result_string;
11432     char *save_end, *save_parse;
11433     SV* final;
11434     STRLEN len;
11435     regnode* node;
11436     AV* stack;
11437     const bool save_fold = FOLD;
11438
11439     GET_RE_DEBUG_FLAGS_DECL;
11440
11441     PERL_ARGS_ASSERT_HANDLE_SETS;
11442
11443     if (LOC) {
11444         vFAIL("(?[...]) not valid in locale");
11445     }
11446     RExC_uni_semantics = 1;
11447
11448     /* This will return only an ANYOF regnode, or (unlikely) something smaller
11449      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11450      * call regclass to handle '[]' so as to not have to reinvent its parsing
11451      * rules here (throwing away the size it computes each time).  And, we exit
11452      * upon an unescaped ']' that isn't one ending a regclass.  To do both
11453      * these things, we need to realize that something preceded by a backslash
11454      * is escaped, so we have to keep track of backslashes */
11455     if (SIZE_ONLY) {
11456
11457         Perl_ck_warner_d(aTHX_
11458             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11459             "The regex_sets feature is experimental" REPORT_LOCATION,
11460             (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11461
11462         while (RExC_parse < RExC_end) {
11463             SV* current = NULL;
11464             RExC_parse = regpatws(pRExC_state, RExC_parse,
11465                                 TRUE); /* means recognize comments */
11466             switch (*RExC_parse) {
11467                 default:
11468                     break;
11469                 case '\\':
11470                     /* Skip the next byte.  This would have to change to skip
11471                      * the next character if we were to recognize and handle
11472                      * specific non-ASCIIs */
11473                     RExC_parse++;
11474                     break;
11475                 case '[':
11476                 {
11477                     /* If this looks like it is a [:posix:] class, leave the
11478                      * parse pointer at the '[' to fool regclass() into
11479                      * thinking it is part of a '[[:posix]]'.  That function
11480                      * will use strict checking to force a syntax error if it
11481                      * doesn't work out to a legitimate class */
11482                     bool is_posix_class = could_it_be_POSIX(pRExC_state);
11483                     if (! is_posix_class) {
11484                         RExC_parse++;
11485                     }
11486
11487                     (void) regclass(pRExC_state, flagp,depth+1,
11488                                     is_posix_class, /* parse the whole char
11489                                                        class only if not a
11490                                                        posix class */
11491                                     FALSE, /* don't allow multi-char folds */
11492                                     TRUE, /* silence non-portable warnings. */
11493                                     &current);
11494                     /* function call leaves parse pointing to the ']', except
11495                      * if we faked it */
11496                     if (is_posix_class) {
11497                         RExC_parse--;
11498                     }
11499
11500                     SvREFCNT_dec(current);   /* In case it returned something */
11501                     break;
11502                 }
11503
11504                 case ']':
11505                     RExC_parse++;
11506                     if (RExC_parse < RExC_end
11507                         && *RExC_parse == ')')
11508                     {
11509                         node = reganode(pRExC_state, ANYOF, 0);
11510                         RExC_size += ANYOF_SKIP;
11511                         nextchar(pRExC_state);
11512                         Set_Node_Length(node,
11513                                 RExC_parse - oregcomp_parse + 1); /* MJD */
11514                         return node;
11515                     }
11516                     goto no_close;
11517             }
11518             RExC_parse++;
11519         }
11520
11521         no_close:
11522         FAIL("Syntax error in (?[...])");
11523     }
11524
11525     /* Pass 2 only after this.  Everything in this construct is a
11526      * metacharacter.  Operands begin with either a '\' (for an escape
11527      * sequence), or a '[' for a bracketed character class.  Any other
11528      * character should be an operator, or parenthesis for grouping.  Both
11529      * types of operands are handled by calling regclass() to parse them.  It
11530      * is called with a parameter to indicate to return the computed inversion
11531      * list.  The parsing here is implemented via a stack.  Each entry on the
11532      * stack is a single character representing one of the operators, or the
11533      * '('; or else a pointer to an operand inversion list. */
11534
11535 #define IS_OPERAND(a)  (! SvIOK(a))
11536
11537     /* The stack starts empty.  It is a syntax error if the first thing parsed
11538      * is a binary operator; everything else is pushed on the stack.  When an
11539      * operand is parsed, the top of the stack is examined.  If it is a binary
11540      * operator, the item before it should be an operand, and both are replaced
11541      * by the result of doing that operation on the new operand and the one on
11542      * the stack.   Thus a sequence of binary operands is reduced to a single
11543      * one before the next one is parsed.
11544      *
11545      * A unary operator may immediately follow a binary in the input, for
11546      * example
11547      *      [a] + ! [b]
11548      * When an operand is parsed and the top of the stack is a unary operator,
11549      * the operation is performed, and then the stack is rechecked to see if
11550      * this new operand is part of a binary operation; if so, it is handled as
11551      * above.
11552      *
11553      * A '(' is simply pushed on the stack; it is valid only if the stack is
11554      * empty, or the top element of the stack is an operator (for which the
11555      * parenthesized expression will become an operand).  By the time the
11556      * corresponding ')' is parsed everything in between should have been
11557      * parsed and evaluated to a single operand (or else is a syntax error),
11558      * and is handled as a regular operand */
11559
11560     stack = newAV();
11561
11562     while (RExC_parse < RExC_end) {
11563         I32 top_index = av_top(stack);
11564         SV** top_ptr;
11565         SV* current = NULL;
11566
11567         /* Skip white space */
11568         RExC_parse = regpatws(pRExC_state, RExC_parse,
11569                                 TRUE); /* means recognize comments */
11570         if (RExC_parse >= RExC_end
11571             || (curchar = UCHARAT(RExC_parse)) == ']')
11572         {   /* Exit loop at the end */
11573             break;
11574         }
11575
11576         switch (curchar) {
11577
11578             default:
11579                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11580                 vFAIL("Unexpected character");
11581
11582             case '\\':
11583                 (void) regclass(pRExC_state, flagp,depth+1,
11584                                 TRUE, /* means parse just the next thing */
11585                                 FALSE, /* don't allow multi-char folds */
11586                                 FALSE, /* don't silence non-portable warnings.
11587                                         */
11588                                 &current);
11589                 /* regclass() will return with parsing just the \ sequence,
11590                  * leaving the parse pointer at the next thing to parse */
11591                 RExC_parse--;
11592                 goto handle_operand;
11593
11594             case '[':   /* Is a bracketed character class */
11595             {
11596                 bool is_posix_class = could_it_be_POSIX(pRExC_state);
11597
11598                 if (! is_posix_class) {
11599                     RExC_parse++;
11600                 }
11601
11602                 (void) regclass(pRExC_state, flagp,depth+1,
11603                                 is_posix_class, /* parse the whole char class
11604                                                    only if not a posix class */
11605                                 FALSE, /* don't allow multi-char folds */
11606                                 FALSE, /* don't silence non-portable warnings.
11607                                         */
11608                                 &current);
11609                 /* function call leaves parse pointing to the ']', except if we
11610                  * faked it */
11611                 if (is_posix_class) {
11612                     RExC_parse--;
11613                 }
11614
11615                 goto handle_operand;
11616             }
11617
11618             case '&':
11619             case '|':
11620             case '+':
11621             case '-':
11622             case '^':
11623                 if (top_index < 0
11624                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11625                     || ! IS_OPERAND(*top_ptr))
11626                 {
11627                     RExC_parse++;
11628                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11629                 }
11630                 av_push(stack, newSVuv(curchar));
11631                 break;
11632
11633             case '!':
11634                 av_push(stack, newSVuv(curchar));
11635                 break;
11636
11637             case '(':
11638                 if (top_index >= 0) {
11639                     top_ptr = av_fetch(stack, top_index, FALSE);
11640                     assert(top_ptr);
11641                     if (IS_OPERAND(*top_ptr)) {
11642                         RExC_parse++;
11643                         vFAIL("Unexpected '(' with no preceding operator");
11644                     }
11645                 }
11646                 av_push(stack, newSVuv(curchar));
11647                 break;
11648
11649             case ')':
11650             {
11651                 SV* lparen;
11652                 if (top_index < 1
11653                     || ! (current = av_pop(stack))
11654                     || ! IS_OPERAND(current)
11655                     || ! (lparen = av_pop(stack))
11656                     || IS_OPERAND(lparen)
11657                     || SvUV(lparen) != '(')
11658                 {
11659                     RExC_parse++;
11660                     vFAIL("Unexpected ')'");
11661                 }
11662                 top_index -= 2;
11663                 SvREFCNT_dec_NN(lparen);
11664
11665                 /* FALL THROUGH */
11666             }
11667
11668               handle_operand:
11669
11670                 /* Here, we have an operand to process, in 'current' */
11671
11672                 if (top_index < 0) {    /* Just push if stack is empty */
11673                     av_push(stack, current);
11674                 }
11675                 else {
11676                     SV* top = av_pop(stack);
11677                     char current_operator;
11678
11679                     if (IS_OPERAND(top)) {
11680                         vFAIL("Operand with no preceding operator");
11681                     }
11682                     current_operator = (char) SvUV(top);
11683                     switch (current_operator) {
11684                         case '(':   /* Push the '(' back on followed by the new
11685                                        operand */
11686                             av_push(stack, top);
11687                             av_push(stack, current);
11688                             SvREFCNT_inc(top);  /* Counters the '_dec' done
11689                                                    just after the 'break', so
11690                                                    it doesn't get wrongly freed
11691                                                  */
11692                             break;
11693
11694                         case '!':
11695                             _invlist_invert(current);
11696
11697                             /* Unlike binary operators, the top of the stack,
11698                              * now that this unary one has been popped off, may
11699                              * legally be an operator, and we now have operand
11700                              * for it. */
11701                             top_index--;
11702                             SvREFCNT_dec_NN(top);
11703                             goto handle_operand;
11704
11705                         case '&':
11706                             _invlist_intersection(av_pop(stack),
11707                                                    current,
11708                                                    &current);
11709                             av_push(stack, current);
11710                             break;
11711
11712                         case '|':
11713                         case '+':
11714                             _invlist_union(av_pop(stack), current, &current);
11715                             av_push(stack, current);
11716                             break;
11717
11718                         case '-':
11719                             _invlist_subtract(av_pop(stack), current, &current);
11720                             av_push(stack, current);
11721                             break;
11722
11723                         case '^':   /* The union minus the intersection */
11724                         {
11725                             SV* i = NULL;
11726                             SV* u = NULL;
11727                             SV* element;
11728
11729                             element = av_pop(stack);
11730                             _invlist_union(element, current, &u);
11731                             _invlist_intersection(element, current, &i);
11732                             _invlist_subtract(u, i, &current);
11733                             av_push(stack, current);
11734                             SvREFCNT_dec_NN(i);
11735                             SvREFCNT_dec_NN(u);
11736                             SvREFCNT_dec_NN(element);
11737                             break;
11738                         }
11739
11740                         default:
11741                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11742                 }
11743                 SvREFCNT_dec_NN(top);
11744             }
11745         }
11746
11747         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11748     }
11749
11750     if (av_top(stack) < 0   /* Was empty */
11751         || ((final = av_pop(stack)) == NULL)
11752         || ! IS_OPERAND(final)
11753         || av_top(stack) >= 0)  /* More left on stack */
11754     {
11755         vFAIL("Incomplete expression within '(?[ ])'");
11756     }
11757
11758     invlist_iterinit(final);
11759
11760     /* Here, 'final' is the resultant inversion list of evaluating the
11761      * expression.  Feed it to regclass() to generate the real resultant node.
11762      * regclass() is expecting a string of ranges and individual code points */
11763     result_string = newSVpvs("");
11764     while (invlist_iternext(final, &start, &end)) {
11765         if (start == end) {
11766             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
11767         }
11768         else {
11769             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
11770                                                      start,          end);
11771         }
11772     }
11773
11774     save_parse = RExC_parse;
11775     RExC_parse = SvPV(result_string, len);
11776     save_end = RExC_end;
11777     RExC_end = RExC_parse + len;
11778
11779     /* We turn off folding around the call, as the class we have constructed
11780      * already has all folding taken into consideration, and we don't want
11781      * regclass() to add to that */
11782     RExC_flags &= ~RXf_PMf_FOLD;
11783     node = regclass(pRExC_state, flagp,depth+1,
11784                     FALSE, /* means parse the whole char class */
11785                     FALSE, /* don't allow multi-char folds */
11786                     TRUE, /* silence non-portable warnings.  The above may very
11787                              well have generated non-portable code points, but
11788                              they're valid on this machine */
11789                     NULL);
11790     if (save_fold) {
11791         RExC_flags |= RXf_PMf_FOLD;
11792     }
11793     RExC_parse = save_parse + 1;
11794     RExC_end = save_end;
11795     SvREFCNT_dec_NN(final);
11796     SvREFCNT_dec_NN(result_string);
11797     SvREFCNT_dec_NN(stack);
11798
11799     nextchar(pRExC_state);
11800     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
11801     return node;
11802 }
11803 #undef IS_OPERAND
11804
11805 /* The names of properties whose definitions are not known at compile time are
11806  * stored in this SV, after a constant heading.  So if the length has been
11807  * changed since initialization, then there is a run-time definition. */
11808 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11809
11810 STATIC regnode *
11811 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11812                  const bool stop_at_1,  /* Just parse the next thing, don't
11813                                            look for a full character class */
11814                  bool allow_multi_folds,
11815                  const bool silence_non_portable,   /* Don't output warnings
11816                                                        about too large
11817                                                        characters */
11818                  SV** ret_invlist)  /* Return an inversion list, not a node */
11819 {
11820     /* parse a bracketed class specification.  Most of these will produce an
11821      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
11822      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
11823      * under /i with multi-character folds: it will be rewritten following the
11824      * paradigm of this example, where the <multi-fold>s are characters which
11825      * fold to multiple character sequences:
11826      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11827      * gets effectively rewritten as:
11828      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11829      * reg() gets called (recursively) on the rewritten version, and this
11830      * function will return what it constructs.  (Actually the <multi-fold>s
11831      * aren't physically removed from the [abcdefghi], it's just that they are
11832      * ignored in the recursion by means of a flag:
11833      * <RExC_in_multi_char_class>.)
11834      *
11835      * ANYOF nodes contain a bit map for the first 256 characters, with the
11836      * corresponding bit set if that character is in the list.  For characters
11837      * above 255, a range list or swash is used.  There are extra bits for \w,
11838      * etc. in locale ANYOFs, as what these match is not determinable at
11839      * compile time */
11840
11841     dVAR;
11842     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11843     IV range = 0;
11844     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11845     regnode *ret;
11846     STRLEN numlen;
11847     IV namedclass = OOB_NAMEDCLASS;
11848     char *rangebegin = NULL;
11849     bool need_class = 0;
11850     SV *listsv = NULL;
11851     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11852                                       than just initialized.  */
11853     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11854     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11855                                extended beyond the Latin1 range */
11856     UV element_count = 0;   /* Number of distinct elements in the class.
11857                                Optimizations may be possible if this is tiny */
11858     AV * multi_char_matches = NULL; /* Code points that fold to more than one
11859                                        character; used under /i */
11860     UV n;
11861     char * stop_ptr = RExC_end;    /* where to stop parsing */
11862     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
11863                                                    space? */
11864     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
11865
11866     /* Unicode properties are stored in a swash; this holds the current one
11867      * being parsed.  If this swash is the only above-latin1 component of the
11868      * character class, an optimization is to pass it directly on to the
11869      * execution engine.  Otherwise, it is set to NULL to indicate that there
11870      * are other things in the class that have to be dealt with at execution
11871      * time */
11872     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11873
11874     /* Set if a component of this character class is user-defined; just passed
11875      * on to the engine */
11876     bool has_user_defined_property = FALSE;
11877
11878     /* inversion list of code points this node matches only when the target
11879      * string is in UTF-8.  (Because is under /d) */
11880     SV* depends_list = NULL;
11881
11882     /* inversion list of code points this node matches.  For much of the
11883      * function, it includes only those that match regardless of the utf8ness
11884      * of the target string */
11885     SV* cp_list = NULL;
11886
11887 #ifdef EBCDIC
11888     /* In a range, counts how many 0-2 of the ends of it came from literals,
11889      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11890     UV literal_endpoint = 0;
11891 #endif
11892     bool invert = FALSE;    /* Is this class to be complemented */
11893
11894     /* Is there any thing like \W or [:^digit:] that matches above the legal
11895      * Unicode range? */
11896     bool runtime_posix_matches_above_Unicode = FALSE;
11897
11898     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11899         case we need to change the emitted regop to an EXACT. */
11900     const char * orig_parse = RExC_parse;
11901     const I32 orig_size = RExC_size;
11902     GET_RE_DEBUG_FLAGS_DECL;
11903
11904     PERL_ARGS_ASSERT_REGCLASS;
11905 #ifndef DEBUGGING
11906     PERL_UNUSED_ARG(depth);
11907 #endif
11908
11909     DEBUG_PARSE("clas");
11910
11911     /* Assume we are going to generate an ANYOF node. */
11912     ret = reganode(pRExC_state, ANYOF, 0);
11913
11914     if (SIZE_ONLY) {
11915         RExC_size += ANYOF_SKIP;
11916         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11917     }
11918     else {
11919         ANYOF_FLAGS(ret) = 0;
11920
11921         RExC_emit += ANYOF_SKIP;
11922         if (LOC) {
11923             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11924         }
11925         listsv = newSVpvs("# comment\n");
11926         initial_listsv_len = SvCUR(listsv);
11927     }
11928
11929     if (skip_white) {
11930         RExC_parse = regpatws(pRExC_state, RExC_parse,
11931                               FALSE /* means don't recognize comments */);
11932     }
11933
11934     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11935         RExC_parse++;
11936         invert = TRUE;
11937         allow_multi_folds = FALSE;
11938         RExC_naughty++;
11939         if (skip_white) {
11940             RExC_parse = regpatws(pRExC_state, RExC_parse,
11941                                   FALSE /* means don't recognize comments */);
11942         }
11943     }
11944
11945     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
11946     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
11947         const char *s = RExC_parse;
11948         const char  c = *s++;
11949
11950         while (isWORDCHAR(*s))
11951             s++;
11952         if (*s && c == *s && s[1] == ']') {
11953             SAVEFREESV(RExC_rx_sv);
11954             SAVEFREESV(listsv);
11955             ckWARN3reg(s+2,
11956                        "POSIX syntax [%c %c] belongs inside character classes",
11957                        c, c);
11958             (void)ReREFCNT_inc(RExC_rx_sv);
11959             SvREFCNT_inc_simple_void_NN(listsv);
11960         }
11961     }
11962
11963     /* If the caller wants us to just parse a single element, accomplish this
11964      * by faking the loop ending condition */
11965     if (stop_at_1 && RExC_end > RExC_parse) {
11966         stop_ptr = RExC_parse + 1;
11967     }
11968
11969     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
11970     if (UCHARAT(RExC_parse) == ']')
11971         goto charclassloop;
11972
11973 parseit:
11974     while (1) {
11975         if  (RExC_parse >= stop_ptr) {
11976             break;
11977         }
11978
11979         if (skip_white) {
11980             RExC_parse = regpatws(pRExC_state, RExC_parse,
11981                                   FALSE /* means don't recognize comments */);
11982         }
11983
11984         if  (UCHARAT(RExC_parse) == ']') {
11985             break;
11986         }
11987
11988     charclassloop:
11989
11990         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11991         save_value = value;
11992         save_prevvalue = prevvalue;
11993
11994         if (!range) {
11995             rangebegin = RExC_parse;
11996             element_count++;
11997         }
11998         if (UTF) {
11999             value = utf8n_to_uvchr((U8*)RExC_parse,
12000                                    RExC_end - RExC_parse,
12001                                    &numlen, UTF8_ALLOW_DEFAULT);
12002             RExC_parse += numlen;
12003         }
12004         else
12005             value = UCHARAT(RExC_parse++);
12006
12007         if (value == '['
12008             && RExC_parse < RExC_end
12009             && POSIXCC(UCHARAT(RExC_parse)))
12010         {
12011             namedclass = regpposixcc(pRExC_state, value, listsv, strict);
12012         }
12013         else if (value == '\\') {
12014             if (UTF) {
12015                 value = utf8n_to_uvchr((U8*)RExC_parse,
12016                                    RExC_end - RExC_parse,
12017                                    &numlen, UTF8_ALLOW_DEFAULT);
12018                 RExC_parse += numlen;
12019             }
12020             else
12021                 value = UCHARAT(RExC_parse++);
12022
12023             /* Some compilers cannot handle switching on 64-bit integer
12024              * values, therefore value cannot be an UV.  Yes, this will
12025              * be a problem later if we want switch on Unicode.
12026              * A similar issue a little bit later when switching on
12027              * namedclass. --jhi */
12028
12029             /* If the \ is escaping white space when white space is being
12030              * skipped, it means that that white space is wanted literally, and
12031              * is already in 'value'.  Otherwise, need to translate the escape
12032              * into what it signifies. */
12033             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12034
12035             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
12036             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
12037             case 's':   namedclass = ANYOF_SPACE;       break;
12038             case 'S':   namedclass = ANYOF_NSPACE;      break;
12039             case 'd':   namedclass = ANYOF_DIGIT;       break;
12040             case 'D':   namedclass = ANYOF_NDIGIT;      break;
12041             case 'v':   namedclass = ANYOF_VERTWS;      break;
12042             case 'V':   namedclass = ANYOF_NVERTWS;     break;
12043             case 'h':   namedclass = ANYOF_HORIZWS;     break;
12044             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
12045             case 'N':  /* Handle \N{NAME} in class */
12046                 {
12047                     /* We only pay attention to the first char of
12048                     multichar strings being returned. I kinda wonder
12049                     if this makes sense as it does change the behaviour
12050                     from earlier versions, OTOH that behaviour was broken
12051                     as well. */
12052                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12053                                       TRUE, /* => charclass */
12054                                       strict))
12055                     {
12056                         goto parseit;
12057                     }
12058                 }
12059                 break;
12060             case 'p':
12061             case 'P':
12062                 {
12063                 char *e;
12064
12065                 /* We will handle any undefined properties ourselves */
12066                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12067
12068                 if (RExC_parse >= RExC_end)
12069                     vFAIL2("Empty \\%c{}", (U8)value);
12070                 if (*RExC_parse == '{') {
12071                     const U8 c = (U8)value;
12072                     e = strchr(RExC_parse++, '}');
12073                     if (!e)
12074                         vFAIL2("Missing right brace on \\%c{}", c);
12075                     while (isSPACE(UCHARAT(RExC_parse)))
12076                         RExC_parse++;
12077                     if (e == RExC_parse)
12078                         vFAIL2("Empty \\%c{}", c);
12079                     n = e - RExC_parse;
12080                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12081                         n--;
12082                 }
12083                 else {
12084                     e = RExC_parse;
12085                     n = 1;
12086                 }
12087                 if (!SIZE_ONLY) {
12088                     SV* invlist;
12089                     char* name;
12090
12091                     if (UCHARAT(RExC_parse) == '^') {
12092                          RExC_parse++;
12093                          n--;
12094                          /* toggle.  (The rhs xor gets the single bit that
12095                           * differs between P and p; the other xor inverts just
12096                           * that bit) */
12097                          value ^= 'P' ^ 'p';
12098
12099                          while (isSPACE(UCHARAT(RExC_parse))) {
12100                               RExC_parse++;
12101                               n--;
12102                          }
12103                     }
12104                     /* Try to get the definition of the property into
12105                      * <invlist>.  If /i is in effect, the effective property
12106                      * will have its name be <__NAME_i>.  The design is
12107                      * discussed in commit
12108                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12109                     Newx(name, n + sizeof("_i__\n"), char);
12110
12111                     sprintf(name, "%s%.*s%s\n",
12112                                     (FOLD) ? "__" : "",
12113                                     (int)n,
12114                                     RExC_parse,
12115                                     (FOLD) ? "_i" : ""
12116                     );
12117
12118                     /* Look up the property name, and get its swash and
12119                      * inversion list, if the property is found  */
12120                     if (swash) {
12121                         SvREFCNT_dec_NN(swash);
12122                     }
12123                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
12124                                              1, /* binary */
12125                                              0, /* not tr/// */
12126                                              NULL, /* No inversion list */
12127                                              &swash_init_flags
12128                                             );
12129                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12130                         if (swash) {
12131                             SvREFCNT_dec_NN(swash);
12132                             swash = NULL;
12133                         }
12134
12135                         /* Here didn't find it.  It could be a user-defined
12136                          * property that will be available at run-time.  If we
12137                          * accept only compile-time properties, is an error;
12138                          * otherwise add it to the list for run-time look up */
12139                         if (ret_invlist) {
12140                             RExC_parse = e + 1;
12141                             vFAIL3("Property '%.*s' is unknown", (int) n, name);
12142                         }
12143                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12144                                         (value == 'p' ? '+' : '!'),
12145                                         name);
12146                         has_user_defined_property = TRUE;
12147
12148                         /* We don't know yet, so have to assume that the
12149                          * property could match something in the Latin1 range,
12150                          * hence something that isn't utf8.  Note that this
12151                          * would cause things in <depends_list> to match
12152                          * inappropriately, except that any \p{}, including
12153                          * this one forces Unicode semantics, which means there
12154                          * is <no depends_list> */
12155                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12156                     }
12157                     else {
12158
12159                         /* Here, did get the swash and its inversion list.  If
12160                          * the swash is from a user-defined property, then this
12161                          * whole character class should be regarded as such */
12162                         has_user_defined_property =
12163                                     (swash_init_flags
12164                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12165
12166                         /* Invert if asking for the complement */
12167                         if (value == 'P') {
12168                             _invlist_union_complement_2nd(properties,
12169                                                           invlist,
12170                                                           &properties);
12171
12172                             /* The swash can't be used as-is, because we've
12173                              * inverted things; delay removing it to here after
12174                              * have copied its invlist above */
12175                             SvREFCNT_dec_NN(swash);
12176                             swash = NULL;
12177                         }
12178                         else {
12179                             _invlist_union(properties, invlist, &properties);
12180                         }
12181                     }
12182                     Safefree(name);
12183                 }
12184                 RExC_parse = e + 1;
12185                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12186                                                 named */
12187
12188                 /* \p means they want Unicode semantics */
12189                 RExC_uni_semantics = 1;
12190                 }
12191                 break;
12192             case 'n':   value = '\n';                   break;
12193             case 'r':   value = '\r';                   break;
12194             case 't':   value = '\t';                   break;
12195             case 'f':   value = '\f';                   break;
12196             case 'b':   value = '\b';                   break;
12197             case 'e':   value = ASCII_TO_NATIVE('\033');break;
12198             case 'a':   value = ASCII_TO_NATIVE('\007');break;
12199             case 'o':
12200                 RExC_parse--;   /* function expects to be pointed at the 'o' */
12201                 {
12202                     const char* error_msg;
12203                     bool valid = grok_bslash_o(&RExC_parse,
12204                                                &value,
12205                                                &error_msg,
12206                                                SIZE_ONLY,   /* warnings in pass
12207                                                                1 only */
12208                                                strict,
12209                                                silence_non_portable,
12210                                                UTF);
12211                     if (! valid) {
12212                         vFAIL(error_msg);
12213                     }
12214                 }
12215                 if (PL_encoding && value < 0x100) {
12216                     goto recode_encoding;
12217                 }
12218                 break;
12219             case 'x':
12220                 RExC_parse--;   /* function expects to be pointed at the 'x' */
12221                 {
12222                     const char* error_msg;
12223                     bool valid = grok_bslash_x(&RExC_parse,
12224                                                &value,
12225                                                &error_msg,
12226                                                TRUE, /* Output warnings */
12227                                                strict,
12228                                                silence_non_portable,
12229                                                UTF);
12230                     if (! valid) {
12231                         vFAIL(error_msg);
12232                     }
12233                 }
12234                 if (PL_encoding && value < 0x100)
12235                     goto recode_encoding;
12236                 break;
12237             case 'c':
12238                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12239                 break;
12240             case '0': case '1': case '2': case '3': case '4':
12241             case '5': case '6': case '7':
12242                 {
12243                     /* Take 1-3 octal digits */
12244                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12245                     numlen = (strict) ? 4 : 3;
12246                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12247                     RExC_parse += numlen;
12248                     if (numlen != 3) {
12249                         SAVEFREESV(listsv); /* In case warnings are fatalized */
12250                         if (strict) {
12251                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12252                             vFAIL("Need exactly 3 octal digits");
12253                         }
12254                         else if (! SIZE_ONLY /* like \08, \178 */
12255                                  && numlen < 3
12256                                  && RExC_parse < RExC_end
12257                                  && isDIGIT(*RExC_parse)
12258                                  && ckWARN(WARN_REGEXP))
12259                         {
12260                             SAVEFREESV(RExC_rx_sv);
12261                             reg_warn_non_literal_string(
12262                                  RExC_parse + 1,
12263                                  form_short_octal_warning(RExC_parse, numlen));
12264                             (void)ReREFCNT_inc(RExC_rx_sv);
12265                         }
12266                         SvREFCNT_inc_simple_void_NN(listsv);
12267                     }
12268                     if (PL_encoding && value < 0x100)
12269                         goto recode_encoding;
12270                     break;
12271                 }
12272             recode_encoding:
12273                 if (! RExC_override_recoding) {
12274                     SV* enc = PL_encoding;
12275                     value = reg_recode((const char)(U8)value, &enc);
12276                     if (!enc) {
12277                         if (strict) {
12278                             vFAIL("Invalid escape in the specified encoding");
12279                         }
12280                         else if (SIZE_ONLY) {
12281                             ckWARNreg(RExC_parse,
12282                                   "Invalid escape in the specified encoding");
12283                         }
12284                     }
12285                     break;
12286                 }
12287             default:
12288                 /* Allow \_ to not give an error */
12289                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12290                     SAVEFREESV(listsv);
12291                     if (strict) {
12292                         vFAIL2("Unrecognized escape \\%c in character class",
12293                                (int)value);
12294                     }
12295                     else {
12296                         SAVEFREESV(RExC_rx_sv);
12297                         ckWARN2reg(RExC_parse,
12298                             "Unrecognized escape \\%c in character class passed through",
12299                             (int)value);
12300                         (void)ReREFCNT_inc(RExC_rx_sv);
12301                     }
12302                     SvREFCNT_inc_simple_void_NN(listsv);
12303                 }
12304                 break;
12305             }   /* End of switch on char following backslash */
12306         } /* end of handling backslash escape sequences */
12307 #ifdef EBCDIC
12308         else
12309             literal_endpoint++;
12310 #endif
12311
12312         /* Here, we have the current token in 'value' */
12313
12314         /* What matches in a locale is not known until runtime.  This includes
12315          * what the Posix classes (like \w, [:space:]) match.  Room must be
12316          * reserved (one time per class) to store such classes, either if Perl
12317          * is compiled so that locale nodes always should have this space, or
12318          * if there is such class info to be stored.  The space will contain a
12319          * bit for each named class that is to be matched against.  This isn't
12320          * needed for \p{} and pseudo-classes, as they are not affected by
12321          * locale, and hence are dealt with separately */
12322         if (LOC
12323             && ! need_class
12324             && (ANYOF_LOCALE == ANYOF_CLASS
12325                 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12326         {
12327             need_class = 1;
12328             if (SIZE_ONLY) {
12329                 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12330             }
12331             else {
12332                 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12333                 ANYOF_CLASS_ZERO(ret);
12334             }
12335             ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12336         }
12337
12338         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12339
12340             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12341              * literal, as is the character that began the false range, i.e.
12342              * the 'a' in the examples */
12343             if (range) {
12344                 if (!SIZE_ONLY) {
12345                     const int w = (RExC_parse >= rangebegin)
12346                                   ? RExC_parse - rangebegin
12347                                   : 0;
12348                     SAVEFREESV(listsv); /* in case of fatal warnings */
12349                     if (strict) {
12350                         vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12351                     }
12352                     else {
12353                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12354                         ckWARN4reg(RExC_parse,
12355                                 "False [] range \"%*.*s\"",
12356                                 w, w, rangebegin);
12357                         (void)ReREFCNT_inc(RExC_rx_sv);
12358                         cp_list = add_cp_to_invlist(cp_list, '-');
12359                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
12360                     }
12361                     SvREFCNT_inc_simple_void_NN(listsv);
12362                 }
12363
12364                 range = 0; /* this was not a true range */
12365                 element_count += 2; /* So counts for three values */
12366             }
12367
12368             if (! SIZE_ONLY) {
12369                 U8 classnum = namedclass_to_classnum(namedclass);
12370                 if (namedclass >= ANYOF_MAX) {  /* If a special class */
12371                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12372
12373                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12374                          * /l make a difference in what these match.  There
12375                          * would be problems if these characters had folds
12376                          * other than themselves, as cp_list is subject to
12377                          * folding. */
12378                         if (classnum != _CC_VERTSPACE) {
12379                             assert(   namedclass == ANYOF_HORIZWS
12380                                    || namedclass == ANYOF_NHORIZWS);
12381
12382                             /* It turns out that \h is just a synonym for
12383                              * XPosixBlank */
12384                             classnum = _CC_BLANK;
12385                         }
12386
12387                         _invlist_union_maybe_complement_2nd(
12388                                 cp_list,
12389                                 PL_XPosix_ptrs[classnum],
12390                                 cBOOL(namedclass % 2), /* Complement if odd
12391                                                           (NHORIZWS, NVERTWS)
12392                                                         */
12393                                 &cp_list);
12394                     }
12395                 }
12396                 else if (classnum == _CC_ASCII) {
12397 #ifdef HAS_ISASCII
12398                     if (LOC) {
12399                         ANYOF_CLASS_SET(ret, namedclass);
12400                     }
12401                     else
12402 #endif  /* Not isascii(); just use the hard-coded definition for it */
12403                         _invlist_union_maybe_complement_2nd(
12404                                 posixes,
12405                                 PL_ASCII,
12406                                 cBOOL(namedclass % 2), /* Complement if odd
12407                                                           (NASCII) */
12408                                 &posixes);
12409                 }
12410                 else {  /* Garden variety class */
12411
12412                     /* The ascii range inversion list */
12413                     SV* ascii_source = PL_Posix_ptrs[classnum];
12414
12415                     /* The full Latin1 range inversion list */
12416                     SV* l1_source = PL_L1Posix_ptrs[classnum];
12417
12418                     /* This code is structured into two major clauses.  The
12419                      * first is for classes whose complete definitions may not
12420                      * already be known.  It not, the Latin1 definition
12421                      * (guaranteed to already known) is used plus code is
12422                      * generated to load the rest at run-time (only if needed).
12423                      * If the complete definition is known, it drops down to
12424                      * the second clause, where the complete definition is
12425                      * known */
12426
12427                     if (classnum < _FIRST_NON_SWASH_CC) {
12428
12429                         /* Here, the class has a swash, which may or not
12430                          * already be loaded */
12431
12432                         /* The name of the property to use to match the full
12433                          * eXtended Unicode range swash for this character
12434                          * class */
12435                         const char *Xname = swash_property_names[classnum];
12436
12437                         /* If returning the inversion list, we can't defer
12438                          * getting this until runtime */
12439                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12440                             PL_utf8_swash_ptrs[classnum] =
12441                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
12442                                              1, /* binary */
12443                                              0, /* not tr/// */
12444                                              NULL, /* No inversion list */
12445                                              NULL  /* No flags */
12446                                             );
12447                             assert(PL_utf8_swash_ptrs[classnum]);
12448                         }
12449                         if ( !  PL_utf8_swash_ptrs[classnum]) {
12450                             if (namedclass % 2 == 0) { /* A non-complemented
12451                                                           class */
12452                                 /* If not /a matching, there are code points we
12453                                  * don't know at compile time.  Arrange for the
12454                                  * unknown matches to be loaded at run-time, if
12455                                  * needed */
12456                                 if (! AT_LEAST_ASCII_RESTRICTED) {
12457                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12458                                                                  Xname);
12459                                 }
12460                                 if (LOC) {  /* Under locale, set run-time
12461                                                lookup */
12462                                     ANYOF_CLASS_SET(ret, namedclass);
12463                                 }
12464                                 else {
12465                                     /* Add the current class's code points to
12466                                      * the running total */
12467                                     _invlist_union(posixes,
12468                                                    (AT_LEAST_ASCII_RESTRICTED)
12469                                                         ? ascii_source
12470                                                         : l1_source,
12471                                                    &posixes);
12472                                 }
12473                             }
12474                             else {  /* A complemented class */
12475                                 if (AT_LEAST_ASCII_RESTRICTED) {
12476                                     /* Under /a should match everything above
12477                                      * ASCII, plus the complement of the set's
12478                                      * ASCII matches */
12479                                     _invlist_union_complement_2nd(posixes,
12480                                                                   ascii_source,
12481                                                                   &posixes);
12482                                 }
12483                                 else {
12484                                     /* Arrange for the unknown matches to be
12485                                      * loaded at run-time, if needed */
12486                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12487                                                                  Xname);
12488                                     runtime_posix_matches_above_Unicode = TRUE;
12489                                     if (LOC) {
12490                                         ANYOF_CLASS_SET(ret, namedclass);
12491                                     }
12492                                     else {
12493
12494                                         /* We want to match everything in
12495                                          * Latin1, except those things that
12496                                          * l1_source matches */
12497                                         SV* scratch_list = NULL;
12498                                         _invlist_subtract(PL_Latin1, l1_source,
12499                                                           &scratch_list);
12500
12501                                         /* Add the list from this class to the
12502                                          * running total */
12503                                         if (! posixes) {
12504                                             posixes = scratch_list;
12505                                         }
12506                                         else {
12507                                             _invlist_union(posixes,
12508                                                            scratch_list,
12509                                                            &posixes);
12510                                             SvREFCNT_dec_NN(scratch_list);
12511                                         }
12512                                         if (DEPENDS_SEMANTICS) {
12513                                             ANYOF_FLAGS(ret)
12514                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
12515                                         }
12516                                     }
12517                                 }
12518                             }
12519                             goto namedclass_done;
12520                         }
12521
12522                         /* Here, there is a swash loaded for the class.  If no
12523                          * inversion list for it yet, get it */
12524                         if (! PL_XPosix_ptrs[classnum]) {
12525                             PL_XPosix_ptrs[classnum]
12526                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12527                         }
12528                     }
12529
12530                     /* Here there is an inversion list already loaded for the
12531                      * entire class */
12532
12533                     if (namedclass % 2 == 0) {  /* A non-complemented class,
12534                                                    like ANYOF_PUNCT */
12535                         if (! LOC) {
12536                             /* For non-locale, just add it to any existing list
12537                              * */
12538                             _invlist_union(posixes,
12539                                            (AT_LEAST_ASCII_RESTRICTED)
12540                                                ? ascii_source
12541                                                : PL_XPosix_ptrs[classnum],
12542                                            &posixes);
12543                         }
12544                         else {  /* Locale */
12545                             SV* scratch_list = NULL;
12546
12547                             /* For above Latin1 code points, we use the full
12548                              * Unicode range */
12549                             _invlist_intersection(PL_AboveLatin1,
12550                                                   PL_XPosix_ptrs[classnum],
12551                                                   &scratch_list);
12552                             /* And set the output to it, adding instead if
12553                              * there already is an output.  Checking if
12554                              * 'posixes' is NULL first saves an extra clone.
12555                              * Its reference count will be decremented at the
12556                              * next union, etc, or if this is the only
12557                              * instance, at the end of the routine */
12558                             if (! posixes) {
12559                                 posixes = scratch_list;
12560                             }
12561                             else {
12562                                 _invlist_union(posixes, scratch_list, &posixes);
12563                                 SvREFCNT_dec_NN(scratch_list);
12564                             }
12565
12566 #ifndef HAS_ISBLANK
12567                             if (namedclass != ANYOF_BLANK) {
12568 #endif
12569                                 /* Set this class in the node for runtime
12570                                  * matching */
12571                                 ANYOF_CLASS_SET(ret, namedclass);
12572 #ifndef HAS_ISBLANK
12573                             }
12574                             else {
12575                                 /* No isblank(), use the hard-coded ASCII-range
12576                                  * blanks, adding them to the running total. */
12577
12578                                 _invlist_union(posixes, ascii_source, &posixes);
12579                             }
12580 #endif
12581                         }
12582                     }
12583                     else {  /* A complemented class, like ANYOF_NPUNCT */
12584                         if (! LOC) {
12585                             _invlist_union_complement_2nd(
12586                                                 posixes,
12587                                                 (AT_LEAST_ASCII_RESTRICTED)
12588                                                     ? ascii_source
12589                                                     : PL_XPosix_ptrs[classnum],
12590                                                 &posixes);
12591                             /* Under /d, everything in the upper half of the
12592                              * Latin1 range matches this complement */
12593                             if (DEPENDS_SEMANTICS) {
12594                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12595                             }
12596                         }
12597                         else {  /* Locale */
12598                             SV* scratch_list = NULL;
12599                             _invlist_subtract(PL_AboveLatin1,
12600                                               PL_XPosix_ptrs[classnum],
12601                                               &scratch_list);
12602                             if (! posixes) {
12603                                 posixes = scratch_list;
12604                             }
12605                             else {
12606                                 _invlist_union(posixes, scratch_list, &posixes);
12607                                 SvREFCNT_dec_NN(scratch_list);
12608                             }
12609 #ifndef HAS_ISBLANK
12610                             if (namedclass != ANYOF_NBLANK) {
12611 #endif
12612                                 ANYOF_CLASS_SET(ret, namedclass);
12613 #ifndef HAS_ISBLANK
12614                             }
12615                             else {
12616                                 /* Get the list of all code points in Latin1
12617                                  * that are not ASCII blanks, and add them to
12618                                  * the running total */
12619                                 _invlist_subtract(PL_Latin1, ascii_source,
12620                                                   &scratch_list);
12621                                 _invlist_union(posixes, scratch_list, &posixes);
12622                                 SvREFCNT_dec_NN(scratch_list);
12623                             }
12624 #endif
12625                         }
12626                     }
12627                 }
12628               namedclass_done:
12629                 continue;   /* Go get next character */
12630             }
12631         } /* end of namedclass \blah */
12632
12633         /* Here, we have a single value.  If 'range' is set, it is the ending
12634          * of a range--check its validity.  Later, we will handle each
12635          * individual code point in the range.  If 'range' isn't set, this
12636          * could be the beginning of a range, so check for that by looking
12637          * ahead to see if the next real character to be processed is the range
12638          * indicator--the minus sign */
12639
12640         if (skip_white) {
12641             RExC_parse = regpatws(pRExC_state, RExC_parse,
12642                                 FALSE /* means don't recognize comments */);
12643         }
12644
12645         if (range) {
12646             if (prevvalue > value) /* b-a */ {
12647                 const int w = RExC_parse - rangebegin;
12648                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12649                 range = 0; /* not a valid range */
12650             }
12651         }
12652         else {
12653             prevvalue = value; /* save the beginning of the potential range */
12654             if (! stop_at_1     /* Can't be a range if parsing just one thing */
12655                 && *RExC_parse == '-')
12656             {
12657                 char* next_char_ptr = RExC_parse + 1;
12658                 if (skip_white) {   /* Get the next real char after the '-' */
12659                     next_char_ptr = regpatws(pRExC_state,
12660                                              RExC_parse + 1,
12661                                              FALSE); /* means don't recognize
12662                                                         comments */
12663                 }
12664
12665                 /* If the '-' is at the end of the class (just before the ']',
12666                  * it is a literal minus; otherwise it is a range */
12667                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12668                     RExC_parse = next_char_ptr;
12669
12670                     /* a bad range like \w-, [:word:]- ? */
12671                     if (namedclass > OOB_NAMEDCLASS) {
12672                         if (strict || ckWARN(WARN_REGEXP)) {
12673                             const int w =
12674                                 RExC_parse >= rangebegin ?
12675                                 RExC_parse - rangebegin : 0;
12676                             if (strict) {
12677                                 vFAIL4("False [] range \"%*.*s\"",
12678                                     w, w, rangebegin);
12679                             }
12680                             else {
12681                                 vWARN4(RExC_parse,
12682                                     "False [] range \"%*.*s\"",
12683                                     w, w, rangebegin);
12684                             }
12685                         }
12686                         if (!SIZE_ONLY) {
12687                             cp_list = add_cp_to_invlist(cp_list, '-');
12688                         }
12689                         element_count++;
12690                     } else
12691                         range = 1;      /* yeah, it's a range! */
12692                     continue;   /* but do it the next time */
12693                 }
12694             }
12695         }
12696
12697         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12698          * if not */
12699
12700         /* non-Latin1 code point implies unicode semantics.  Must be set in
12701          * pass1 so is there for the whole of pass 2 */
12702         if (value > 255) {
12703             RExC_uni_semantics = 1;
12704         }
12705
12706         /* Ready to process either the single value, or the completed range.
12707          * For single-valued non-inverted ranges, we consider the possibility
12708          * of multi-char folds.  (We made a conscious decision to not do this
12709          * for the other cases because it can often lead to non-intuitive
12710          * results.  For example, you have the peculiar case that:
12711          *  "s s" =~ /^[^\xDF]+$/i => Y
12712          *  "ss"  =~ /^[^\xDF]+$/i => N
12713          *
12714          * See [perl #89750] */
12715         if (FOLD && allow_multi_folds && value == prevvalue) {
12716             if (value == LATIN_SMALL_LETTER_SHARP_S
12717                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12718                                                         value)))
12719             {
12720                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12721
12722                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12723                 STRLEN foldlen;
12724
12725                 UV folded = _to_uni_fold_flags(
12726                                 value,
12727                                 foldbuf,
12728                                 &foldlen,
12729                                 FOLD_FLAGS_FULL
12730                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12731                                             : (ASCII_FOLD_RESTRICTED)
12732                                               ? FOLD_FLAGS_NOMIX_ASCII
12733                                               : 0)
12734                                 );
12735
12736                 /* Here, <folded> should be the first character of the
12737                  * multi-char fold of <value>, with <foldbuf> containing the
12738                  * whole thing.  But, if this fold is not allowed (because of
12739                  * the flags), <fold> will be the same as <value>, and should
12740                  * be processed like any other character, so skip the special
12741                  * handling */
12742                 if (folded != value) {
12743
12744                     /* Skip if we are recursed, currently parsing the class
12745                      * again.  Otherwise add this character to the list of
12746                      * multi-char folds. */
12747                     if (! RExC_in_multi_char_class) {
12748                         AV** this_array_ptr;
12749                         AV* this_array;
12750                         STRLEN cp_count = utf8_length(foldbuf,
12751                                                       foldbuf + foldlen);
12752                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12753
12754                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12755
12756
12757                         if (! multi_char_matches) {
12758                             multi_char_matches = newAV();
12759                         }
12760
12761                         /* <multi_char_matches> is actually an array of arrays.
12762                          * There will be one or two top-level elements: [2],
12763                          * and/or [3].  The [2] element is an array, each
12764                          * element thereof is a character which folds to two
12765                          * characters; likewise for [3].  (Unicode guarantees a
12766                          * maximum of 3 characters in any fold.)  When we
12767                          * rewrite the character class below, we will do so
12768                          * such that the longest folds are written first, so
12769                          * that it prefers the longest matching strings first.
12770                          * This is done even if it turns out that any
12771                          * quantifier is non-greedy, out of programmer
12772                          * laziness.  Tom Christiansen has agreed that this is
12773                          * ok.  This makes the test for the ligature 'ffi' come
12774                          * before the test for 'ff' */
12775                         if (av_exists(multi_char_matches, cp_count)) {
12776                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12777                                                              cp_count, FALSE);
12778                             this_array = *this_array_ptr;
12779                         }
12780                         else {
12781                             this_array = newAV();
12782                             av_store(multi_char_matches, cp_count,
12783                                      (SV*) this_array);
12784                         }
12785                         av_push(this_array, multi_fold);
12786                     }
12787
12788                     /* This element should not be processed further in this
12789                      * class */
12790                     element_count--;
12791                     value = save_value;
12792                     prevvalue = save_prevvalue;
12793                     continue;
12794                 }
12795             }
12796         }
12797
12798         /* Deal with this element of the class */
12799         if (! SIZE_ONLY) {
12800 #ifndef EBCDIC
12801             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12802 #else
12803             UV* this_range = _new_invlist(1);
12804             _append_range_to_invlist(this_range, prevvalue, value);
12805
12806             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12807              * If this range was specified using something like 'i-j', we want
12808              * to include only the 'i' and the 'j', and not anything in
12809              * between, so exclude non-ASCII, non-alphabetics from it.
12810              * However, if the range was specified with something like
12811              * [\x89-\x91] or [\x89-j], all code points within it should be
12812              * included.  literal_endpoint==2 means both ends of the range used
12813              * a literal character, not \x{foo} */
12814             if (literal_endpoint == 2
12815                 && (prevvalue >= 'a' && value <= 'z')
12816                     || (prevvalue >= 'A' && value <= 'Z'))
12817             {
12818                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12819                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12820             }
12821             _invlist_union(cp_list, this_range, &cp_list);
12822             literal_endpoint = 0;
12823 #endif
12824         }
12825
12826         range = 0; /* this range (if it was one) is done now */
12827     } /* End of loop through all the text within the brackets */
12828
12829     /* If anything in the class expands to more than one character, we have to
12830      * deal with them by building up a substitute parse string, and recursively
12831      * calling reg() on it, instead of proceeding */
12832     if (multi_char_matches) {
12833         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12834         I32 cp_count;
12835         STRLEN len;
12836         char *save_end = RExC_end;
12837         char *save_parse = RExC_parse;
12838         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
12839                                        a "|" */
12840         I32 reg_flags;
12841
12842         assert(! invert);
12843 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
12844            because too confusing */
12845         if (invert) {
12846             sv_catpv(substitute_parse, "(?:");
12847         }
12848 #endif
12849
12850         /* Look at the longest folds first */
12851         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12852
12853             if (av_exists(multi_char_matches, cp_count)) {
12854                 AV** this_array_ptr;
12855                 SV* this_sequence;
12856
12857                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12858                                                  cp_count, FALSE);
12859                 while ((this_sequence = av_pop(*this_array_ptr)) !=
12860                                                                 &PL_sv_undef)
12861                 {
12862                     if (! first_time) {
12863                         sv_catpv(substitute_parse, "|");
12864                     }
12865                     first_time = FALSE;
12866
12867                     sv_catpv(substitute_parse, SvPVX(this_sequence));
12868                 }
12869             }
12870         }
12871
12872         /* If the character class contains anything else besides these
12873          * multi-character folds, have to include it in recursive parsing */
12874         if (element_count) {
12875             sv_catpv(substitute_parse, "|[");
12876             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12877             sv_catpv(substitute_parse, "]");
12878         }
12879
12880         sv_catpv(substitute_parse, ")");
12881 #if 0
12882         if (invert) {
12883             /* This is a way to get the parse to skip forward a whole named
12884              * sequence instead of matching the 2nd character when it fails the
12885              * first */
12886             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12887         }
12888 #endif
12889
12890         RExC_parse = SvPV(substitute_parse, len);
12891         RExC_end = RExC_parse + len;
12892         RExC_in_multi_char_class = 1;
12893         RExC_emit = (regnode *)orig_emit;
12894
12895         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
12896
12897         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12898
12899         RExC_parse = save_parse;
12900         RExC_end = save_end;
12901         RExC_in_multi_char_class = 0;
12902         SvREFCNT_dec_NN(multi_char_matches);
12903         SvREFCNT_dec_NN(listsv);
12904         return ret;
12905     }
12906
12907     /* If the character class contains only a single element, it may be
12908      * optimizable into another node type which is smaller and runs faster.
12909      * Check if this is the case for this class */
12910     if (element_count == 1 && ! ret_invlist) {
12911         U8 op = END;
12912         U8 arg = 0;
12913
12914         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12915                                               [:digit:] or \p{foo} */
12916
12917             /* All named classes are mapped into POSIXish nodes, with its FLAG
12918              * argument giving which class it is */
12919             switch ((I32)namedclass) {
12920                 case ANYOF_UNIPROP:
12921                     break;
12922
12923                 /* These don't depend on the charset modifiers.  They always
12924                  * match under /u rules */
12925                 case ANYOF_NHORIZWS:
12926                 case ANYOF_HORIZWS:
12927                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
12928                     /* FALLTHROUGH */
12929
12930                 case ANYOF_NVERTWS:
12931                 case ANYOF_VERTWS:
12932                     op = POSIXU;
12933                     goto join_posix;
12934
12935                 /* The actual POSIXish node for all the rest depends on the
12936                  * charset modifier.  The ones in the first set depend only on
12937                  * ASCII or, if available on this platform, locale */
12938                 case ANYOF_ASCII:
12939                 case ANYOF_NASCII:
12940 #ifdef HAS_ISASCII
12941                     op = (LOC) ? POSIXL : POSIXA;
12942 #else
12943                     op = POSIXA;
12944 #endif
12945                     goto join_posix;
12946
12947                 case ANYOF_NCASED:
12948                 case ANYOF_LOWER:
12949                 case ANYOF_NLOWER:
12950                 case ANYOF_UPPER:
12951                 case ANYOF_NUPPER:
12952                     /* under /a could be alpha */
12953                     if (FOLD) {
12954                         if (ASCII_RESTRICTED) {
12955                             namedclass = ANYOF_ALPHA + (namedclass % 2);
12956                         }
12957                         else if (! LOC) {
12958                             break;
12959                         }
12960                     }
12961                     /* FALLTHROUGH */
12962
12963                 /* The rest have more possibilities depending on the charset.
12964                  * We take advantage of the enum ordering of the charset
12965                  * modifiers to get the exact node type, */
12966                 default:
12967                     op = POSIXD + get_regex_charset(RExC_flags);
12968                     if (op > POSIXA) { /* /aa is same as /a */
12969                         op = POSIXA;
12970                     }
12971 #ifndef HAS_ISBLANK
12972                     if (op == POSIXL
12973                         && (namedclass == ANYOF_BLANK
12974                             || namedclass == ANYOF_NBLANK))
12975                     {
12976                         op = POSIXA;
12977                     }
12978 #endif
12979
12980                 join_posix:
12981                     /* The odd numbered ones are the complements of the
12982                      * next-lower even number one */
12983                     if (namedclass % 2 == 1) {
12984                         invert = ! invert;
12985                         namedclass--;
12986                     }
12987                     arg = namedclass_to_classnum(namedclass);
12988                     break;
12989             }
12990         }
12991         else if (value == prevvalue) {
12992
12993             /* Here, the class consists of just a single code point */
12994
12995             if (invert) {
12996                 if (! LOC && value == '\n') {
12997                     op = REG_ANY; /* Optimize [^\n] */
12998                     *flagp |= HASWIDTH|SIMPLE;
12999                     RExC_naughty++;
13000                 }
13001             }
13002             else if (value < 256 || UTF) {
13003
13004                 /* Optimize a single value into an EXACTish node, but not if it
13005                  * would require converting the pattern to UTF-8. */
13006                 op = compute_EXACTish(pRExC_state);
13007             }
13008         } /* Otherwise is a range */
13009         else if (! LOC) {   /* locale could vary these */
13010             if (prevvalue == '0') {
13011                 if (value == '9') {
13012                     arg = _CC_DIGIT;
13013                     op = POSIXA;
13014                 }
13015             }
13016         }
13017
13018         /* Here, we have changed <op> away from its initial value iff we found
13019          * an optimization */
13020         if (op != END) {
13021
13022             /* Throw away this ANYOF regnode, and emit the calculated one,
13023              * which should correspond to the beginning, not current, state of
13024              * the parse */
13025             const char * cur_parse = RExC_parse;
13026             RExC_parse = (char *)orig_parse;
13027             if ( SIZE_ONLY) {
13028                 if (! LOC) {
13029
13030                     /* To get locale nodes to not use the full ANYOF size would
13031                      * require moving the code above that writes the portions
13032                      * of it that aren't in other nodes to after this point.
13033                      * e.g.  ANYOF_CLASS_SET */
13034                     RExC_size = orig_size;
13035                 }
13036             }
13037             else {
13038                 RExC_emit = (regnode *)orig_emit;
13039                 if (PL_regkind[op] == POSIXD) {
13040                     if (invert) {
13041                         op += NPOSIXD - POSIXD;
13042                     }
13043                 }
13044             }
13045
13046             ret = reg_node(pRExC_state, op);
13047
13048             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13049                 if (! SIZE_ONLY) {
13050                     FLAGS(ret) = arg;
13051                 }
13052                 *flagp |= HASWIDTH|SIMPLE;
13053             }
13054             else if (PL_regkind[op] == EXACT) {
13055                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13056             }
13057
13058             RExC_parse = (char *) cur_parse;
13059
13060             SvREFCNT_dec(posixes);
13061             SvREFCNT_dec_NN(listsv);
13062             SvREFCNT_dec(cp_list);
13063             return ret;
13064         }
13065     }
13066
13067     if (SIZE_ONLY)
13068         return ret;
13069     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13070
13071     /* If folding, we calculate all characters that could fold to or from the
13072      * ones already on the list */
13073     if (FOLD && cp_list) {
13074         UV start, end;  /* End points of code point ranges */
13075
13076         SV* fold_intersection = NULL;
13077
13078         /* If the highest code point is within Latin1, we can use the
13079          * compiled-in Alphas list, and not have to go out to disk.  This
13080          * yields two false positives, the masculine and feminine ordinal
13081          * indicators, which are weeded out below using the
13082          * IS_IN_SOME_FOLD_L1() macro */
13083         if (invlist_highest(cp_list) < 256) {
13084             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13085                                                            &fold_intersection);
13086         }
13087         else {
13088
13089             /* Here, there are non-Latin1 code points, so we will have to go
13090              * fetch the list of all the characters that participate in folds
13091              */
13092             if (! PL_utf8_foldable) {
13093                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13094                                        &PL_sv_undef, 1, 0);
13095                 PL_utf8_foldable = _get_swash_invlist(swash);
13096                 SvREFCNT_dec_NN(swash);
13097             }
13098
13099             /* This is a hash that for a particular fold gives all characters
13100              * that are involved in it */
13101             if (! PL_utf8_foldclosures) {
13102
13103                 /* If we were unable to find any folds, then we likely won't be
13104                  * able to find the closures.  So just create an empty list.
13105                  * Folding will effectively be restricted to the non-Unicode
13106                  * rules hard-coded into Perl.  (This case happens legitimately
13107                  * during compilation of Perl itself before the Unicode tables
13108                  * are generated) */
13109                 if (_invlist_len(PL_utf8_foldable) == 0) {
13110                     PL_utf8_foldclosures = newHV();
13111                 }
13112                 else {
13113                     /* If the folds haven't been read in, call a fold function
13114                      * to force that */
13115                     if (! PL_utf8_tofold) {
13116                         U8 dummy[UTF8_MAXBYTES+1];
13117
13118                         /* This string is just a short named one above \xff */
13119                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13120                         assert(PL_utf8_tofold); /* Verify that worked */
13121                     }
13122                     PL_utf8_foldclosures =
13123                                     _swash_inversion_hash(PL_utf8_tofold);
13124                 }
13125             }
13126
13127             /* Only the characters in this class that participate in folds need
13128              * be checked.  Get the intersection of this class and all the
13129              * possible characters that are foldable.  This can quickly narrow
13130              * down a large class */
13131             _invlist_intersection(PL_utf8_foldable, cp_list,
13132                                   &fold_intersection);
13133         }
13134
13135         /* Now look at the foldable characters in this class individually */
13136         invlist_iterinit(fold_intersection);
13137         while (invlist_iternext(fold_intersection, &start, &end)) {
13138             UV j;
13139
13140             /* Locale folding for Latin1 characters is deferred until runtime */
13141             if (LOC && start < 256) {
13142                 start = 256;
13143             }
13144
13145             /* Look at every character in the range */
13146             for (j = start; j <= end; j++) {
13147
13148                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13149                 STRLEN foldlen;
13150                 SV** listp;
13151
13152                 if (j < 256) {
13153
13154                     /* We have the latin1 folding rules hard-coded here so that
13155                      * an innocent-looking character class, like /[ks]/i won't
13156                      * have to go out to disk to find the possible matches.
13157                      * XXX It would be better to generate these via regen, in
13158                      * case a new version of the Unicode standard adds new
13159                      * mappings, though that is not really likely, and may be
13160                      * caught by the default: case of the switch below. */
13161
13162                     if (IS_IN_SOME_FOLD_L1(j)) {
13163
13164                         /* ASCII is always matched; non-ASCII is matched only
13165                          * under Unicode rules */
13166                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13167                             cp_list =
13168                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13169                         }
13170                         else {
13171                             depends_list =
13172                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13173                         }
13174                     }
13175
13176                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13177                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13178                     {
13179                         /* Certain Latin1 characters have matches outside
13180                          * Latin1.  To get here, <j> is one of those
13181                          * characters.   None of these matches is valid for
13182                          * ASCII characters under /aa, which is why the 'if'
13183                          * just above excludes those.  These matches only
13184                          * happen when the target string is utf8.  The code
13185                          * below adds the single fold closures for <j> to the
13186                          * inversion list. */
13187                         switch (j) {
13188                             case 'k':
13189                             case 'K':
13190                                 cp_list =
13191                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
13192                                 break;
13193                             case 's':
13194                             case 'S':
13195                                 cp_list = add_cp_to_invlist(cp_list,
13196                                                     LATIN_SMALL_LETTER_LONG_S);
13197                                 break;
13198                             case MICRO_SIGN:
13199                                 cp_list = add_cp_to_invlist(cp_list,
13200                                                     GREEK_CAPITAL_LETTER_MU);
13201                                 cp_list = add_cp_to_invlist(cp_list,
13202                                                     GREEK_SMALL_LETTER_MU);
13203                                 break;
13204                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13205                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13206                                 cp_list =
13207                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13208                                 break;
13209                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13210                                 cp_list = add_cp_to_invlist(cp_list,
13211                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13212                                 break;
13213                             case LATIN_SMALL_LETTER_SHARP_S:
13214                                 cp_list = add_cp_to_invlist(cp_list,
13215                                                 LATIN_CAPITAL_LETTER_SHARP_S);
13216                                 break;
13217                             case 'F': case 'f':
13218                             case 'I': case 'i':
13219                             case 'L': case 'l':
13220                             case 'T': case 't':
13221                             case 'A': case 'a':
13222                             case 'H': case 'h':
13223                             case 'J': case 'j':
13224                             case 'N': case 'n':
13225                             case 'W': case 'w':
13226                             case 'Y': case 'y':
13227                                 /* These all are targets of multi-character
13228                                  * folds from code points that require UTF8 to
13229                                  * express, so they can't match unless the
13230                                  * target string is in UTF-8, so no action here
13231                                  * is necessary, as regexec.c properly handles
13232                                  * the general case for UTF-8 matching and
13233                                  * multi-char folds */
13234                                 break;
13235                             default:
13236                                 /* Use deprecated warning to increase the
13237                                  * chances of this being output */
13238                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13239                                 break;
13240                         }
13241                     }
13242                     continue;
13243                 }
13244
13245                 /* Here is an above Latin1 character.  We don't have the rules
13246                  * hard-coded for it.  First, get its fold.  This is the simple
13247                  * fold, as the multi-character folds have been handled earlier
13248                  * and separated out */
13249                 _to_uni_fold_flags(j, foldbuf, &foldlen,
13250                                                ((LOC)
13251                                                ? FOLD_FLAGS_LOCALE
13252                                                : (ASCII_FOLD_RESTRICTED)
13253                                                   ? FOLD_FLAGS_NOMIX_ASCII
13254                                                   : 0));
13255
13256                 /* Single character fold of above Latin1.  Add everything in
13257                  * its fold closure to the list that this node should match.
13258                  * The fold closures data structure is a hash with the keys
13259                  * being the UTF-8 of every character that is folded to, like
13260                  * 'k', and the values each an array of all code points that
13261                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13262                  * Multi-character folds are not included */
13263                 if ((listp = hv_fetch(PL_utf8_foldclosures,
13264                                       (char *) foldbuf, foldlen, FALSE)))
13265                 {
13266                     AV* list = (AV*) *listp;
13267                     IV k;
13268                     for (k = 0; k <= av_len(list); k++) {
13269                         SV** c_p = av_fetch(list, k, FALSE);
13270                         UV c;
13271                         if (c_p == NULL) {
13272                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13273                         }
13274                         c = SvUV(*c_p);
13275
13276                         /* /aa doesn't allow folds between ASCII and non-; /l
13277                          * doesn't allow them between above and below 256 */
13278                         if ((ASCII_FOLD_RESTRICTED
13279                                   && (isASCII(c) != isASCII(j)))
13280                             || (LOC && ((c < 256) != (j < 256))))
13281                         {
13282                             continue;
13283                         }
13284
13285                         /* Folds involving non-ascii Latin1 characters
13286                          * under /d are added to a separate list */
13287                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13288                         {
13289                             cp_list = add_cp_to_invlist(cp_list, c);
13290                         }
13291                         else {
13292                           depends_list = add_cp_to_invlist(depends_list, c);
13293                         }
13294                     }
13295                 }
13296             }
13297         }
13298         SvREFCNT_dec_NN(fold_intersection);
13299     }
13300
13301     /* And combine the result (if any) with any inversion list from posix
13302      * classes.  The lists are kept separate up to now because we don't want to
13303      * fold the classes (folding of those is automatically handled by the swash
13304      * fetching code) */
13305     if (posixes) {
13306         if (! DEPENDS_SEMANTICS) {
13307             if (cp_list) {
13308                 _invlist_union(cp_list, posixes, &cp_list);
13309                 SvREFCNT_dec_NN(posixes);
13310             }
13311             else {
13312                 cp_list = posixes;
13313             }
13314         }
13315         else {
13316             /* Under /d, we put into a separate list the Latin1 things that
13317              * match only when the target string is utf8 */
13318             SV* nonascii_but_latin1_properties = NULL;
13319             _invlist_intersection(posixes, PL_Latin1,
13320                                   &nonascii_but_latin1_properties);
13321             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13322                               &nonascii_but_latin1_properties);
13323             _invlist_subtract(posixes, nonascii_but_latin1_properties,
13324                               &posixes);
13325             if (cp_list) {
13326                 _invlist_union(cp_list, posixes, &cp_list);
13327                 SvREFCNT_dec_NN(posixes);
13328             }
13329             else {
13330                 cp_list = posixes;
13331             }
13332
13333             if (depends_list) {
13334                 _invlist_union(depends_list, nonascii_but_latin1_properties,
13335                                &depends_list);
13336                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13337             }
13338             else {
13339                 depends_list = nonascii_but_latin1_properties;
13340             }
13341         }
13342     }
13343
13344     /* And combine the result (if any) with any inversion list from properties.
13345      * The lists are kept separate up to now so that we can distinguish the two
13346      * in regards to matching above-Unicode.  A run-time warning is generated
13347      * if a Unicode property is matched against a non-Unicode code point. But,
13348      * we allow user-defined properties to match anything, without any warning,
13349      * and we also suppress the warning if there is a portion of the character
13350      * class that isn't a Unicode property, and which matches above Unicode, \W
13351      * or [\x{110000}] for example.
13352      * (Note that in this case, unlike the Posix one above, there is no
13353      * <depends_list>, because having a Unicode property forces Unicode
13354      * semantics */
13355     if (properties) {
13356         bool warn_super = ! has_user_defined_property;
13357         if (cp_list) {
13358
13359             /* If it matters to the final outcome, see if a non-property
13360              * component of the class matches above Unicode.  If so, the
13361              * warning gets suppressed.  This is true even if just a single
13362              * such code point is specified, as though not strictly correct if
13363              * another such code point is matched against, the fact that they
13364              * are using above-Unicode code points indicates they should know
13365              * the issues involved */
13366             if (warn_super) {
13367                 bool non_prop_matches_above_Unicode =
13368                             runtime_posix_matches_above_Unicode
13369                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13370                 if (invert) {
13371                     non_prop_matches_above_Unicode =
13372                                             !  non_prop_matches_above_Unicode;
13373                 }
13374                 warn_super = ! non_prop_matches_above_Unicode;
13375             }
13376
13377             _invlist_union(properties, cp_list, &cp_list);
13378             SvREFCNT_dec_NN(properties);
13379         }
13380         else {
13381             cp_list = properties;
13382         }
13383
13384         if (warn_super) {
13385             OP(ret) = ANYOF_WARN_SUPER;
13386         }
13387     }
13388
13389     /* Here, we have calculated what code points should be in the character
13390      * class.
13391      *
13392      * Now we can see about various optimizations.  Fold calculation (which we
13393      * did above) needs to take place before inversion.  Otherwise /[^k]/i
13394      * would invert to include K, which under /i would match k, which it
13395      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13396      * folded until runtime */
13397
13398     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13399      * at compile time.  Besides not inverting folded locale now, we can't
13400      * invert if there are things such as \w, which aren't known until runtime
13401      * */
13402     if (invert
13403         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13404         && ! depends_list
13405         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13406     {
13407         _invlist_invert(cp_list);
13408
13409         /* Any swash can't be used as-is, because we've inverted things */
13410         if (swash) {
13411             SvREFCNT_dec_NN(swash);
13412             swash = NULL;
13413         }
13414
13415         /* Clear the invert flag since have just done it here */
13416         invert = FALSE;
13417     }
13418
13419     if (ret_invlist) {
13420         *ret_invlist = cp_list;
13421
13422         /* Discard the generated node */
13423         if (SIZE_ONLY) {
13424             RExC_size = orig_size;
13425         }
13426         else {
13427             RExC_emit = orig_emit;
13428         }
13429         return END;
13430     }
13431
13432     /* If we didn't do folding, it's because some information isn't available
13433      * until runtime; set the run-time fold flag for these.  (We don't have to
13434      * worry about properties folding, as that is taken care of by the swash
13435      * fetching) */
13436     if (FOLD && LOC)
13437     {
13438        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13439     }
13440
13441     /* Some character classes are equivalent to other nodes.  Such nodes take
13442      * up less room and generally fewer operations to execute than ANYOF nodes.
13443      * Above, we checked for and optimized into some such equivalents for
13444      * certain common classes that are easy to test.  Getting to this point in
13445      * the code means that the class didn't get optimized there.  Since this
13446      * code is only executed in Pass 2, it is too late to save space--it has
13447      * been allocated in Pass 1, and currently isn't given back.  But turning
13448      * things into an EXACTish node can allow the optimizer to join it to any
13449      * adjacent such nodes.  And if the class is equivalent to things like /./,
13450      * expensive run-time swashes can be avoided.  Now that we have more
13451      * complete information, we can find things necessarily missed by the
13452      * earlier code.  I (khw) am not sure how much to look for here.  It would
13453      * be easy, but perhaps too slow, to check any candidates against all the
13454      * node types they could possibly match using _invlistEQ(). */
13455
13456     if (cp_list
13457         && ! invert
13458         && ! depends_list
13459         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13460         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13461     {
13462         UV start, end;
13463         U8 op = END;  /* The optimzation node-type */
13464         const char * cur_parse= RExC_parse;
13465
13466         invlist_iterinit(cp_list);
13467         if (! invlist_iternext(cp_list, &start, &end)) {
13468
13469             /* Here, the list is empty.  This happens, for example, when a
13470              * Unicode property is the only thing in the character class, and
13471              * it doesn't match anything.  (perluniprops.pod notes such
13472              * properties) */
13473             op = OPFAIL;
13474             *flagp |= HASWIDTH|SIMPLE;
13475         }
13476         else if (start == end) {    /* The range is a single code point */
13477             if (! invlist_iternext(cp_list, &start, &end)
13478
13479                     /* Don't do this optimization if it would require changing
13480                      * the pattern to UTF-8 */
13481                 && (start < 256 || UTF))
13482             {
13483                 /* Here, the list contains a single code point.  Can optimize
13484                  * into an EXACT node */
13485
13486                 value = start;
13487
13488                 if (! FOLD) {
13489                     op = EXACT;
13490                 }
13491                 else if (LOC) {
13492
13493                     /* A locale node under folding with one code point can be
13494                      * an EXACTFL, as its fold won't be calculated until
13495                      * runtime */
13496                     op = EXACTFL;
13497                 }
13498                 else {
13499
13500                     /* Here, we are generally folding, but there is only one
13501                      * code point to match.  If we have to, we use an EXACT
13502                      * node, but it would be better for joining with adjacent
13503                      * nodes in the optimization pass if we used the same
13504                      * EXACTFish node that any such are likely to be.  We can
13505                      * do this iff the code point doesn't participate in any
13506                      * folds.  For example, an EXACTF of a colon is the same as
13507                      * an EXACT one, since nothing folds to or from a colon. */
13508                     if (value < 256) {
13509                         if (IS_IN_SOME_FOLD_L1(value)) {
13510                             op = EXACT;
13511                         }
13512                     }
13513                     else {
13514                         if (! PL_utf8_foldable) {
13515                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13516                                                 &PL_sv_undef, 1, 0);
13517                             PL_utf8_foldable = _get_swash_invlist(swash);
13518                             SvREFCNT_dec_NN(swash);
13519                         }
13520                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13521                             op = EXACT;
13522                         }
13523                     }
13524
13525                     /* If we haven't found the node type, above, it means we
13526                      * can use the prevailing one */
13527                     if (op == END) {
13528                         op = compute_EXACTish(pRExC_state);
13529                     }
13530                 }
13531             }
13532         }
13533         else if (start == 0) {
13534             if (end == UV_MAX) {
13535                 op = SANY;
13536                 *flagp |= HASWIDTH|SIMPLE;
13537                 RExC_naughty++;
13538             }
13539             else if (end == '\n' - 1
13540                     && invlist_iternext(cp_list, &start, &end)
13541                     && start == '\n' + 1 && end == UV_MAX)
13542             {
13543                 op = REG_ANY;
13544                 *flagp |= HASWIDTH|SIMPLE;
13545                 RExC_naughty++;
13546             }
13547         }
13548         invlist_iterfinish(cp_list);
13549
13550         if (op != END) {
13551             RExC_parse = (char *)orig_parse;
13552             RExC_emit = (regnode *)orig_emit;
13553
13554             ret = reg_node(pRExC_state, op);
13555
13556             RExC_parse = (char *)cur_parse;
13557
13558             if (PL_regkind[op] == EXACT) {
13559                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13560             }
13561
13562             SvREFCNT_dec_NN(cp_list);
13563             SvREFCNT_dec_NN(listsv);
13564             return ret;
13565         }
13566     }
13567
13568     /* Here, <cp_list> contains all the code points we can determine at
13569      * compile time that match under all conditions.  Go through it, and
13570      * for things that belong in the bitmap, put them there, and delete from
13571      * <cp_list>.  While we are at it, see if everything above 255 is in the
13572      * list, and if so, set a flag to speed up execution */
13573     ANYOF_BITMAP_ZERO(ret);
13574     if (cp_list) {
13575
13576         /* This gets set if we actually need to modify things */
13577         bool change_invlist = FALSE;
13578
13579         UV start, end;
13580
13581         /* Start looking through <cp_list> */
13582         invlist_iterinit(cp_list);
13583         while (invlist_iternext(cp_list, &start, &end)) {
13584             UV high;
13585             int i;
13586
13587             if (end == UV_MAX && start <= 256) {
13588                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13589             }
13590
13591             /* Quit if are above what we should change */
13592             if (start > 255) {
13593                 break;
13594             }
13595
13596             change_invlist = TRUE;
13597
13598             /* Set all the bits in the range, up to the max that we are doing */
13599             high = (end < 255) ? end : 255;
13600             for (i = start; i <= (int) high; i++) {
13601                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13602                     ANYOF_BITMAP_SET(ret, i);
13603                     prevvalue = value;
13604                     value = i;
13605                 }
13606             }
13607         }
13608         invlist_iterfinish(cp_list);
13609
13610         /* Done with loop; remove any code points that are in the bitmap from
13611          * <cp_list> */
13612         if (change_invlist) {
13613             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13614         }
13615
13616         /* If have completely emptied it, remove it completely */
13617         if (_invlist_len(cp_list) == 0) {
13618             SvREFCNT_dec_NN(cp_list);
13619             cp_list = NULL;
13620         }
13621     }
13622
13623     if (invert) {
13624         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13625     }
13626
13627     /* Here, the bitmap has been populated with all the Latin1 code points that
13628      * always match.  Can now add to the overall list those that match only
13629      * when the target string is UTF-8 (<depends_list>). */
13630     if (depends_list) {
13631         if (cp_list) {
13632             _invlist_union(cp_list, depends_list, &cp_list);
13633             SvREFCNT_dec_NN(depends_list);
13634         }
13635         else {
13636             cp_list = depends_list;
13637         }
13638     }
13639
13640     /* If there is a swash and more than one element, we can't use the swash in
13641      * the optimization below. */
13642     if (swash && element_count > 1) {
13643         SvREFCNT_dec_NN(swash);
13644         swash = NULL;
13645     }
13646
13647     if (! cp_list
13648         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13649     {
13650         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13651         SvREFCNT_dec_NN(listsv);
13652     }
13653     else {
13654         /* av[0] stores the character class description in its textual form:
13655          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13656          *       appropriate swash, and is also useful for dumping the regnode.
13657          * av[1] if NULL, is a placeholder to later contain the swash computed
13658          *       from av[0].  But if no further computation need be done, the
13659          *       swash is stored there now.
13660          * av[2] stores the cp_list inversion list for use in addition or
13661          *       instead of av[0]; used only if av[1] is NULL
13662          * av[3] is set if any component of the class is from a user-defined
13663          *       property; used only if av[1] is NULL */
13664         AV * const av = newAV();
13665         SV *rv;
13666
13667         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13668                         ? listsv
13669                         : (SvREFCNT_dec_NN(listsv), &PL_sv_undef));
13670         if (swash) {
13671             av_store(av, 1, swash);
13672             SvREFCNT_dec_NN(cp_list);
13673         }
13674         else {
13675             av_store(av, 1, NULL);
13676             if (cp_list) {
13677                 av_store(av, 2, cp_list);
13678                 av_store(av, 3, newSVuv(has_user_defined_property));
13679             }
13680         }
13681
13682         rv = newRV_noinc(MUTABLE_SV(av));
13683         n = add_data(pRExC_state, 1, "s");
13684         RExC_rxi->data->data[n] = (void*)rv;
13685         ARG_SET(ret, n);
13686     }
13687
13688     *flagp |= HASWIDTH|SIMPLE;
13689     return ret;
13690 }
13691 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13692
13693
13694 /* reg_skipcomment()
13695
13696    Absorbs an /x style # comments from the input stream.
13697    Returns true if there is more text remaining in the stream.
13698    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13699    terminates the pattern without including a newline.
13700
13701    Note its the callers responsibility to ensure that we are
13702    actually in /x mode
13703
13704 */
13705
13706 STATIC bool
13707 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13708 {
13709     bool ended = 0;
13710
13711     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13712
13713     while (RExC_parse < RExC_end)
13714         if (*RExC_parse++ == '\n') {
13715             ended = 1;
13716             break;
13717         }
13718     if (!ended) {
13719         /* we ran off the end of the pattern without ending
13720            the comment, so we have to add an \n when wrapping */
13721         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13722         return 0;
13723     } else
13724         return 1;
13725 }
13726
13727 /* nextchar()
13728
13729    Advances the parse position, and optionally absorbs
13730    "whitespace" from the inputstream.
13731
13732    Without /x "whitespace" means (?#...) style comments only,
13733    with /x this means (?#...) and # comments and whitespace proper.
13734
13735    Returns the RExC_parse point from BEFORE the scan occurs.
13736
13737    This is the /x friendly way of saying RExC_parse++.
13738 */
13739
13740 STATIC char*
13741 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13742 {
13743     char* const retval = RExC_parse++;
13744
13745     PERL_ARGS_ASSERT_NEXTCHAR;
13746
13747     for (;;) {
13748         if (RExC_end - RExC_parse >= 3
13749             && *RExC_parse == '('
13750             && RExC_parse[1] == '?'
13751             && RExC_parse[2] == '#')
13752         {
13753             while (*RExC_parse != ')') {
13754                 if (RExC_parse == RExC_end)
13755                     FAIL("Sequence (?#... not terminated");
13756                 RExC_parse++;
13757             }
13758             RExC_parse++;
13759             continue;
13760         }
13761         if (RExC_flags & RXf_PMf_EXTENDED) {
13762             if (isSPACE(*RExC_parse)) {
13763                 RExC_parse++;
13764                 continue;
13765             }
13766             else if (*RExC_parse == '#') {
13767                 if ( reg_skipcomment( pRExC_state ) )
13768                     continue;
13769             }
13770         }
13771         return retval;
13772     }
13773 }
13774
13775 /*
13776 - reg_node - emit a node
13777 */
13778 STATIC regnode *                        /* Location. */
13779 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13780 {
13781     dVAR;
13782     regnode *ptr;
13783     regnode * const ret = RExC_emit;
13784     GET_RE_DEBUG_FLAGS_DECL;
13785
13786     PERL_ARGS_ASSERT_REG_NODE;
13787
13788     if (SIZE_ONLY) {
13789         SIZE_ALIGN(RExC_size);
13790         RExC_size += 1;
13791         return(ret);
13792     }
13793     if (RExC_emit >= RExC_emit_bound)
13794         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13795                    op, RExC_emit, RExC_emit_bound);
13796
13797     NODE_ALIGN_FILL(ret);
13798     ptr = ret;
13799     FILL_ADVANCE_NODE(ptr, op);
13800 #ifdef RE_TRACK_PATTERN_OFFSETS
13801     if (RExC_offsets) {         /* MJD */
13802         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13803               "reg_node", __LINE__,
13804               PL_reg_name[op],
13805               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13806                 ? "Overwriting end of array!\n" : "OK",
13807               (UV)(RExC_emit - RExC_emit_start),
13808               (UV)(RExC_parse - RExC_start),
13809               (UV)RExC_offsets[0]));
13810         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13811     }
13812 #endif
13813     RExC_emit = ptr;
13814     return(ret);
13815 }
13816
13817 /*
13818 - reganode - emit a node with an argument
13819 */
13820 STATIC regnode *                        /* Location. */
13821 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13822 {
13823     dVAR;
13824     regnode *ptr;
13825     regnode * const ret = RExC_emit;
13826     GET_RE_DEBUG_FLAGS_DECL;
13827
13828     PERL_ARGS_ASSERT_REGANODE;
13829
13830     if (SIZE_ONLY) {
13831         SIZE_ALIGN(RExC_size);
13832         RExC_size += 2;
13833         /*
13834            We can't do this:
13835
13836            assert(2==regarglen[op]+1);
13837
13838            Anything larger than this has to allocate the extra amount.
13839            If we changed this to be:
13840
13841            RExC_size += (1 + regarglen[op]);
13842
13843            then it wouldn't matter. Its not clear what side effect
13844            might come from that so its not done so far.
13845            -- dmq
13846         */
13847         return(ret);
13848     }
13849     if (RExC_emit >= RExC_emit_bound)
13850         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13851                    op, RExC_emit, RExC_emit_bound);
13852
13853     NODE_ALIGN_FILL(ret);
13854     ptr = ret;
13855     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13856 #ifdef RE_TRACK_PATTERN_OFFSETS
13857     if (RExC_offsets) {         /* MJD */
13858         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13859               "reganode",
13860               __LINE__,
13861               PL_reg_name[op],
13862               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13863               "Overwriting end of array!\n" : "OK",
13864               (UV)(RExC_emit - RExC_emit_start),
13865               (UV)(RExC_parse - RExC_start),
13866               (UV)RExC_offsets[0]));
13867         Set_Cur_Node_Offset;
13868     }
13869 #endif
13870     RExC_emit = ptr;
13871     return(ret);
13872 }
13873
13874 /*
13875 - reguni - emit (if appropriate) a Unicode character
13876 */
13877 STATIC STRLEN
13878 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13879 {
13880     dVAR;
13881
13882     PERL_ARGS_ASSERT_REGUNI;
13883
13884     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13885 }
13886
13887 /*
13888 - reginsert - insert an operator in front of already-emitted operand
13889 *
13890 * Means relocating the operand.
13891 */
13892 STATIC void
13893 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13894 {
13895     dVAR;
13896     regnode *src;
13897     regnode *dst;
13898     regnode *place;
13899     const int offset = regarglen[(U8)op];
13900     const int size = NODE_STEP_REGNODE + offset;
13901     GET_RE_DEBUG_FLAGS_DECL;
13902
13903     PERL_ARGS_ASSERT_REGINSERT;
13904     PERL_UNUSED_ARG(depth);
13905 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13906     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13907     if (SIZE_ONLY) {
13908         RExC_size += size;
13909         return;
13910     }
13911
13912     src = RExC_emit;
13913     RExC_emit += size;
13914     dst = RExC_emit;
13915     if (RExC_open_parens) {
13916         int paren;
13917         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13918         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13919             if ( RExC_open_parens[paren] >= opnd ) {
13920                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13921                 RExC_open_parens[paren] += size;
13922             } else {
13923                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13924             }
13925             if ( RExC_close_parens[paren] >= opnd ) {
13926                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13927                 RExC_close_parens[paren] += size;
13928             } else {
13929                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13930             }
13931         }
13932     }
13933
13934     while (src > opnd) {
13935         StructCopy(--src, --dst, regnode);
13936 #ifdef RE_TRACK_PATTERN_OFFSETS
13937         if (RExC_offsets) {     /* MJD 20010112 */
13938             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13939                   "reg_insert",
13940                   __LINE__,
13941                   PL_reg_name[op],
13942                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13943                     ? "Overwriting end of array!\n" : "OK",
13944                   (UV)(src - RExC_emit_start),
13945                   (UV)(dst - RExC_emit_start),
13946                   (UV)RExC_offsets[0]));
13947             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13948             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13949         }
13950 #endif
13951     }
13952
13953
13954     place = opnd;               /* Op node, where operand used to be. */
13955 #ifdef RE_TRACK_PATTERN_OFFSETS
13956     if (RExC_offsets) {         /* MJD */
13957         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13958               "reginsert",
13959               __LINE__,
13960               PL_reg_name[op],
13961               (UV)(place - RExC_emit_start) > RExC_offsets[0]
13962               ? "Overwriting end of array!\n" : "OK",
13963               (UV)(place - RExC_emit_start),
13964               (UV)(RExC_parse - RExC_start),
13965               (UV)RExC_offsets[0]));
13966         Set_Node_Offset(place, RExC_parse);
13967         Set_Node_Length(place, 1);
13968     }
13969 #endif
13970     src = NEXTOPER(place);
13971     FILL_ADVANCE_NODE(place, op);
13972     Zero(src, offset, regnode);
13973 }
13974
13975 /*
13976 - regtail - set the next-pointer at the end of a node chain of p to val.
13977 - SEE ALSO: regtail_study
13978 */
13979 /* TODO: All three parms should be const */
13980 STATIC void
13981 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13982 {
13983     dVAR;
13984     regnode *scan;
13985     GET_RE_DEBUG_FLAGS_DECL;
13986
13987     PERL_ARGS_ASSERT_REGTAIL;
13988 #ifndef DEBUGGING
13989     PERL_UNUSED_ARG(depth);
13990 #endif
13991
13992     if (SIZE_ONLY)
13993         return;
13994
13995     /* Find last node. */
13996     scan = p;
13997     for (;;) {
13998         regnode * const temp = regnext(scan);
13999         DEBUG_PARSE_r({
14000             SV * const mysv=sv_newmortal();
14001             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14002             regprop(RExC_rx, mysv, scan);
14003             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14004                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14005                     (temp == NULL ? "->" : ""),
14006                     (temp == NULL ? PL_reg_name[OP(val)] : "")
14007             );
14008         });
14009         if (temp == NULL)
14010             break;
14011         scan = temp;
14012     }
14013
14014     if (reg_off_by_arg[OP(scan)]) {
14015         ARG_SET(scan, val - scan);
14016     }
14017     else {
14018         NEXT_OFF(scan) = val - scan;
14019     }
14020 }
14021
14022 #ifdef DEBUGGING
14023 /*
14024 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14025 - Look for optimizable sequences at the same time.
14026 - currently only looks for EXACT chains.
14027
14028 This is experimental code. The idea is to use this routine to perform
14029 in place optimizations on branches and groups as they are constructed,
14030 with the long term intention of removing optimization from study_chunk so
14031 that it is purely analytical.
14032
14033 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14034 to control which is which.
14035
14036 */
14037 /* TODO: All four parms should be const */
14038
14039 STATIC U8
14040 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14041 {
14042     dVAR;
14043     regnode *scan;
14044     U8 exact = PSEUDO;
14045 #ifdef EXPERIMENTAL_INPLACESCAN
14046     I32 min = 0;
14047 #endif
14048     GET_RE_DEBUG_FLAGS_DECL;
14049
14050     PERL_ARGS_ASSERT_REGTAIL_STUDY;
14051
14052
14053     if (SIZE_ONLY)
14054         return exact;
14055
14056     /* Find last node. */
14057
14058     scan = p;
14059     for (;;) {
14060         regnode * const temp = regnext(scan);
14061 #ifdef EXPERIMENTAL_INPLACESCAN
14062         if (PL_regkind[OP(scan)] == EXACT) {
14063             bool has_exactf_sharp_s;    /* Unexamined in this routine */
14064             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14065                 return EXACT;
14066         }
14067 #endif
14068         if ( exact ) {
14069             switch (OP(scan)) {
14070                 case EXACT:
14071                 case EXACTF:
14072                 case EXACTFA:
14073                 case EXACTFU:
14074                 case EXACTFU_SS:
14075                 case EXACTFU_TRICKYFOLD:
14076                 case EXACTFL:
14077                         if( exact == PSEUDO )
14078                             exact= OP(scan);
14079                         else if ( exact != OP(scan) )
14080                             exact= 0;
14081                 case NOTHING:
14082                     break;
14083                 default:
14084                     exact= 0;
14085             }
14086         }
14087         DEBUG_PARSE_r({
14088             SV * const mysv=sv_newmortal();
14089             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14090             regprop(RExC_rx, mysv, scan);
14091             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14092                 SvPV_nolen_const(mysv),
14093                 REG_NODE_NUM(scan),
14094                 PL_reg_name[exact]);
14095         });
14096         if (temp == NULL)
14097             break;
14098         scan = temp;
14099     }
14100     DEBUG_PARSE_r({
14101         SV * const mysv_val=sv_newmortal();
14102         DEBUG_PARSE_MSG("");
14103         regprop(RExC_rx, mysv_val, val);
14104         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14105                       SvPV_nolen_const(mysv_val),
14106                       (IV)REG_NODE_NUM(val),
14107                       (IV)(val - scan)
14108         );
14109     });
14110     if (reg_off_by_arg[OP(scan)]) {
14111         ARG_SET(scan, val - scan);
14112     }
14113     else {
14114         NEXT_OFF(scan) = val - scan;
14115     }
14116
14117     return exact;
14118 }
14119 #endif
14120
14121 /*
14122  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14123  */
14124 #ifdef DEBUGGING
14125 static void
14126 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14127 {
14128     int bit;
14129     int set=0;
14130     regex_charset cs;
14131
14132     for (bit=0; bit<32; bit++) {
14133         if (flags & (1<<bit)) {
14134             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
14135                 continue;
14136             }
14137             if (!set++ && lead)
14138                 PerlIO_printf(Perl_debug_log, "%s",lead);
14139             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14140         }
14141     }
14142     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14143             if (!set++ && lead) {
14144                 PerlIO_printf(Perl_debug_log, "%s",lead);
14145             }
14146             switch (cs) {
14147                 case REGEX_UNICODE_CHARSET:
14148                     PerlIO_printf(Perl_debug_log, "UNICODE");
14149                     break;
14150                 case REGEX_LOCALE_CHARSET:
14151                     PerlIO_printf(Perl_debug_log, "LOCALE");
14152                     break;
14153                 case REGEX_ASCII_RESTRICTED_CHARSET:
14154                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14155                     break;
14156                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14157                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14158                     break;
14159                 default:
14160                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14161                     break;
14162             }
14163     }
14164     if (lead)  {
14165         if (set)
14166             PerlIO_printf(Perl_debug_log, "\n");
14167         else
14168             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14169     }
14170 }
14171 #endif
14172
14173 void
14174 Perl_regdump(pTHX_ const regexp *r)
14175 {
14176 #ifdef DEBUGGING
14177     dVAR;
14178     SV * const sv = sv_newmortal();
14179     SV *dsv= sv_newmortal();
14180     RXi_GET_DECL(r,ri);
14181     GET_RE_DEBUG_FLAGS_DECL;
14182
14183     PERL_ARGS_ASSERT_REGDUMP;
14184
14185     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14186
14187     /* Header fields of interest. */
14188     if (r->anchored_substr) {
14189         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14190             RE_SV_DUMPLEN(r->anchored_substr), 30);
14191         PerlIO_printf(Perl_debug_log,
14192                       "anchored %s%s at %"IVdf" ",
14193                       s, RE_SV_TAIL(r->anchored_substr),
14194                       (IV)r->anchored_offset);
14195     } else if (r->anchored_utf8) {
14196         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14197             RE_SV_DUMPLEN(r->anchored_utf8), 30);
14198         PerlIO_printf(Perl_debug_log,
14199                       "anchored utf8 %s%s at %"IVdf" ",
14200                       s, RE_SV_TAIL(r->anchored_utf8),
14201                       (IV)r->anchored_offset);
14202     }
14203     if (r->float_substr) {
14204         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14205             RE_SV_DUMPLEN(r->float_substr), 30);
14206         PerlIO_printf(Perl_debug_log,
14207                       "floating %s%s at %"IVdf"..%"UVuf" ",
14208                       s, RE_SV_TAIL(r->float_substr),
14209                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14210     } else if (r->float_utf8) {
14211         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14212             RE_SV_DUMPLEN(r->float_utf8), 30);
14213         PerlIO_printf(Perl_debug_log,
14214                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14215                       s, RE_SV_TAIL(r->float_utf8),
14216                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14217     }
14218     if (r->check_substr || r->check_utf8)
14219         PerlIO_printf(Perl_debug_log,
14220                       (const char *)
14221                       (r->check_substr == r->float_substr
14222                        && r->check_utf8 == r->float_utf8
14223                        ? "(checking floating" : "(checking anchored"));
14224     if (r->extflags & RXf_NOSCAN)
14225         PerlIO_printf(Perl_debug_log, " noscan");
14226     if (r->extflags & RXf_CHECK_ALL)
14227         PerlIO_printf(Perl_debug_log, " isall");
14228     if (r->check_substr || r->check_utf8)
14229         PerlIO_printf(Perl_debug_log, ") ");
14230
14231     if (ri->regstclass) {
14232         regprop(r, sv, ri->regstclass);
14233         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14234     }
14235     if (r->extflags & RXf_ANCH) {
14236         PerlIO_printf(Perl_debug_log, "anchored");
14237         if (r->extflags & RXf_ANCH_BOL)
14238             PerlIO_printf(Perl_debug_log, "(BOL)");
14239         if (r->extflags & RXf_ANCH_MBOL)
14240             PerlIO_printf(Perl_debug_log, "(MBOL)");
14241         if (r->extflags & RXf_ANCH_SBOL)
14242             PerlIO_printf(Perl_debug_log, "(SBOL)");
14243         if (r->extflags & RXf_ANCH_GPOS)
14244             PerlIO_printf(Perl_debug_log, "(GPOS)");
14245         PerlIO_putc(Perl_debug_log, ' ');
14246     }
14247     if (r->extflags & RXf_GPOS_SEEN)
14248         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14249     if (r->intflags & PREGf_SKIP)
14250         PerlIO_printf(Perl_debug_log, "plus ");
14251     if (r->intflags & PREGf_IMPLICIT)
14252         PerlIO_printf(Perl_debug_log, "implicit ");
14253     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14254     if (r->extflags & RXf_EVAL_SEEN)
14255         PerlIO_printf(Perl_debug_log, "with eval ");
14256     PerlIO_printf(Perl_debug_log, "\n");
14257     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14258 #else
14259     PERL_ARGS_ASSERT_REGDUMP;
14260     PERL_UNUSED_CONTEXT;
14261     PERL_UNUSED_ARG(r);
14262 #endif  /* DEBUGGING */
14263 }
14264
14265 /*
14266 - regprop - printable representation of opcode
14267 */
14268 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14269 STMT_START { \
14270         if (do_sep) {                           \
14271             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14272             if (flags & ANYOF_INVERT)           \
14273                 /*make sure the invert info is in each */ \
14274                 sv_catpvs(sv, "^");             \
14275             do_sep = 0;                         \
14276         }                                       \
14277 } STMT_END
14278
14279 void
14280 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14281 {
14282 #ifdef DEBUGGING
14283     dVAR;
14284     int k;
14285
14286     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14287     static const char * const anyofs[] = {
14288 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14289     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14290     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14291     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14292     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14293     || _CC_VERTSPACE != 16
14294   #error Need to adjust order of anyofs[]
14295 #endif
14296         "[\\w]",
14297         "[\\W]",
14298         "[\\d]",
14299         "[\\D]",
14300         "[:alpha:]",
14301         "[:^alpha:]",
14302         "[:lower:]",
14303         "[:^lower:]",
14304         "[:upper:]",
14305         "[:^upper:]",
14306         "[:punct:]",
14307         "[:^punct:]",
14308         "[:print:]",
14309         "[:^print:]",
14310         "[:alnum:]",
14311         "[:^alnum:]",
14312         "[:graph:]",
14313         "[:^graph:]",
14314         "[:cased:]",
14315         "[:^cased:]",
14316         "[\\s]",
14317         "[\\S]",
14318         "[:blank:]",
14319         "[:^blank:]",
14320         "[:xdigit:]",
14321         "[:^xdigit:]",
14322         "[:space:]",
14323         "[:^space:]",
14324         "[:cntrl:]",
14325         "[:^cntrl:]",
14326         "[:ascii:]",
14327         "[:^ascii:]",
14328         "[\\v]",
14329         "[\\V]"
14330     };
14331     RXi_GET_DECL(prog,progi);
14332     GET_RE_DEBUG_FLAGS_DECL;
14333
14334     PERL_ARGS_ASSERT_REGPROP;
14335
14336     sv_setpvs(sv, "");
14337
14338     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
14339         /* It would be nice to FAIL() here, but this may be called from
14340            regexec.c, and it would be hard to supply pRExC_state. */
14341         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14342     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14343
14344     k = PL_regkind[OP(o)];
14345
14346     if (k == EXACT) {
14347         sv_catpvs(sv, " ");
14348         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14349          * is a crude hack but it may be the best for now since
14350          * we have no flag "this EXACTish node was UTF-8"
14351          * --jhi */
14352         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14353                   PERL_PV_ESCAPE_UNI_DETECT |
14354                   PERL_PV_ESCAPE_NONASCII   |
14355                   PERL_PV_PRETTY_ELLIPSES   |
14356                   PERL_PV_PRETTY_LTGT       |
14357                   PERL_PV_PRETTY_NOCLEAR
14358                   );
14359     } else if (k == TRIE) {
14360         /* print the details of the trie in dumpuntil instead, as
14361          * progi->data isn't available here */
14362         const char op = OP(o);
14363         const U32 n = ARG(o);
14364         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14365                (reg_ac_data *)progi->data->data[n] :
14366                NULL;
14367         const reg_trie_data * const trie
14368             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14369
14370         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14371         DEBUG_TRIE_COMPILE_r(
14372             Perl_sv_catpvf(aTHX_ sv,
14373                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14374                 (UV)trie->startstate,
14375                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14376                 (UV)trie->wordcount,
14377                 (UV)trie->minlen,
14378                 (UV)trie->maxlen,
14379                 (UV)TRIE_CHARCOUNT(trie),
14380                 (UV)trie->uniquecharcount
14381             )
14382         );
14383         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14384             int i;
14385             int rangestart = -1;
14386             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14387             sv_catpvs(sv, "[");
14388             for (i = 0; i <= 256; i++) {
14389                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
14390                     if (rangestart == -1)
14391                         rangestart = i;
14392                 } else if (rangestart != -1) {
14393                     if (i <= rangestart + 3)
14394                         for (; rangestart < i; rangestart++)
14395                             put_byte(sv, rangestart);
14396                     else {
14397                         put_byte(sv, rangestart);
14398                         sv_catpvs(sv, "-");
14399                         put_byte(sv, i - 1);
14400                     }
14401                     rangestart = -1;
14402                 }
14403             }
14404             sv_catpvs(sv, "]");
14405         }
14406
14407     } else if (k == CURLY) {
14408         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14409             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14410         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14411     }
14412     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
14413         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14414     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14415         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
14416         if ( RXp_PAREN_NAMES(prog) ) {
14417             if ( k != REF || (OP(o) < NREF)) {
14418                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14419                 SV **name= av_fetch(list, ARG(o), 0 );
14420                 if (name)
14421                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14422             }
14423             else {
14424                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14425                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14426                 I32 *nums=(I32*)SvPVX(sv_dat);
14427                 SV **name= av_fetch(list, nums[0], 0 );
14428                 I32 n;
14429                 if (name) {
14430                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
14431                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14432                                     (n ? "," : ""), (IV)nums[n]);
14433                     }
14434                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14435                 }
14436             }
14437         }
14438     } else if (k == GOSUB)
14439         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14440     else if (k == VERB) {
14441         if (!o->flags)
14442             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14443                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14444     } else if (k == LOGICAL)
14445         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
14446     else if (k == ANYOF) {
14447         int i, rangestart = -1;
14448         const U8 flags = ANYOF_FLAGS(o);
14449         int do_sep = 0;
14450
14451
14452         if (flags & ANYOF_LOCALE)
14453             sv_catpvs(sv, "{loc}");
14454         if (flags & ANYOF_LOC_FOLD)
14455             sv_catpvs(sv, "{i}");
14456         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14457         if (flags & ANYOF_INVERT)
14458             sv_catpvs(sv, "^");
14459
14460         /* output what the standard cp 0-255 bitmap matches */
14461         for (i = 0; i <= 256; i++) {
14462             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14463                 if (rangestart == -1)
14464                     rangestart = i;
14465             } else if (rangestart != -1) {
14466                 if (i <= rangestart + 3)
14467                     for (; rangestart < i; rangestart++)
14468                         put_byte(sv, rangestart);
14469                 else {
14470                     put_byte(sv, rangestart);
14471                     sv_catpvs(sv, "-");
14472                     put_byte(sv, i - 1);
14473                 }
14474                 do_sep = 1;
14475                 rangestart = -1;
14476             }
14477         }
14478
14479         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14480         /* output any special charclass tests (used entirely under use locale) */
14481         if (ANYOF_CLASS_TEST_ANY_SET(o))
14482             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14483                 if (ANYOF_CLASS_TEST(o,i)) {
14484                     sv_catpv(sv, anyofs[i]);
14485                     do_sep = 1;
14486                 }
14487
14488         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14489
14490         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14491             sv_catpvs(sv, "{non-utf8-latin1-all}");
14492         }
14493
14494         /* output information about the unicode matching */
14495         if (flags & ANYOF_UNICODE_ALL)
14496             sv_catpvs(sv, "{unicode_all}");
14497         else if (ANYOF_NONBITMAP(o))
14498             sv_catpvs(sv, "{unicode}");
14499         if (flags & ANYOF_NONBITMAP_NON_UTF8)
14500             sv_catpvs(sv, "{outside bitmap}");
14501
14502         if (ANYOF_NONBITMAP(o)) {
14503             SV *lv; /* Set if there is something outside the bit map */
14504             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14505             bool byte_output = FALSE;   /* If something in the bitmap has been
14506                                            output */
14507
14508             if (lv && lv != &PL_sv_undef) {
14509                 if (sw) {
14510                     U8 s[UTF8_MAXBYTES_CASE+1];
14511
14512                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14513                         uvchr_to_utf8(s, i);
14514
14515                         if (i < 256
14516                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14517                                                                things already
14518                                                                output as part
14519                                                                of the bitmap */
14520                             && swash_fetch(sw, s, TRUE))
14521                         {
14522                             if (rangestart == -1)
14523                                 rangestart = i;
14524                         } else if (rangestart != -1) {
14525                             byte_output = TRUE;
14526                             if (i <= rangestart + 3)
14527                                 for (; rangestart < i; rangestart++) {
14528                                     put_byte(sv, rangestart);
14529                                 }
14530                             else {
14531                                 put_byte(sv, rangestart);
14532                                 sv_catpvs(sv, "-");
14533                                 put_byte(sv, i-1);
14534                             }
14535                             rangestart = -1;
14536                         }
14537                     }
14538                 }
14539
14540                 {
14541                     char *s = savesvpv(lv);
14542                     char * const origs = s;
14543
14544                     while (*s && *s != '\n')
14545                         s++;
14546
14547                     if (*s == '\n') {
14548                         const char * const t = ++s;
14549
14550                         if (byte_output) {
14551                             sv_catpvs(sv, " ");
14552                         }
14553
14554                         while (*s) {
14555                             if (*s == '\n') {
14556
14557                                 /* Truncate very long output */
14558                                 if (s - origs > 256) {
14559                                     Perl_sv_catpvf(aTHX_ sv,
14560                                                    "%.*s...",
14561                                                    (int) (s - origs - 1),
14562                                                    t);
14563                                     goto out_dump;
14564                                 }
14565                                 *s = ' ';
14566                             }
14567                             else if (*s == '\t') {
14568                                 *s = '-';
14569                             }
14570                             s++;
14571                         }
14572                         if (s[-1] == ' ')
14573                             s[-1] = 0;
14574
14575                         sv_catpv(sv, t);
14576                     }
14577
14578                 out_dump:
14579
14580                     Safefree(origs);
14581                 }
14582                 SvREFCNT_dec_NN(lv);
14583             }
14584         }
14585
14586         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14587     }
14588     else if (k == POSIXD || k == NPOSIXD) {
14589         U8 index = FLAGS(o) * 2;
14590         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14591             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14592         }
14593         else {
14594             sv_catpv(sv, anyofs[index]);
14595         }
14596     }
14597     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14598         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14599 #else
14600     PERL_UNUSED_CONTEXT;
14601     PERL_UNUSED_ARG(sv);
14602     PERL_UNUSED_ARG(o);
14603     PERL_UNUSED_ARG(prog);
14604 #endif  /* DEBUGGING */
14605 }
14606
14607 SV *
14608 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14609 {                               /* Assume that RE_INTUIT is set */
14610     dVAR;
14611     struct regexp *const prog = ReANY(r);
14612     GET_RE_DEBUG_FLAGS_DECL;
14613
14614     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14615     PERL_UNUSED_CONTEXT;
14616
14617     DEBUG_COMPILE_r(
14618         {
14619             const char * const s = SvPV_nolen_const(prog->check_substr
14620                       ? prog->check_substr : prog->check_utf8);
14621
14622             if (!PL_colorset) reginitcolors();
14623             PerlIO_printf(Perl_debug_log,
14624                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14625                       PL_colors[4],
14626                       prog->check_substr ? "" : "utf8 ",
14627                       PL_colors[5],PL_colors[0],
14628                       s,
14629                       PL_colors[1],
14630                       (strlen(s) > 60 ? "..." : ""));
14631         } );
14632
14633     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14634 }
14635
14636 /*
14637    pregfree()
14638
14639    handles refcounting and freeing the perl core regexp structure. When
14640    it is necessary to actually free the structure the first thing it
14641    does is call the 'free' method of the regexp_engine associated to
14642    the regexp, allowing the handling of the void *pprivate; member
14643    first. (This routine is not overridable by extensions, which is why
14644    the extensions free is called first.)
14645
14646    See regdupe and regdupe_internal if you change anything here.
14647 */
14648 #ifndef PERL_IN_XSUB_RE
14649 void
14650 Perl_pregfree(pTHX_ REGEXP *r)
14651 {
14652     SvREFCNT_dec(r);
14653 }
14654
14655 void
14656 Perl_pregfree2(pTHX_ REGEXP *rx)
14657 {
14658     dVAR;
14659     struct regexp *const r = ReANY(rx);
14660     GET_RE_DEBUG_FLAGS_DECL;
14661
14662     PERL_ARGS_ASSERT_PREGFREE2;
14663
14664     if (r->mother_re) {
14665         ReREFCNT_dec(r->mother_re);
14666     } else {
14667         CALLREGFREE_PVT(rx); /* free the private data */
14668         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14669         Safefree(r->xpv_len_u.xpvlenu_pv);
14670     }
14671     if (r->substrs) {
14672         SvREFCNT_dec(r->anchored_substr);
14673         SvREFCNT_dec(r->anchored_utf8);
14674         SvREFCNT_dec(r->float_substr);
14675         SvREFCNT_dec(r->float_utf8);
14676         Safefree(r->substrs);
14677     }
14678     RX_MATCH_COPY_FREE(rx);
14679 #ifdef PERL_ANY_COW
14680     SvREFCNT_dec(r->saved_copy);
14681 #endif
14682     Safefree(r->offs);
14683     SvREFCNT_dec(r->qr_anoncv);
14684     rx->sv_u.svu_rx = 0;
14685 }
14686
14687 /*  reg_temp_copy()
14688
14689     This is a hacky workaround to the structural issue of match results
14690     being stored in the regexp structure which is in turn stored in
14691     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14692     could be PL_curpm in multiple contexts, and could require multiple
14693     result sets being associated with the pattern simultaneously, such
14694     as when doing a recursive match with (??{$qr})
14695
14696     The solution is to make a lightweight copy of the regexp structure
14697     when a qr// is returned from the code executed by (??{$qr}) this
14698     lightweight copy doesn't actually own any of its data except for
14699     the starp/end and the actual regexp structure itself.
14700
14701 */
14702
14703
14704 REGEXP *
14705 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14706 {
14707     struct regexp *ret;
14708     struct regexp *const r = ReANY(rx);
14709     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14710
14711     PERL_ARGS_ASSERT_REG_TEMP_COPY;
14712
14713     if (!ret_x)
14714         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14715     else {
14716         SvOK_off((SV *)ret_x);
14717         if (islv) {
14718             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14719                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
14720                made both spots point to the same regexp body.) */
14721             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14722             assert(!SvPVX(ret_x));
14723             ret_x->sv_u.svu_rx = temp->sv_any;
14724             temp->sv_any = NULL;
14725             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14726             SvREFCNT_dec_NN(temp);
14727             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14728                ing below will not set it. */
14729             SvCUR_set(ret_x, SvCUR(rx));
14730         }
14731     }
14732     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14733        sv_force_normal(sv) is called.  */
14734     SvFAKE_on(ret_x);
14735     ret = ReANY(ret_x);
14736
14737     SvFLAGS(ret_x) |= SvUTF8(rx);
14738     /* We share the same string buffer as the original regexp, on which we
14739        hold a reference count, incremented when mother_re is set below.
14740        The string pointer is copied here, being part of the regexp struct.
14741      */
14742     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14743            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14744     if (r->offs) {
14745         const I32 npar = r->nparens+1;
14746         Newx(ret->offs, npar, regexp_paren_pair);
14747         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14748     }
14749     if (r->substrs) {
14750         Newx(ret->substrs, 1, struct reg_substr_data);
14751         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14752
14753         SvREFCNT_inc_void(ret->anchored_substr);
14754         SvREFCNT_inc_void(ret->anchored_utf8);
14755         SvREFCNT_inc_void(ret->float_substr);
14756         SvREFCNT_inc_void(ret->float_utf8);
14757
14758         /* check_substr and check_utf8, if non-NULL, point to either their
14759            anchored or float namesakes, and don't hold a second reference.  */
14760     }
14761     RX_MATCH_COPIED_off(ret_x);
14762 #ifdef PERL_ANY_COW
14763     ret->saved_copy = NULL;
14764 #endif
14765     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14766     SvREFCNT_inc_void(ret->qr_anoncv);
14767
14768     return ret_x;
14769 }
14770 #endif
14771
14772 /* regfree_internal()
14773
14774    Free the private data in a regexp. This is overloadable by
14775    extensions. Perl takes care of the regexp structure in pregfree(),
14776    this covers the *pprivate pointer which technically perl doesn't
14777    know about, however of course we have to handle the
14778    regexp_internal structure when no extension is in use.
14779
14780    Note this is called before freeing anything in the regexp
14781    structure.
14782  */
14783
14784 void
14785 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14786 {
14787     dVAR;
14788     struct regexp *const r = ReANY(rx);
14789     RXi_GET_DECL(r,ri);
14790     GET_RE_DEBUG_FLAGS_DECL;
14791
14792     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14793
14794     DEBUG_COMPILE_r({
14795         if (!PL_colorset)
14796             reginitcolors();
14797         {
14798             SV *dsv= sv_newmortal();
14799             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14800                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14801             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14802                 PL_colors[4],PL_colors[5],s);
14803         }
14804     });
14805 #ifdef RE_TRACK_PATTERN_OFFSETS
14806     if (ri->u.offsets)
14807         Safefree(ri->u.offsets);             /* 20010421 MJD */
14808 #endif
14809     if (ri->code_blocks) {
14810         int n;
14811         for (n = 0; n < ri->num_code_blocks; n++)
14812             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14813         Safefree(ri->code_blocks);
14814     }
14815
14816     if (ri->data) {
14817         int n = ri->data->count;
14818
14819         while (--n >= 0) {
14820           /* If you add a ->what type here, update the comment in regcomp.h */
14821             switch (ri->data->what[n]) {
14822             case 'a':
14823             case 'r':
14824             case 's':
14825             case 'S':
14826             case 'u':
14827                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14828                 break;
14829             case 'f':
14830                 Safefree(ri->data->data[n]);
14831                 break;
14832             case 'l':
14833             case 'L':
14834                 break;
14835             case 'T':
14836                 { /* Aho Corasick add-on structure for a trie node.
14837                      Used in stclass optimization only */
14838                     U32 refcount;
14839                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14840                     OP_REFCNT_LOCK;
14841                     refcount = --aho->refcount;
14842                     OP_REFCNT_UNLOCK;
14843                     if ( !refcount ) {
14844                         PerlMemShared_free(aho->states);
14845                         PerlMemShared_free(aho->fail);
14846                          /* do this last!!!! */
14847                         PerlMemShared_free(ri->data->data[n]);
14848                         PerlMemShared_free(ri->regstclass);
14849                     }
14850                 }
14851                 break;
14852             case 't':
14853                 {
14854                     /* trie structure. */
14855                     U32 refcount;
14856                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14857                     OP_REFCNT_LOCK;
14858                     refcount = --trie->refcount;
14859                     OP_REFCNT_UNLOCK;
14860                     if ( !refcount ) {
14861                         PerlMemShared_free(trie->charmap);
14862                         PerlMemShared_free(trie->states);
14863                         PerlMemShared_free(trie->trans);
14864                         if (trie->bitmap)
14865                             PerlMemShared_free(trie->bitmap);
14866                         if (trie->jump)
14867                             PerlMemShared_free(trie->jump);
14868                         PerlMemShared_free(trie->wordinfo);
14869                         /* do this last!!!! */
14870                         PerlMemShared_free(ri->data->data[n]);
14871                     }
14872                 }
14873                 break;
14874             default:
14875                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14876             }
14877         }
14878         Safefree(ri->data->what);
14879         Safefree(ri->data);
14880     }
14881
14882     Safefree(ri);
14883 }
14884
14885 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14886 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14887 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14888
14889 /*
14890    re_dup - duplicate a regexp.
14891
14892    This routine is expected to clone a given regexp structure. It is only
14893    compiled under USE_ITHREADS.
14894
14895    After all of the core data stored in struct regexp is duplicated
14896    the regexp_engine.dupe method is used to copy any private data
14897    stored in the *pprivate pointer. This allows extensions to handle
14898    any duplication it needs to do.
14899
14900    See pregfree() and regfree_internal() if you change anything here.
14901 */
14902 #if defined(USE_ITHREADS)
14903 #ifndef PERL_IN_XSUB_RE
14904 void
14905 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14906 {
14907     dVAR;
14908     I32 npar;
14909     const struct regexp *r = ReANY(sstr);
14910     struct regexp *ret = ReANY(dstr);
14911
14912     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14913
14914     npar = r->nparens+1;
14915     Newx(ret->offs, npar, regexp_paren_pair);
14916     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14917     if(ret->swap) {
14918         /* no need to copy these */
14919         Newx(ret->swap, npar, regexp_paren_pair);
14920     }
14921
14922     if (ret->substrs) {
14923         /* Do it this way to avoid reading from *r after the StructCopy().
14924            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14925            cache, it doesn't matter.  */
14926         const bool anchored = r->check_substr
14927             ? r->check_substr == r->anchored_substr
14928             : r->check_utf8 == r->anchored_utf8;
14929         Newx(ret->substrs, 1, struct reg_substr_data);
14930         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14931
14932         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14933         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14934         ret->float_substr = sv_dup_inc(ret->float_substr, param);
14935         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14936
14937         /* check_substr and check_utf8, if non-NULL, point to either their
14938            anchored or float namesakes, and don't hold a second reference.  */
14939
14940         if (ret->check_substr) {
14941             if (anchored) {
14942                 assert(r->check_utf8 == r->anchored_utf8);
14943                 ret->check_substr = ret->anchored_substr;
14944                 ret->check_utf8 = ret->anchored_utf8;
14945             } else {
14946                 assert(r->check_substr == r->float_substr);
14947                 assert(r->check_utf8 == r->float_utf8);
14948                 ret->check_substr = ret->float_substr;
14949                 ret->check_utf8 = ret->float_utf8;
14950             }
14951         } else if (ret->check_utf8) {
14952             if (anchored) {
14953                 ret->check_utf8 = ret->anchored_utf8;
14954             } else {
14955                 ret->check_utf8 = ret->float_utf8;
14956             }
14957         }
14958     }
14959
14960     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14961     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14962
14963     if (ret->pprivate)
14964         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14965
14966     if (RX_MATCH_COPIED(dstr))
14967         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
14968     else
14969         ret->subbeg = NULL;
14970 #ifdef PERL_ANY_COW
14971     ret->saved_copy = NULL;
14972 #endif
14973
14974     /* Whether mother_re be set or no, we need to copy the string.  We
14975        cannot refrain from copying it when the storage points directly to
14976        our mother regexp, because that's
14977                1: a buffer in a different thread
14978                2: something we no longer hold a reference on
14979                so we need to copy it locally.  */
14980     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14981     ret->mother_re   = NULL;
14982     ret->gofs = 0;
14983 }
14984 #endif /* PERL_IN_XSUB_RE */
14985
14986 /*
14987    regdupe_internal()
14988
14989    This is the internal complement to regdupe() which is used to copy
14990    the structure pointed to by the *pprivate pointer in the regexp.
14991    This is the core version of the extension overridable cloning hook.
14992    The regexp structure being duplicated will be copied by perl prior
14993    to this and will be provided as the regexp *r argument, however
14994    with the /old/ structures pprivate pointer value. Thus this routine
14995    may override any copying normally done by perl.
14996
14997    It returns a pointer to the new regexp_internal structure.
14998 */
14999
15000 void *
15001 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15002 {
15003     dVAR;
15004     struct regexp *const r = ReANY(rx);
15005     regexp_internal *reti;
15006     int len;
15007     RXi_GET_DECL(r,ri);
15008
15009     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15010
15011     len = ProgLen(ri);
15012
15013     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15014     Copy(ri->program, reti->program, len+1, regnode);
15015
15016     reti->num_code_blocks = ri->num_code_blocks;
15017     if (ri->code_blocks) {
15018         int n;
15019         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15020                 struct reg_code_block);
15021         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15022                 struct reg_code_block);
15023         for (n = 0; n < ri->num_code_blocks; n++)
15024              reti->code_blocks[n].src_regex = (REGEXP*)
15025                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15026     }
15027     else
15028         reti->code_blocks = NULL;
15029
15030     reti->regstclass = NULL;
15031
15032     if (ri->data) {
15033         struct reg_data *d;
15034         const int count = ri->data->count;
15035         int i;
15036
15037         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15038                 char, struct reg_data);
15039         Newx(d->what, count, U8);
15040
15041         d->count = count;
15042         for (i = 0; i < count; i++) {
15043             d->what[i] = ri->data->what[i];
15044             switch (d->what[i]) {
15045                 /* see also regcomp.h and regfree_internal() */
15046             case 'a': /* actually an AV, but the dup function is identical.  */
15047             case 'r':
15048             case 's':
15049             case 'S':
15050             case 'u': /* actually an HV, but the dup function is identical.  */
15051                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15052                 break;
15053             case 'f':
15054                 /* This is cheating. */
15055                 Newx(d->data[i], 1, struct regnode_charclass_class);
15056                 StructCopy(ri->data->data[i], d->data[i],
15057                             struct regnode_charclass_class);
15058                 reti->regstclass = (regnode*)d->data[i];
15059                 break;
15060             case 'T':
15061                 /* Trie stclasses are readonly and can thus be shared
15062                  * without duplication. We free the stclass in pregfree
15063                  * when the corresponding reg_ac_data struct is freed.
15064                  */
15065                 reti->regstclass= ri->regstclass;
15066                 /* Fall through */
15067             case 't':
15068                 OP_REFCNT_LOCK;
15069                 ((reg_trie_data*)ri->data->data[i])->refcount++;
15070                 OP_REFCNT_UNLOCK;
15071                 /* Fall through */
15072             case 'l':
15073             case 'L':
15074                 d->data[i] = ri->data->data[i];
15075                 break;
15076             default:
15077                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15078             }
15079         }
15080
15081         reti->data = d;
15082     }
15083     else
15084         reti->data = NULL;
15085
15086     reti->name_list_idx = ri->name_list_idx;
15087
15088 #ifdef RE_TRACK_PATTERN_OFFSETS
15089     if (ri->u.offsets) {
15090         Newx(reti->u.offsets, 2*len+1, U32);
15091         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15092     }
15093 #else
15094     SetProgLen(reti,len);
15095 #endif
15096
15097     return (void*)reti;
15098 }
15099
15100 #endif    /* USE_ITHREADS */
15101
15102 #ifndef PERL_IN_XSUB_RE
15103
15104 /*
15105  - regnext - dig the "next" pointer out of a node
15106  */
15107 regnode *
15108 Perl_regnext(pTHX_ regnode *p)
15109 {
15110     dVAR;
15111     I32 offset;
15112
15113     if (!p)
15114         return(NULL);
15115
15116     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
15117         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15118     }
15119
15120     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15121     if (offset == 0)
15122         return(NULL);
15123
15124     return(p+offset);
15125 }
15126 #endif
15127
15128 STATIC void
15129 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15130 {
15131     va_list args;
15132     STRLEN l1 = strlen(pat1);
15133     STRLEN l2 = strlen(pat2);
15134     char buf[512];
15135     SV *msv;
15136     const char *message;
15137
15138     PERL_ARGS_ASSERT_RE_CROAK2;
15139
15140     if (l1 > 510)
15141         l1 = 510;
15142     if (l1 + l2 > 510)
15143         l2 = 510 - l1;
15144     Copy(pat1, buf, l1 , char);
15145     Copy(pat2, buf + l1, l2 , char);
15146     buf[l1 + l2] = '\n';
15147     buf[l1 + l2 + 1] = '\0';
15148 #ifdef I_STDARG
15149     /* ANSI variant takes additional second argument */
15150     va_start(args, pat2);
15151 #else
15152     va_start(args);
15153 #endif
15154     msv = vmess(buf, &args);
15155     va_end(args);
15156     message = SvPV_const(msv,l1);
15157     if (l1 > 512)
15158         l1 = 512;
15159     Copy(message, buf, l1 , char);
15160     buf[l1-1] = '\0';                   /* Overwrite \n */
15161     Perl_croak(aTHX_ "%s", buf);
15162 }
15163
15164 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15165
15166 #ifndef PERL_IN_XSUB_RE
15167 void
15168 Perl_save_re_context(pTHX)
15169 {
15170     dVAR;
15171
15172     struct re_save_state *state;
15173
15174     SAVEVPTR(PL_curcop);
15175     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15176
15177     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15178     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15179     SSPUSHUV(SAVEt_RE_STATE);
15180
15181     Copy(&PL_reg_state, state, 1, struct re_save_state);
15182
15183     PL_reg_oldsaved = NULL;
15184     PL_reg_oldsavedlen = 0;
15185     PL_reg_oldsavedoffset = 0;
15186     PL_reg_oldsavedcoffset = 0;
15187     PL_reg_maxiter = 0;
15188     PL_reg_leftiter = 0;
15189     PL_reg_poscache = NULL;
15190     PL_reg_poscache_size = 0;
15191 #ifdef PERL_ANY_COW
15192     PL_nrs = NULL;
15193 #endif
15194
15195     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15196     if (PL_curpm) {
15197         const REGEXP * const rx = PM_GETRE(PL_curpm);
15198         if (rx) {
15199             U32 i;
15200             for (i = 1; i <= RX_NPARENS(rx); i++) {
15201                 char digits[TYPE_CHARS(long)];
15202                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15203                 GV *const *const gvp
15204                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15205
15206                 if (gvp) {
15207                     GV * const gv = *gvp;
15208                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15209                         save_scalar(gv);
15210                 }
15211             }
15212         }
15213     }
15214 }
15215 #endif
15216
15217 #ifdef DEBUGGING
15218
15219 STATIC void
15220 S_put_byte(pTHX_ SV *sv, int c)
15221 {
15222     PERL_ARGS_ASSERT_PUT_BYTE;
15223
15224     /* Our definition of isPRINT() ignores locales, so only bytes that are
15225        not part of UTF-8 are considered printable. I assume that the same
15226        holds for UTF-EBCDIC.
15227        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15228        which Wikipedia says:
15229
15230        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15231        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15232        identical, to the ASCII delete (DEL) or rubout control character.
15233        ) So the old condition can be simplified to !isPRINT(c)  */
15234     if (!isPRINT(c)) {
15235         if (c < 256) {
15236             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15237         }
15238         else {
15239             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15240         }
15241     }
15242     else {
15243         const char string = c;
15244         if (c == '-' || c == ']' || c == '\\' || c == '^')
15245             sv_catpvs(sv, "\\");
15246         sv_catpvn(sv, &string, 1);
15247     }
15248 }
15249
15250
15251 #define CLEAR_OPTSTART \
15252     if (optstart) STMT_START { \
15253             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15254             optstart=NULL; \
15255     } STMT_END
15256
15257 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15258
15259 STATIC const regnode *
15260 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15261             const regnode *last, const regnode *plast,
15262             SV* sv, I32 indent, U32 depth)
15263 {
15264     dVAR;
15265     U8 op = PSEUDO;     /* Arbitrary non-END op. */
15266     const regnode *next;
15267     const regnode *optstart= NULL;
15268
15269     RXi_GET_DECL(r,ri);
15270     GET_RE_DEBUG_FLAGS_DECL;
15271
15272     PERL_ARGS_ASSERT_DUMPUNTIL;
15273
15274 #ifdef DEBUG_DUMPUNTIL
15275     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15276         last ? last-start : 0,plast ? plast-start : 0);
15277 #endif
15278
15279     if (plast && plast < last)
15280         last= plast;
15281
15282     while (PL_regkind[op] != END && (!last || node < last)) {
15283         /* While that wasn't END last time... */
15284         NODE_ALIGN(node);
15285         op = OP(node);
15286         if (op == CLOSE || op == WHILEM)
15287             indent--;
15288         next = regnext((regnode *)node);
15289
15290         /* Where, what. */
15291         if (OP(node) == OPTIMIZED) {
15292             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15293                 optstart = node;
15294             else
15295                 goto after_print;
15296         } else
15297             CLEAR_OPTSTART;
15298
15299         regprop(r, sv, node);
15300         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15301                       (int)(2*indent + 1), "", SvPVX_const(sv));
15302
15303         if (OP(node) != OPTIMIZED) {
15304             if (next == NULL)           /* Next ptr. */
15305                 PerlIO_printf(Perl_debug_log, " (0)");
15306             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15307                 PerlIO_printf(Perl_debug_log, " (FAIL)");
15308             else
15309                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15310             (void)PerlIO_putc(Perl_debug_log, '\n');
15311         }
15312
15313       after_print:
15314         if (PL_regkind[(U8)op] == BRANCHJ) {
15315             assert(next);
15316             {
15317                 const regnode *nnode = (OP(next) == LONGJMP
15318                                        ? regnext((regnode *)next)
15319                                        : next);
15320                 if (last && nnode > last)
15321                     nnode = last;
15322                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15323             }
15324         }
15325         else if (PL_regkind[(U8)op] == BRANCH) {
15326             assert(next);
15327             DUMPUNTIL(NEXTOPER(node), next);
15328         }
15329         else if ( PL_regkind[(U8)op]  == TRIE ) {
15330             const regnode *this_trie = node;
15331             const char op = OP(node);
15332             const U32 n = ARG(node);
15333             const reg_ac_data * const ac = op>=AHOCORASICK ?
15334                (reg_ac_data *)ri->data->data[n] :
15335                NULL;
15336             const reg_trie_data * const trie =
15337                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15338 #ifdef DEBUGGING
15339             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15340 #endif
15341             const regnode *nextbranch= NULL;
15342             I32 word_idx;
15343             sv_setpvs(sv, "");
15344             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15345                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15346
15347                 PerlIO_printf(Perl_debug_log, "%*s%s ",
15348                    (int)(2*(indent+3)), "",
15349                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15350                             PL_colors[0], PL_colors[1],
15351                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15352                             PERL_PV_PRETTY_ELLIPSES    |
15353                             PERL_PV_PRETTY_LTGT
15354                             )
15355                             : "???"
15356                 );
15357                 if (trie->jump) {
15358                     U16 dist= trie->jump[word_idx+1];
15359                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15360                                   (UV)((dist ? this_trie + dist : next) - start));
15361                     if (dist) {
15362                         if (!nextbranch)
15363                             nextbranch= this_trie + trie->jump[0];
15364                         DUMPUNTIL(this_trie + dist, nextbranch);
15365                     }
15366                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15367                         nextbranch= regnext((regnode *)nextbranch);
15368                 } else {
15369                     PerlIO_printf(Perl_debug_log, "\n");
15370                 }
15371             }
15372             if (last && next > last)
15373                 node= last;
15374             else
15375                 node= next;
15376         }
15377         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15378             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15379                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15380         }
15381         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15382             assert(next);
15383             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15384         }
15385         else if ( op == PLUS || op == STAR) {
15386             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15387         }
15388         else if (PL_regkind[(U8)op] == ANYOF) {
15389             /* arglen 1 + class block */
15390             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15391                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15392             node = NEXTOPER(node);
15393         }
15394         else if (PL_regkind[(U8)op] == EXACT) {
15395             /* Literal string, where present. */
15396             node += NODE_SZ_STR(node) - 1;
15397             node = NEXTOPER(node);
15398         }
15399         else {
15400             node = NEXTOPER(node);
15401             node += regarglen[(U8)op];
15402         }
15403         if (op == CURLYX || op == OPEN)
15404             indent++;
15405     }
15406     CLEAR_OPTSTART;
15407 #ifdef DEBUG_DUMPUNTIL
15408     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15409 #endif
15410     return node;
15411 }
15412
15413 #endif  /* DEBUGGING */
15414
15415 /*
15416  * Local variables:
15417  * c-indentation-style: bsd
15418  * c-basic-offset: 4
15419  * indent-tabs-mode: nil
15420  * End:
15421  *
15422  * ex: set ts=8 sts=4 sw=4 et:
15423  */