regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC  static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117     U32         flags;                  /* RXf_* are we folding, multilining? */
 118     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 119     char        *precomp;               /* uncompiled string. */
 120     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 121     regexp      *rx;                    /* perl core regexp structure */
 122     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 123     char        *start;                 /* Start of input for compile */
 124     char        *end;                   /* End of input for compile */
 125     char        *parse;                 /* Input-scan pointer. */
 126     I32         whilem_seen;            /* number of WHILEM in this expr */
 127     regnode     *emit_start;            /* Start of emitted-code area */
 128     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 129     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 130     I32         naughty;                /* How bad is this pattern? */
 131     I32         sawback;                /* Did we see \1, ...? */
 132     U32         seen;
 133     I32         size;                   /* Code size. */
 134     I32         npar;                   /* Capture buffer count, (OPEN). */
 135     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 136     I32         nestroot;               /* root parens we are in - used by accept */
 137     I32         extralen;
 138     I32         seen_zerolen;
 139     regnode     **open_parens;          /* pointers to open parens */
 140     regnode     **close_parens;         /* pointers to close parens */
 141     regnode     *opend;                 /* END node in program */
 142     I32         utf8;           /* whether the pattern is utf8 or not */
 143     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 144                                 /* XXX use this for future optimisation of case
 145                                  * where pattern must be upgraded to utf8. */
 146     I32         uni_semantics;  /* If a d charset modifier should use unicode
 147                                    rules, even if the pattern is not in
 148                                    utf8 */
 149     HV          *paren_names;           /* Paren names */
 150
 151     regnode     **recurse;              /* Recurse regops */
 152     I32         recurse_count;          /* Number of recurse regops */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         override_recoding;
 156     I32         in_multi_char_class;
 157     struct reg_code_block *code_blocks; /* positions of literal (?{})
 158                                             within pattern */
 159     int         num_code_blocks;        /* size of code_blocks[] */
 160     int         code_index;             /* next code_blocks[] slot */
 161 #if ADD_TO_REGEXEC
 162     char        *starttry;              /* -Dr: where regtry was called. */
 163 #define RExC_starttry   (pRExC_state->starttry)
 164 #endif
 165     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 166 #ifdef DEBUGGING
 167     const char  *lastparse;
 168     I32         lastnum;
 169     AV          *paren_name_list;       /* idx -> name */
 170 #define RExC_lastparse  (pRExC_state->lastparse)
 171 #define RExC_lastnum    (pRExC_state->lastnum)
 172 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 173 #endif
 174 } RExC_state_t;
 175
 176 #define RExC_flags      (pRExC_state->flags)
 177 #define RExC_pm_flags   (pRExC_state->pm_flags)
 178 #define RExC_precomp    (pRExC_state->precomp)
 179 #define RExC_rx_sv      (pRExC_state->rx_sv)
 180 #define RExC_rx         (pRExC_state->rx)
 181 #define RExC_rxi        (pRExC_state->rxi)
 182 #define RExC_start      (pRExC_state->start)
 183 #define RExC_end        (pRExC_state->end)
 184 #define RExC_parse      (pRExC_state->parse)
 185 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 186 #ifdef RE_TRACK_PATTERN_OFFSETS
 187 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 188 #endif
 189 #define RExC_emit       (pRExC_state->emit)
 190 #define RExC_emit_start (pRExC_state->emit_start)
 191 #define RExC_emit_bound (pRExC_state->emit_bound)
 192 #define RExC_naughty    (pRExC_state->naughty)
 193 #define RExC_sawback    (pRExC_state->sawback)
 194 #define RExC_seen       (pRExC_state->seen)
 195 #define RExC_size       (pRExC_state->size)
 196 #define RExC_npar       (pRExC_state->npar)
 197 #define RExC_nestroot   (pRExC_state->nestroot)
 198 #define RExC_extralen   (pRExC_state->extralen)
 199 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 200 #define RExC_utf8       (pRExC_state->utf8)
 201 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 202 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 203 #define RExC_open_parens        (pRExC_state->open_parens)
 204 #define RExC_close_parens       (pRExC_state->close_parens)
 205 #define RExC_opend      (pRExC_state->opend)
 206 #define RExC_paren_names        (pRExC_state->paren_names)
 207 #define RExC_recurse    (pRExC_state->recurse)
 208 #define RExC_recurse_count      (pRExC_state->recurse_count)
 209 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 210 #define RExC_contains_locale    (pRExC_state->contains_locale)
 211 #define RExC_override_recoding (pRExC_state->override_recoding)
 212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 213
 214
 215 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 216 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 217         ((*s) == '{' && regcurly(s, FALSE)))
 218
 219 #ifdef SPSTART
 220 #undef SPSTART          /* dratted cpp namespace... */
 221 #endif
 222 /*
 223  * Flags to be passed up and down.
 224  */
 225 #define WORST           0       /* Worst case. */
 226 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 227
 228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 229  * character.  (There needs to be a case: in the switch statement in regexec.c
 230  * for any node marked SIMPLE.)  Note that this is not the same thing as
 231  * REGNODE_SIMPLE */
 232 #define SIMPLE          0x02
 233 #define SPSTART         0x04    /* Starts with * or + */
 234 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 235 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 236 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 237
 238 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 239
 240 /* whether trie related optimizations are enabled */
 241 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 242 #define TRIE_STUDY_OPT
 243 #define FULL_TRIE_STUDY
 244 #define TRIE_STCLASS
 245 #endif
 246
 247
 248
 249 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 250 #define PBITVAL(paren) (1 << ((paren) & 7))
 251 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 252 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 253 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 254
 255 /* If not already in utf8, do a longjmp back to the beginning */
 256 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 257 #define REQUIRE_UTF8    STMT_START {                                       \
 258                                      if (!UTF) {                           \
 259                                          *flagp = RESTART_UTF8;            \
 260                                          return NULL;                      \
 261                                      }                                     \
 262                         } STMT_END
 263
 264 /* This converts the named class defined in regcomp.h to its equivalent class
 265  * number defined in handy.h. */
 266 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 267 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 268
 269 /* About scan_data_t.
 270
 271   During optimisation we recurse through the regexp program performing
 272   various inplace (keyhole style) optimisations. In addition study_chunk
 273   and scan_commit populate this data structure with information about
 274   what strings MUST appear in the pattern. We look for the longest
 275   string that must appear at a fixed location, and we look for the
 276   longest string that may appear at a floating location. So for instance
 277   in the pattern:
 278
 279     /FOO[xX]A.*B[xX]BAR/
 280
 281   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 282   strings (because they follow a .* construct). study_chunk will identify
 283   both FOO and BAR as being the longest fixed and floating strings respectively.
 284
 285   The strings can be composites, for instance
 286
 287      /(f)(o)(o)/
 288
 289   will result in a composite fixed substring 'foo'.
 290
 291   For each string some basic information is maintained:
 292
 293   - offset or min_offset
 294     This is the position the string must appear at, or not before.
 295     It also implicitly (when combined with minlenp) tells us how many
 296     characters must match before the string we are searching for.
 297     Likewise when combined with minlenp and the length of the string it
 298     tells us how many characters must appear after the string we have
 299     found.
 300
 301   - max_offset
 302     Only used for floating strings. This is the rightmost point that
 303     the string can appear at. If set to I32 max it indicates that the
 304     string can occur infinitely far to the right.
 305
 306   - minlenp
 307     A pointer to the minimum number of characters of the pattern that the
 308     string was found inside. This is important as in the case of positive
 309     lookahead or positive lookbehind we can have multiple patterns
 310     involved. Consider
 311
 312     /(?=FOO).*F/
 313
 314     The minimum length of the pattern overall is 3, the minimum length
 315     of the lookahead part is 3, but the minimum length of the part that
 316     will actually match is 1. So 'FOO's minimum length is 3, but the
 317     minimum length for the F is 1. This is important as the minimum length
 318     is used to determine offsets in front of and behind the string being
 319     looked for.  Since strings can be composites this is the length of the
 320     pattern at the time it was committed with a scan_commit. Note that
 321     the length is calculated by study_chunk, so that the minimum lengths
 322     are not known until the full pattern has been compiled, thus the
 323     pointer to the value.
 324
 325   - lookbehind
 326
 327     In the case of lookbehind the string being searched for can be
 328     offset past the start point of the final matching string.
 329     If this value was just blithely removed from the min_offset it would
 330     invalidate some of the calculations for how many chars must match
 331     before or after (as they are derived from min_offset and minlen and
 332     the length of the string being searched for).
 333     When the final pattern is compiled and the data is moved from the
 334     scan_data_t structure into the regexp structure the information
 335     about lookbehind is factored in, with the information that would
 336     have been lost precalculated in the end_shift field for the
 337     associated string.
 338
 339   The fields pos_min and pos_delta are used to store the minimum offset
 340   and the delta to the maximum offset at the current point in the pattern.
 341
 342 */
 343
 344 typedef struct scan_data_t {
 345     /*I32 len_min;      unused */
 346     /*I32 len_delta;    unused */
 347     I32 pos_min;
 348     I32 pos_delta;
 349     SV *last_found;
 350     I32 last_end;           /* min value, <0 unless valid. */
 351     I32 last_start_min;
 352     I32 last_start_max;
 353     SV **longest;           /* Either &l_fixed, or &l_float. */
 354     SV *longest_fixed;      /* longest fixed string found in pattern */
 355     I32 offset_fixed;       /* offset where it starts */
 356     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 357     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 358     SV *longest_float;      /* longest floating string found in pattern */
 359     I32 offset_float_min;   /* earliest point in string it can appear */
 360     I32 offset_float_max;   /* latest point in string it can appear */
 361     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 362     I32 lookbehind_float;   /* is the position of the string modified by LB */
 363     I32 flags;
 364     I32 whilem_c;
 365     I32 *last_closep;
 366     struct regnode_charclass_class *start_class;
 367 } scan_data_t;
 368
 369 /*
 370  * Forward declarations for pregcomp()'s friends.
 371  */
 372
 373 static const scan_data_t zero_scan_data =
 374   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 375
 376 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 377 #define SF_BEFORE_SEOL          0x0001
 378 #define SF_BEFORE_MEOL          0x0002
 379 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 380 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 381
 382 #ifdef NO_UNARY_PLUS
 383 #  define SF_FIX_SHIFT_EOL      (0+2)
 384 #  define SF_FL_SHIFT_EOL               (0+4)
 385 #else
 386 #  define SF_FIX_SHIFT_EOL      (+2)
 387 #  define SF_FL_SHIFT_EOL               (+4)
 388 #endif
 389
 390 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 391 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 392
 393 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 394 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 395 #define SF_IS_INF               0x0040
 396 #define SF_HAS_PAR              0x0080
 397 #define SF_IN_PAR               0x0100
 398 #define SF_HAS_EVAL             0x0200
 399 #define SCF_DO_SUBSTR           0x0400
 400 #define SCF_DO_STCLASS_AND      0x0800
 401 #define SCF_DO_STCLASS_OR       0x1000
 402 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 403 #define SCF_WHILEM_VISITED_POS  0x2000
 404
 405 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 406 #define SCF_SEEN_ACCEPT         0x8000
 407
 408 #define UTF cBOOL(RExC_utf8)
 409
 410 /* The enums for all these are ordered so things work out correctly */
 411 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 412 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 413 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 414 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 415 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 416 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 417 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 418
 419 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 420
 421 #define OOB_NAMEDCLASS          -1
 422
 423 /* There is no code point that is out-of-bounds, so this is problematic.  But
 424  * its only current use is to initialize a variable that is always set before
 425  * looked at. */
 426 #define OOB_UNICODE             0xDEADBEEF
 427
 428 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 429 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 430
 431
 432 /* length of regex to show in messages that don't mark a position within */
 433 #define RegexLengthToShowInErrorMessages 127
 434
 435 /*
 436  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 437  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 438  * op/pragma/warn/regcomp.
 439  */
 440 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 441 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 442
 443 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 444
 445 /*
 446  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 447  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 448  * "...".
 449  */
 450 #define _FAIL(code) STMT_START {                                        \
 451     const char *ellipses = "";                                          \
 452     IV len = RExC_end - RExC_precomp;                                   \
 453                                                                         \
 454     if (!SIZE_ONLY)                                                     \
 455         SAVEFREESV(RExC_rx_sv);                                         \
 456     if (len > RegexLengthToShowInErrorMessages) {                       \
 457         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 458         len = RegexLengthToShowInErrorMessages - 10;                    \
 459         ellipses = "...";                                               \
 460     }                                                                   \
 461     code;                                                               \
 462 } STMT_END
 463
 464 #define FAIL(msg) _FAIL(                            \
 465     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 466             msg, (int)len, RExC_precomp, ellipses))
 467
 468 #define FAIL2(msg,arg) _FAIL(                       \
 469     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 470             arg, (int)len, RExC_precomp, ellipses))
 471
 472 /*
 473  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 474  */
 475 #define Simple_vFAIL(m) STMT_START {                                    \
 476     const IV offset = RExC_parse - RExC_precomp;                        \
 477     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 478             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 479 } STMT_END
 480
 481 /*
 482  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 483  */
 484 #define vFAIL(m) STMT_START {                           \
 485     if (!SIZE_ONLY)                                     \
 486         SAVEFREESV(RExC_rx_sv);                         \
 487     Simple_vFAIL(m);                                    \
 488 } STMT_END
 489
 490 /*
 491  * Like Simple_vFAIL(), but accepts two arguments.
 492  */
 493 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 494     const IV offset = RExC_parse - RExC_precomp;                        \
 495     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 496             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 497 } STMT_END
 498
 499 /*
 500  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 501  */
 502 #define vFAIL2(m,a1) STMT_START {                       \
 503     if (!SIZE_ONLY)                                     \
 504         SAVEFREESV(RExC_rx_sv);                         \
 505     Simple_vFAIL2(m, a1);                               \
 506 } STMT_END
 507
 508
 509 /*
 510  * Like Simple_vFAIL(), but accepts three arguments.
 511  */
 512 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 513     const IV offset = RExC_parse - RExC_precomp;                \
 514     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 515             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 516 } STMT_END
 517
 518 /*
 519  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 520  */
 521 #define vFAIL3(m,a1,a2) STMT_START {                    \
 522     if (!SIZE_ONLY)                                     \
 523         SAVEFREESV(RExC_rx_sv);                         \
 524     Simple_vFAIL3(m, a1, a2);                           \
 525 } STMT_END
 526
 527 /*
 528  * Like Simple_vFAIL(), but accepts four arguments.
 529  */
 530 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 531     const IV offset = RExC_parse - RExC_precomp;                \
 532     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 533             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 534 } STMT_END
 535
 536 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 537     if (!SIZE_ONLY)                                     \
 538         SAVEFREESV(RExC_rx_sv);                         \
 539     Simple_vFAIL4(m, a1, a2, a3);                       \
 540 } STMT_END
 541
 542 /* m is not necessarily a "literal string", in this macro */
 543 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 544     const IV offset = loc - RExC_precomp;                               \
 545     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 546             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 547 } STMT_END
 548
 549 #define ckWARNreg(loc,m) STMT_START {                                   \
 550     const IV offset = loc - RExC_precomp;                               \
 551     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 552             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 553 } STMT_END
 554
 555 #define vWARN_dep(loc, m) STMT_START {                                  \
 556     const IV offset = loc - RExC_precomp;                               \
 557     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 558             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 559 } STMT_END
 560
 561 #define ckWARNdep(loc,m) STMT_START {                                   \
 562     const IV offset = loc - RExC_precomp;                               \
 563     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 564             m REPORT_LOCATION,                                          \
 565             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 566 } STMT_END
 567
 568 #define ckWARNregdep(loc,m) STMT_START {                                \
 569     const IV offset = loc - RExC_precomp;                               \
 570     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 571             m REPORT_LOCATION,                                          \
 572             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 573 } STMT_END
 574
 575 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 576     const IV offset = loc - RExC_precomp;                               \
 577     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 578             m REPORT_LOCATION,                                          \
 579             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 580 } STMT_END
 581
 582 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 583     const IV offset = loc - RExC_precomp;                               \
 584     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 585             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 586 } STMT_END
 587
 588 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 589     const IV offset = loc - RExC_precomp;                               \
 590     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 591             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 592 } STMT_END
 593
 594 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 595     const IV offset = loc - RExC_precomp;                               \
 596     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 597             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 598 } STMT_END
 599
 600 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 601     const IV offset = loc - RExC_precomp;                               \
 602     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 603             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 604 } STMT_END
 605
 606 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 607     const IV offset = loc - RExC_precomp;                               \
 608     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 609             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 610 } STMT_END
 611
 612 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 613     const IV offset = loc - RExC_precomp;                               \
 614     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 615             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 616 } STMT_END
 617
 618
 619 /* Allow for side effects in s */
 620 #define REGC(c,s) STMT_START {                  \
 621     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 622 } STMT_END
 623
 624 /* Macros for recording node offsets.   20001227 mjd@plover.com
 625  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 626  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 627  * Element 0 holds the number n.
 628  * Position is 1 indexed.
 629  */
 630 #ifndef RE_TRACK_PATTERN_OFFSETS
 631 #define Set_Node_Offset_To_R(node,byte)
 632 #define Set_Node_Offset(node,byte)
 633 #define Set_Cur_Node_Offset
 634 #define Set_Node_Length_To_R(node,len)
 635 #define Set_Node_Length(node,len)
 636 #define Set_Node_Cur_Length(node)
 637 #define Node_Offset(n)
 638 #define Node_Length(n)
 639 #define Set_Node_Offset_Length(node,offset,len)
 640 #define ProgLen(ri) ri->u.proglen
 641 #define SetProgLen(ri,x) ri->u.proglen = x
 642 #else
 643 #define ProgLen(ri) ri->u.offsets[0]
 644 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 645 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 646     if (! SIZE_ONLY) {                                                  \
 647         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 648                     __LINE__, (int)(node), (int)(byte)));               \
 649         if((node) < 0) {                                                \
 650             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 651         } else {                                                        \
 652             RExC_offsets[2*(node)-1] = (byte);                          \
 653         }                                                               \
 654     }                                                                   \
 655 } STMT_END
 656
 657 #define Set_Node_Offset(node,byte) \
 658     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 659 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 660
 661 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 662     if (! SIZE_ONLY) {                                                  \
 663         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 664                 __LINE__, (int)(node), (int)(len)));                    \
 665         if((node) < 0) {                                                \
 666             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 667         } else {                                                        \
 668             RExC_offsets[2*(node)] = (len);                             \
 669         }                                                               \
 670     }                                                                   \
 671 } STMT_END
 672
 673 #define Set_Node_Length(node,len) \
 674     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 675 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 676 #define Set_Node_Cur_Length(node) \
 677     Set_Node_Length(node, RExC_parse - parse_start)
 678
 679 /* Get offsets and lengths */
 680 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 681 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 682
 683 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 684     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 685     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 686 } STMT_END
 687 #endif
 688
 689 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 690 #define EXPERIMENTAL_INPLACESCAN
 691 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 692
 693 #define DEBUG_STUDYDATA(str,data,depth)                              \
 694 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 695     PerlIO_printf(Perl_debug_log,                                    \
 696         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 697         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 698         (int)(depth)*2, "",                                          \
 699         (IV)((data)->pos_min),                                       \
 700         (IV)((data)->pos_delta),                                     \
 701         (UV)((data)->flags),                                         \
 702         (IV)((data)->whilem_c),                                      \
 703         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 704         is_inf ? "INF " : ""                                         \
 705     );                                                               \
 706     if ((data)->last_found)                                          \
 707         PerlIO_printf(Perl_debug_log,                                \
 708             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 709             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 710             SvPVX_const((data)->last_found),                         \
 711             (IV)((data)->last_end),                                  \
 712             (IV)((data)->last_start_min),                            \
 713             (IV)((data)->last_start_max),                            \
 714             ((data)->longest &&                                      \
 715              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 716             SvPVX_const((data)->longest_fixed),                      \
 717             (IV)((data)->offset_fixed),                              \
 718             ((data)->longest &&                                      \
 719              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 720             SvPVX_const((data)->longest_float),                      \
 721             (IV)((data)->offset_float_min),                          \
 722             (IV)((data)->offset_float_max)                           \
 723         );                                                           \
 724     PerlIO_printf(Perl_debug_log,"\n");                              \
 725 });
 726
 727 /* Mark that we cannot extend a found fixed substring at this point.
 728    Update the longest found anchored substring and the longest found
 729    floating substrings if needed. */
 730
 731 STATIC void
 732 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 733 {
 734     const STRLEN l = CHR_SVLEN(data->last_found);
 735     const STRLEN old_l = CHR_SVLEN(*data->longest);
 736     GET_RE_DEBUG_FLAGS_DECL;
 737
 738     PERL_ARGS_ASSERT_SCAN_COMMIT;
 739
 740     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 741         SvSetMagicSV(*data->longest, data->last_found);
 742         if (*data->longest == data->longest_fixed) {
 743             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 744             if (data->flags & SF_BEFORE_EOL)
 745                 data->flags
 746                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 747             else
 748                 data->flags &= ~SF_FIX_BEFORE_EOL;
 749             data->minlen_fixed=minlenp;
 750             data->lookbehind_fixed=0;
 751         }
 752         else { /* *data->longest == data->longest_float */
 753             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 754             data->offset_float_max = (l
 755                                       ? data->last_start_max
 756                                       : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
 757             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 758                 data->offset_float_max = I32_MAX;
 759             if (data->flags & SF_BEFORE_EOL)
 760                 data->flags
 761                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 762             else
 763                 data->flags &= ~SF_FL_BEFORE_EOL;
 764             data->minlen_float=minlenp;
 765             data->lookbehind_float=0;
 766         }
 767     }
 768     SvCUR_set(data->last_found, 0);
 769     {
 770         SV * const sv = data->last_found;
 771         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 772             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 773             if (mg)
 774                 mg->mg_len = 0;
 775         }
 776     }
 777     data->last_end = -1;
 778     data->flags &= ~SF_BEFORE_EOL;
 779     DEBUG_STUDYDATA("commit: ",data,0);
 780 }
 781
 782 /* These macros set, clear and test whether the synthetic start class ('ssc',
 783  * given by the parameter) matches an empty string (EOS).  This uses the
 784  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 785  * stands alone, so there is never a next_off, so this field is otherwise
 786  * unused.  The EOS information is used only for compilation, but theoretically
 787  * it could be passed on to the execution code.  This could be used to store
 788  * more than one bit of information, but only this one is currently used. */
 789 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 790 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 791 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 792
 793 /* Can match anything (initialization) */
 794 STATIC void
 795 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 796 {
 797     PERL_ARGS_ASSERT_CL_ANYTHING;
 798
 799     ANYOF_BITMAP_SETALL(cl);
 800     cl->flags = ANYOF_UNICODE_ALL;
 801     SET_SSC_EOS(cl);
 802
 803     /* If any portion of the regex is to operate under locale rules,
 804      * initialization includes it.  The reason this isn't done for all regexes
 805      * is that the optimizer was written under the assumption that locale was
 806      * all-or-nothing.  Given the complexity and lack of documentation in the
 807      * optimizer, and that there are inadequate test cases for locale, so many
 808      * parts of it may not work properly, it is safest to avoid locale unless
 809      * necessary. */
 810     if (RExC_contains_locale) {
 811         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 812         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 813     }
 814     else {
 815         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 816     }
 817 }
 818
 819 /* Can match anything (initialization) */
 820 STATIC int
 821 S_cl_is_anything(const struct regnode_charclass_class *cl)
 822 {
 823     int value;
 824
 825     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 826
 827     for (value = 0; value < ANYOF_MAX; value += 2)
 828         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 829             return 1;
 830     if (!(cl->flags & ANYOF_UNICODE_ALL))
 831         return 0;
 832     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 833         return 0;
 834     return 1;
 835 }
 836
 837 /* Can match anything (initialization) */
 838 STATIC void
 839 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 840 {
 841     PERL_ARGS_ASSERT_CL_INIT;
 842
 843     Zero(cl, 1, struct regnode_charclass_class);
 844     cl->type = ANYOF;
 845     cl_anything(pRExC_state, cl);
 846     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 847 }
 848
 849 /* These two functions currently do the exact same thing */
 850 #define cl_init_zero            S_cl_init
 851
 852 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 853  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 854  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 855 STATIC void
 856 S_cl_and(struct regnode_charclass_class *cl,
 857         const struct regnode_charclass_class *and_with)
 858 {
 859     PERL_ARGS_ASSERT_CL_AND;
 860
 861     assert(PL_regkind[and_with->type] == ANYOF);
 862
 863     /* I (khw) am not sure all these restrictions are necessary XXX */
 864     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 865         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 866         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 867         && !(and_with->flags & ANYOF_LOC_FOLD)
 868         && !(cl->flags & ANYOF_LOC_FOLD)) {
 869         int i;
 870
 871         if (and_with->flags & ANYOF_INVERT)
 872             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 873                 cl->bitmap[i] &= ~and_with->bitmap[i];
 874         else
 875             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 876                 cl->bitmap[i] &= and_with->bitmap[i];
 877     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 878
 879     if (and_with->flags & ANYOF_INVERT) {
 880
 881         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 882          * aren't affected by the inversion.  Those that are affected are
 883          * handled individually below */
 884         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 885         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 886         cl->flags |= affected_flags;
 887
 888         /* We currently don't know how to deal with things that aren't in the
 889          * bitmap, but we know that the intersection is no greater than what
 890          * is already in cl, so let there be false positives that get sorted
 891          * out after the synthetic start class succeeds, and the node is
 892          * matched for real. */
 893
 894         /* The inversion of these two flags indicate that the resulting
 895          * intersection doesn't have them */
 896         if (and_with->flags & ANYOF_UNICODE_ALL) {
 897             cl->flags &= ~ANYOF_UNICODE_ALL;
 898         }
 899         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 900             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 901         }
 902     }
 903     else {   /* and'd node is not inverted */
 904         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 905
 906         if (! ANYOF_NONBITMAP(and_with)) {
 907
 908             /* Here 'and_with' doesn't match anything outside the bitmap
 909              * (except possibly ANYOF_UNICODE_ALL), which means the
 910              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 911              * which case we don't know what the intersection is, but it's no
 912              * greater than what cl already has, so can just leave it alone,
 913              * with possible false positives */
 914             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 915                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 916                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 917             }
 918         }
 919         else if (! ANYOF_NONBITMAP(cl)) {
 920
 921             /* Here, 'and_with' does match something outside the bitmap, and cl
 922              * doesn't have a list of things to match outside the bitmap.  If
 923              * cl can match all code points above 255, the intersection will
 924              * be those above-255 code points that 'and_with' matches.  If cl
 925              * can't match all Unicode code points, it means that it can't
 926              * match anything outside the bitmap (since the 'if' that got us
 927              * into this block tested for that), so we leave the bitmap empty.
 928              */
 929             if (cl->flags & ANYOF_UNICODE_ALL) {
 930                 ARG_SET(cl, ARG(and_with));
 931
 932                 /* and_with's ARG may match things that don't require UTF8.
 933                  * And now cl's will too, in spite of this being an 'and'.  See
 934                  * the comments below about the kludge */
 935                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 936             }
 937         }
 938         else {
 939             /* Here, both 'and_with' and cl match something outside the
 940              * bitmap.  Currently we do not do the intersection, so just match
 941              * whatever cl had at the beginning.  */
 942         }
 943
 944
 945         /* Take the intersection of the two sets of flags.  However, the
 946          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 947          * kludge around the fact that this flag is not treated like the others
 948          * which are initialized in cl_anything().  The way the optimizer works
 949          * is that the synthetic start class (SSC) is initialized to match
 950          * anything, and then the first time a real node is encountered, its
 951          * values are AND'd with the SSC's with the result being the values of
 952          * the real node.  However, there are paths through the optimizer where
 953          * the AND never gets called, so those initialized bits are set
 954          * inappropriately, which is not usually a big deal, as they just cause
 955          * false positives in the SSC, which will just mean a probably
 956          * imperceptible slow down in execution.  However this bit has a
 957          * higher false positive consequence in that it can cause utf8.pm,
 958          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 959          * bigger slowdown and also causes significant extra memory to be used.
 960          * In order to prevent this, the code now takes a different tack.  The
 961          * bit isn't set unless some part of the regular expression needs it,
 962          * but once set it won't get cleared.  This means that these extra
 963          * modules won't get loaded unless there was some path through the
 964          * pattern that would have required them anyway, and  so any false
 965          * positives that occur by not ANDing them out when they could be
 966          * aren't as severe as they would be if we treated this bit like all
 967          * the others */
 968         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 969                                       & ANYOF_NONBITMAP_NON_UTF8;
 970         cl->flags &= and_with->flags;
 971         cl->flags |= outside_bitmap_but_not_utf8;
 972     }
 973 }
 974
 975 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 976  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 977  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 978 STATIC void
 979 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 980 {
 981     PERL_ARGS_ASSERT_CL_OR;
 982
 983     if (or_with->flags & ANYOF_INVERT) {
 984
 985         /* Here, the or'd node is to be inverted.  This means we take the
 986          * complement of everything not in the bitmap, but currently we don't
 987          * know what that is, so give up and match anything */
 988         if (ANYOF_NONBITMAP(or_with)) {
 989             cl_anything(pRExC_state, cl);
 990         }
 991         /* We do not use
 992          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 993          *   <= (B1 | !B2) | (CL1 | !CL2)
 994          * which is wasteful if CL2 is small, but we ignore CL2:
 995          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 996          * XXXX Can we handle case-fold?  Unclear:
 997          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 998          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 999          */
1000         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1001              && !(or_with->flags & ANYOF_LOC_FOLD)
1002              && !(cl->flags & ANYOF_LOC_FOLD) ) {
1003             int i;
1004
1005             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1006                 cl->bitmap[i] |= ~or_with->bitmap[i];
1007         } /* XXXX: logic is complicated otherwise */
1008         else {
1009             cl_anything(pRExC_state, cl);
1010         }
1011
1012         /* And, we can just take the union of the flags that aren't affected
1013          * by the inversion */
1014         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1015
1016         /* For the remaining flags:
1017             ANYOF_UNICODE_ALL and inverted means to not match anything above
1018                     255, which means that the union with cl should just be
1019                     what cl has in it, so can ignore this flag
1020             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1021                     is 127-255 to match them, but then invert that, so the
1022                     union with cl should just be what cl has in it, so can
1023                     ignore this flag
1024          */
1025     } else {    /* 'or_with' is not inverted */
1026         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1027         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1028              && (!(or_with->flags & ANYOF_LOC_FOLD)
1029                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
1030             int i;
1031
1032             /* OR char bitmap and class bitmap separately */
1033             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1034                 cl->bitmap[i] |= or_with->bitmap[i];
1035             if (or_with->flags & ANYOF_CLASS) {
1036                 ANYOF_CLASS_OR(or_with, cl);
1037             }
1038         }
1039         else { /* XXXX: logic is complicated, leave it along for a moment. */
1040             cl_anything(pRExC_state, cl);
1041         }
1042
1043         if (ANYOF_NONBITMAP(or_with)) {
1044
1045             /* Use the added node's outside-the-bit-map match if there isn't a
1046              * conflict.  If there is a conflict (both nodes match something
1047              * outside the bitmap, but what they match outside is not the same
1048              * pointer, and hence not easily compared until XXX we extend
1049              * inversion lists this far), give up and allow the start class to
1050              * match everything outside the bitmap.  If that stuff is all above
1051              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1052             if (! ANYOF_NONBITMAP(cl)) {
1053                 ARG_SET(cl, ARG(or_with));
1054             }
1055             else if (ARG(cl) != ARG(or_with)) {
1056
1057                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1058                     cl_anything(pRExC_state, cl);
1059                 }
1060                 else {
1061                     cl->flags |= ANYOF_UNICODE_ALL;
1062                 }
1063             }
1064         }
1065
1066         /* Take the union */
1067         cl->flags |= or_with->flags;
1068     }
1069 }
1070
1071 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1072 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1073 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1074 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1075
1076
1077 #ifdef DEBUGGING
1078 /*
1079    dump_trie(trie,widecharmap,revcharmap)
1080    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1081    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1082
1083    These routines dump out a trie in a somewhat readable format.
1084    The _interim_ variants are used for debugging the interim
1085    tables that are used to generate the final compressed
1086    representation which is what dump_trie expects.
1087
1088    Part of the reason for their existence is to provide a form
1089    of documentation as to how the different representations function.
1090
1091 */
1092
1093 /*
1094   Dumps the final compressed table form of the trie to Perl_debug_log.
1095   Used for debugging make_trie().
1096 */
1097
1098 STATIC void
1099 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1100             AV *revcharmap, U32 depth)
1101 {
1102     U32 state;
1103     SV *sv=sv_newmortal();
1104     int colwidth= widecharmap ? 6 : 4;
1105     U16 word;
1106     GET_RE_DEBUG_FLAGS_DECL;
1107
1108     PERL_ARGS_ASSERT_DUMP_TRIE;
1109
1110     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1111         (int)depth * 2 + 2,"",
1112         "Match","Base","Ofs" );
1113
1114     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1115         SV ** const tmp = av_fetch( revcharmap, state, 0);
1116         if ( tmp ) {
1117             PerlIO_printf( Perl_debug_log, "%*s",
1118                 colwidth,
1119                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1120                             PL_colors[0], PL_colors[1],
1121                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1122                             PERL_PV_ESCAPE_FIRSTCHAR
1123                 )
1124             );
1125         }
1126     }
1127     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1128         (int)depth * 2 + 2,"");
1129
1130     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1131         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1132     PerlIO_printf( Perl_debug_log, "\n");
1133
1134     for( state = 1 ; state < trie->statecount ; state++ ) {
1135         const U32 base = trie->states[ state ].trans.base;
1136
1137         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1138
1139         if ( trie->states[ state ].wordnum ) {
1140             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1141         } else {
1142             PerlIO_printf( Perl_debug_log, "%6s", "" );
1143         }
1144
1145         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1146
1147         if ( base ) {
1148             U32 ofs = 0;
1149
1150             while( ( base + ofs  < trie->uniquecharcount ) ||
1151                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1152                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1153                     ofs++;
1154
1155             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1156
1157             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1158                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1159                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1160                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1161                 {
1162                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1163                     colwidth,
1164                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1165                 } else {
1166                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1167                 }
1168             }
1169
1170             PerlIO_printf( Perl_debug_log, "]");
1171
1172         }
1173         PerlIO_printf( Perl_debug_log, "\n" );
1174     }
1175     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1176     for (word=1; word <= trie->wordcount; word++) {
1177         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1178             (int)word, (int)(trie->wordinfo[word].prev),
1179             (int)(trie->wordinfo[word].len));
1180     }
1181     PerlIO_printf(Perl_debug_log, "\n" );
1182 }
1183 /*
1184   Dumps a fully constructed but uncompressed trie in list form.
1185   List tries normally only are used for construction when the number of
1186   possible chars (trie->uniquecharcount) is very high.
1187   Used for debugging make_trie().
1188 */
1189 STATIC void
1190 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1191                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1192                          U32 depth)
1193 {
1194     U32 state;
1195     SV *sv=sv_newmortal();
1196     int colwidth= widecharmap ? 6 : 4;
1197     GET_RE_DEBUG_FLAGS_DECL;
1198
1199     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1200
1201     /* print out the table precompression.  */
1202     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1203         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1204         "------:-----+-----------------\n" );
1205
1206     for( state=1 ; state < next_alloc ; state ++ ) {
1207         U16 charid;
1208
1209         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1210             (int)depth * 2 + 2,"", (UV)state  );
1211         if ( ! trie->states[ state ].wordnum ) {
1212             PerlIO_printf( Perl_debug_log, "%5s| ","");
1213         } else {
1214             PerlIO_printf( Perl_debug_log, "W%4x| ",
1215                 trie->states[ state ].wordnum
1216             );
1217         }
1218         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1219             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1220             if ( tmp ) {
1221                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1222                     colwidth,
1223                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1224                             PL_colors[0], PL_colors[1],
1225                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1226                             PERL_PV_ESCAPE_FIRSTCHAR
1227                     ) ,
1228                     TRIE_LIST_ITEM(state,charid).forid,
1229                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1230                 );
1231                 if (!(charid % 10))
1232                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1233                         (int)((depth * 2) + 14), "");
1234             }
1235         }
1236         PerlIO_printf( Perl_debug_log, "\n");
1237     }
1238 }
1239
1240 /*
1241   Dumps a fully constructed but uncompressed trie in table form.
1242   This is the normal DFA style state transition table, with a few
1243   twists to facilitate compression later.
1244   Used for debugging make_trie().
1245 */
1246 STATIC void
1247 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1248                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1249                           U32 depth)
1250 {
1251     U32 state;
1252     U16 charid;
1253     SV *sv=sv_newmortal();
1254     int colwidth= widecharmap ? 6 : 4;
1255     GET_RE_DEBUG_FLAGS_DECL;
1256
1257     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1258
1259     /*
1260        print out the table precompression so that we can do a visual check
1261        that they are identical.
1262      */
1263
1264     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1265
1266     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1267         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1268         if ( tmp ) {
1269             PerlIO_printf( Perl_debug_log, "%*s",
1270                 colwidth,
1271                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1272                             PL_colors[0], PL_colors[1],
1273                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1274                             PERL_PV_ESCAPE_FIRSTCHAR
1275                 )
1276             );
1277         }
1278     }
1279
1280     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1281
1282     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1283         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1284     }
1285
1286     PerlIO_printf( Perl_debug_log, "\n" );
1287
1288     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1289
1290         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1291             (int)depth * 2 + 2,"",
1292             (UV)TRIE_NODENUM( state ) );
1293
1294         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1295             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1296             if (v)
1297                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1298             else
1299                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1300         }
1301         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1302             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1303         } else {
1304             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1305             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1306         }
1307     }
1308 }
1309
1310 #endif
1311
1312
1313 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1314   startbranch: the first branch in the whole branch sequence
1315   first      : start branch of sequence of branch-exact nodes.
1316                May be the same as startbranch
1317   last       : Thing following the last branch.
1318                May be the same as tail.
1319   tail       : item following the branch sequence
1320   count      : words in the sequence
1321   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1322   depth      : indent depth
1323
1324 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1325
1326 A trie is an N'ary tree where the branches are determined by digital
1327 decomposition of the key. IE, at the root node you look up the 1st character and
1328 follow that branch repeat until you find the end of the branches. Nodes can be
1329 marked as "accepting" meaning they represent a complete word. Eg:
1330
1331   /he|she|his|hers/
1332
1333 would convert into the following structure. Numbers represent states, letters
1334 following numbers represent valid transitions on the letter from that state, if
1335 the number is in square brackets it represents an accepting state, otherwise it
1336 will be in parenthesis.
1337
1338       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1339       |    |
1340       |   (2)
1341       |    |
1342      (1)   +-i->(6)-+-s->[7]
1343       |
1344       +-s->(3)-+-h->(4)-+-e->[5]
1345
1346       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1347
1348 This shows that when matching against the string 'hers' we will begin at state 1
1349 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1350 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1351 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1352 single traverse. We store a mapping from accepting to state to which word was
1353 matched, and then when we have multiple possibilities we try to complete the
1354 rest of the regex in the order in which they occured in the alternation.
1355
1356 The only prior NFA like behaviour that would be changed by the TRIE support is
1357 the silent ignoring of duplicate alternations which are of the form:
1358
1359  / (DUPE|DUPE) X? (?{ ... }) Y /x
1360
1361 Thus EVAL blocks following a trie may be called a different number of times with
1362 and without the optimisation. With the optimisations dupes will be silently
1363 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1364 the following demonstrates:
1365
1366  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1367
1368 which prints out 'word' three times, but
1369
1370  'words'=~/(word|word|word)(?{ print $1 })S/
1371
1372 which doesnt print it out at all. This is due to other optimisations kicking in.
1373
1374 Example of what happens on a structural level:
1375
1376 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1377
1378    1: CURLYM[1] {1,32767}(18)
1379    5:   BRANCH(8)
1380    6:     EXACT <ac>(16)
1381    8:   BRANCH(11)
1382    9:     EXACT <ad>(16)
1383   11:   BRANCH(14)
1384   12:     EXACT <ab>(16)
1385   16:   SUCCEED(0)
1386   17:   NOTHING(18)
1387   18: END(0)
1388
1389 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1390 and should turn into:
1391
1392    1: CURLYM[1] {1,32767}(18)
1393    5:   TRIE(16)
1394         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1395           <ac>
1396           <ad>
1397           <ab>
1398   16:   SUCCEED(0)
1399   17:   NOTHING(18)
1400   18: END(0)
1401
1402 Cases where tail != last would be like /(?foo|bar)baz/:
1403
1404    1: BRANCH(4)
1405    2:   EXACT <foo>(8)
1406    4: BRANCH(7)
1407    5:   EXACT <bar>(8)
1408    7: TAIL(8)
1409    8: EXACT <baz>(10)
1410   10: END(0)
1411
1412 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1413 and would end up looking like:
1414
1415     1: TRIE(8)
1416       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1417         <foo>
1418         <bar>
1419    7: TAIL(8)
1420    8: EXACT <baz>(10)
1421   10: END(0)
1422
1423     d = uvuni_to_utf8_flags(d, uv, 0);
1424
1425 is the recommended Unicode-aware way of saying
1426
1427     *(d++) = uv;
1428 */
1429
1430 #define TRIE_STORE_REVCHAR(val)                                            \
1431     STMT_START {                                                           \
1432         if (UTF) {                                                         \
1433             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1434             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1435             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1436             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1437             SvPOK_on(zlopp);                                               \
1438             SvUTF8_on(zlopp);                                              \
1439             av_push(revcharmap, zlopp);                                    \
1440         } else {                                                           \
1441             char ooooff = (char)val;                                           \
1442             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1443         }                                                                  \
1444         } STMT_END
1445
1446 #define TRIE_READ_CHAR STMT_START {                                                     \
1447     wordlen++;                                                                          \
1448     if ( UTF ) {                                                                        \
1449         /* if it is UTF then it is either already folded, or does not need folding */   \
1450         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1451     }                                                                                   \
1452     else if (folder == PL_fold_latin1) {                                                \
1453         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1454         if ( foldlen > 0 ) {                                                            \
1455            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1456            foldlen -= len;                                                              \
1457            scan += len;                                                                 \
1458            len = 0;                                                                     \
1459         } else {                                                                        \
1460             len = 1;                                                                    \
1461             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1462             skiplen = UNISKIP(uvc);                                                     \
1463             foldlen -= skiplen;                                                         \
1464             scan = foldbuf + skiplen;                                                   \
1465         }                                                                               \
1466     } else {                                                                            \
1467         /* raw data, will be folded later if needed */                                  \
1468         uvc = (U32)*uc;                                                                 \
1469         len = 1;                                                                        \
1470     }                                                                                   \
1471 } STMT_END
1472
1473
1474
1475 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1476     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1477         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1478         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1479     }                                                           \
1480     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1481     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1482     TRIE_LIST_CUR( state )++;                                   \
1483 } STMT_END
1484
1485 #define TRIE_LIST_NEW(state) STMT_START {                       \
1486     Newxz( trie->states[ state ].trans.list,               \
1487         4, reg_trie_trans_le );                                 \
1488      TRIE_LIST_CUR( state ) = 1;                                \
1489      TRIE_LIST_LEN( state ) = 4;                                \
1490 } STMT_END
1491
1492 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1493     U16 dupe= trie->states[ state ].wordnum;                    \
1494     regnode * const noper_next = regnext( noper );              \
1495                                                                 \
1496     DEBUG_r({                                                   \
1497         /* store the word for dumping */                        \
1498         SV* tmp;                                                \
1499         if (OP(noper) != NOTHING)                               \
1500             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1501         else                                                    \
1502             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1503         av_push( trie_words, tmp );                             \
1504     });                                                         \
1505                                                                 \
1506     curword++;                                                  \
1507     trie->wordinfo[curword].prev   = 0;                         \
1508     trie->wordinfo[curword].len    = wordlen;                   \
1509     trie->wordinfo[curword].accept = state;                     \
1510                                                                 \
1511     if ( noper_next < tail ) {                                  \
1512         if (!trie->jump)                                        \
1513             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1514         trie->jump[curword] = (U16)(noper_next - convert);      \
1515         if (!jumper)                                            \
1516             jumper = noper_next;                                \
1517         if (!nextbranch)                                        \
1518             nextbranch= regnext(cur);                           \
1519     }                                                           \
1520                                                                 \
1521     if ( dupe ) {                                               \
1522         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1523         /* chain, so that when the bits of chain are later    */\
1524         /* linked together, the dups appear in the chain      */\
1525         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1526         trie->wordinfo[dupe].prev = curword;                    \
1527     } else {                                                    \
1528         /* we haven't inserted this word yet.                */ \
1529         trie->states[ state ].wordnum = curword;                \
1530     }                                                           \
1531 } STMT_END
1532
1533
1534 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1535      ( ( base + charid >=  ucharcount                                   \
1536          && base + charid < ubound                                      \
1537          && state == trie->trans[ base - ucharcount + charid ].check    \
1538          && trie->trans[ base - ucharcount + charid ].next )            \
1539            ? trie->trans[ base - ucharcount + charid ].next             \
1540            : ( state==1 ? special : 0 )                                 \
1541       )
1542
1543 #define MADE_TRIE       1
1544 #define MADE_JUMP_TRIE  2
1545 #define MADE_EXACT_TRIE 4
1546
1547 STATIC I32
1548 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1549 {
1550     dVAR;
1551     /* first pass, loop through and scan words */
1552     reg_trie_data *trie;
1553     HV *widecharmap = NULL;
1554     AV *revcharmap = newAV();
1555     regnode *cur;
1556     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1557     STRLEN len = 0;
1558     UV uvc = 0;
1559     U16 curword = 0;
1560     U32 next_alloc = 0;
1561     regnode *jumper = NULL;
1562     regnode *nextbranch = NULL;
1563     regnode *convert = NULL;
1564     U32 *prev_states; /* temp array mapping each state to previous one */
1565     /* we just use folder as a flag in utf8 */
1566     const U8 * folder = NULL;
1567
1568 #ifdef DEBUGGING
1569     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1570     AV *trie_words = NULL;
1571     /* along with revcharmap, this only used during construction but both are
1572      * useful during debugging so we store them in the struct when debugging.
1573      */
1574 #else
1575     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1576     STRLEN trie_charcount=0;
1577 #endif
1578     SV *re_trie_maxbuff;
1579     GET_RE_DEBUG_FLAGS_DECL;
1580
1581     PERL_ARGS_ASSERT_MAKE_TRIE;
1582 #ifndef DEBUGGING
1583     PERL_UNUSED_ARG(depth);
1584 #endif
1585
1586     switch (flags) {
1587         case EXACT: break;
1588         case EXACTFA:
1589         case EXACTFU_SS:
1590         case EXACTFU_TRICKYFOLD:
1591         case EXACTFU: folder = PL_fold_latin1; break;
1592         case EXACTF:  folder = PL_fold; break;
1593         case EXACTFL: folder = PL_fold_locale; break;
1594         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1595     }
1596
1597     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1598     trie->refcount = 1;
1599     trie->startstate = 1;
1600     trie->wordcount = word_count;
1601     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1602     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1603     if (flags == EXACT)
1604         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1605     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1606                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1607
1608     DEBUG_r({
1609         trie_words = newAV();
1610     });
1611
1612     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1613     if (!SvIOK(re_trie_maxbuff)) {
1614         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1615     }
1616     DEBUG_TRIE_COMPILE_r({
1617                 PerlIO_printf( Perl_debug_log,
1618                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1619                   (int)depth * 2 + 2, "",
1620                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1621                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1622                   (int)depth);
1623     });
1624
1625    /* Find the node we are going to overwrite */
1626     if ( first == startbranch && OP( last ) != BRANCH ) {
1627         /* whole branch chain */
1628         convert = first;
1629     } else {
1630         /* branch sub-chain */
1631         convert = NEXTOPER( first );
1632     }
1633
1634     /*  -- First loop and Setup --
1635
1636        We first traverse the branches and scan each word to determine if it
1637        contains widechars, and how many unique chars there are, this is
1638        important as we have to build a table with at least as many columns as we
1639        have unique chars.
1640
1641        We use an array of integers to represent the character codes 0..255
1642        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1643        native representation of the character value as the key and IV's for the
1644        coded index.
1645
1646        *TODO* If we keep track of how many times each character is used we can
1647        remap the columns so that the table compression later on is more
1648        efficient in terms of memory by ensuring the most common value is in the
1649        middle and the least common are on the outside.  IMO this would be better
1650        than a most to least common mapping as theres a decent chance the most
1651        common letter will share a node with the least common, meaning the node
1652        will not be compressible. With a middle is most common approach the worst
1653        case is when we have the least common nodes twice.
1654
1655      */
1656
1657     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1658         regnode *noper = NEXTOPER( cur );
1659         const U8 *uc = (U8*)STRING( noper );
1660         const U8 *e  = uc + STR_LEN( noper );
1661         STRLEN foldlen = 0;
1662         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1663         STRLEN skiplen = 0;
1664         const U8 *scan = (U8*)NULL;
1665         U32 wordlen      = 0;         /* required init */
1666         STRLEN chars = 0;
1667         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1668
1669         if (OP(noper) == NOTHING) {
1670             regnode *noper_next= regnext(noper);
1671             if (noper_next != tail && OP(noper_next) == flags) {
1672                 noper = noper_next;
1673                 uc= (U8*)STRING(noper);
1674                 e= uc + STR_LEN(noper);
1675                 trie->minlen= STR_LEN(noper);
1676             } else {
1677                 trie->minlen= 0;
1678                 continue;
1679             }
1680         }
1681
1682         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1683             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1684                                           regardless of encoding */
1685             if (OP( noper ) == EXACTFU_SS) {
1686                 /* false positives are ok, so just set this */
1687                 TRIE_BITMAP_SET(trie,0xDF);
1688             }
1689         }
1690         for ( ; uc < e ; uc += len ) {
1691             TRIE_CHARCOUNT(trie)++;
1692             TRIE_READ_CHAR;
1693             chars++;
1694             if ( uvc < 256 ) {
1695                 if ( folder ) {
1696                     U8 folded= folder[ (U8) uvc ];
1697                     if ( !trie->charmap[ folded ] ) {
1698                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1699                         TRIE_STORE_REVCHAR( folded );
1700                     }
1701                 }
1702                 if ( !trie->charmap[ uvc ] ) {
1703                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1704                     TRIE_STORE_REVCHAR( uvc );
1705                 }
1706                 if ( set_bit ) {
1707                     /* store the codepoint in the bitmap, and its folded
1708                      * equivalent. */
1709                     TRIE_BITMAP_SET(trie, uvc);
1710
1711                     /* store the folded codepoint */
1712                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1713
1714                     if ( !UTF ) {
1715                         /* store first byte of utf8 representation of
1716                            variant codepoints */
1717                         if (! UNI_IS_INVARIANT(uvc)) {
1718                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1719                         }
1720                     }
1721                     set_bit = 0; /* We've done our bit :-) */
1722                 }
1723             } else {
1724                 SV** svpp;
1725                 if ( !widecharmap )
1726                     widecharmap = newHV();
1727
1728                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1729
1730                 if ( !svpp )
1731                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1732
1733                 if ( !SvTRUE( *svpp ) ) {
1734                     sv_setiv( *svpp, ++trie->uniquecharcount );
1735                     TRIE_STORE_REVCHAR(uvc);
1736                 }
1737             }
1738         }
1739         if( cur == first ) {
1740             trie->minlen = chars;
1741             trie->maxlen = chars;
1742         } else if (chars < trie->minlen) {
1743             trie->minlen = chars;
1744         } else if (chars > trie->maxlen) {
1745             trie->maxlen = chars;
1746         }
1747         if (OP( noper ) == EXACTFU_SS) {
1748             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1749             if (trie->minlen > 1)
1750                 trie->minlen= 1;
1751         }
1752         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1753             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1754              *                - We assume that any such sequence might match a 2 byte string */
1755             if (trie->minlen > 2 )
1756                 trie->minlen= 2;
1757         }
1758
1759     } /* end first pass */
1760     DEBUG_TRIE_COMPILE_r(
1761         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1762                 (int)depth * 2 + 2,"",
1763                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1764                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1765                 (int)trie->minlen, (int)trie->maxlen )
1766     );
1767
1768     /*
1769         We now know what we are dealing with in terms of unique chars and
1770         string sizes so we can calculate how much memory a naive
1771         representation using a flat table  will take. If it's over a reasonable
1772         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1773         conservative but potentially much slower representation using an array
1774         of lists.
1775
1776         At the end we convert both representations into the same compressed
1777         form that will be used in regexec.c for matching with. The latter
1778         is a form that cannot be used to construct with but has memory
1779         properties similar to the list form and access properties similar
1780         to the table form making it both suitable for fast searches and
1781         small enough that its feasable to store for the duration of a program.
1782
1783         See the comment in the code where the compressed table is produced
1784         inplace from the flat tabe representation for an explanation of how
1785         the compression works.
1786
1787     */
1788
1789
1790     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1791     prev_states[1] = 0;
1792
1793     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1794         /*
1795             Second Pass -- Array Of Lists Representation
1796
1797             Each state will be represented by a list of charid:state records
1798             (reg_trie_trans_le) the first such element holds the CUR and LEN
1799             points of the allocated array. (See defines above).
1800
1801             We build the initial structure using the lists, and then convert
1802             it into the compressed table form which allows faster lookups
1803             (but cant be modified once converted).
1804         */
1805
1806         STRLEN transcount = 1;
1807
1808         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1809             "%*sCompiling trie using list compiler\n",
1810             (int)depth * 2 + 2, ""));
1811
1812         trie->states = (reg_trie_state *)
1813             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1814                                   sizeof(reg_trie_state) );
1815         TRIE_LIST_NEW(1);
1816         next_alloc = 2;
1817
1818         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1819
1820             regnode *noper   = NEXTOPER( cur );
1821             U8 *uc           = (U8*)STRING( noper );
1822             const U8 *e      = uc + STR_LEN( noper );
1823             U32 state        = 1;         /* required init */
1824             U16 charid       = 0;         /* sanity init */
1825             U8 *scan         = (U8*)NULL; /* sanity init */
1826             STRLEN foldlen   = 0;         /* required init */
1827             U32 wordlen      = 0;         /* required init */
1828             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1829             STRLEN skiplen   = 0;
1830
1831             if (OP(noper) == NOTHING) {
1832                 regnode *noper_next= regnext(noper);
1833                 if (noper_next != tail && OP(noper_next) == flags) {
1834                     noper = noper_next;
1835                     uc= (U8*)STRING(noper);
1836                     e= uc + STR_LEN(noper);
1837                 }
1838             }
1839
1840             if (OP(noper) != NOTHING) {
1841                 for ( ; uc < e ; uc += len ) {
1842
1843                     TRIE_READ_CHAR;
1844
1845                     if ( uvc < 256 ) {
1846                         charid = trie->charmap[ uvc ];
1847                     } else {
1848                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1849                         if ( !svpp ) {
1850                             charid = 0;
1851                         } else {
1852                             charid=(U16)SvIV( *svpp );
1853                         }
1854                     }
1855                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1856                     if ( charid ) {
1857
1858                         U16 check;
1859                         U32 newstate = 0;
1860
1861                         charid--;
1862                         if ( !trie->states[ state ].trans.list ) {
1863                             TRIE_LIST_NEW( state );
1864                         }
1865                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1866                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1867                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1868                                 break;
1869                             }
1870                         }
1871                         if ( ! newstate ) {
1872                             newstate = next_alloc++;
1873                             prev_states[newstate] = state;
1874                             TRIE_LIST_PUSH( state, charid, newstate );
1875                             transcount++;
1876                         }
1877                         state = newstate;
1878                     } else {
1879                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1880                     }
1881                 }
1882             }
1883             TRIE_HANDLE_WORD(state);
1884
1885         } /* end second pass */
1886
1887         /* next alloc is the NEXT state to be allocated */
1888         trie->statecount = next_alloc;
1889         trie->states = (reg_trie_state *)
1890             PerlMemShared_realloc( trie->states,
1891                                    next_alloc
1892                                    * sizeof(reg_trie_state) );
1893
1894         /* and now dump it out before we compress it */
1895         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1896                                                          revcharmap, next_alloc,
1897                                                          depth+1)
1898         );
1899
1900         trie->trans = (reg_trie_trans *)
1901             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1902         {
1903             U32 state;
1904             U32 tp = 0;
1905             U32 zp = 0;
1906
1907
1908             for( state=1 ; state < next_alloc ; state ++ ) {
1909                 U32 base=0;
1910
1911                 /*
1912                 DEBUG_TRIE_COMPILE_MORE_r(
1913                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1914                 );
1915                 */
1916
1917                 if (trie->states[state].trans.list) {
1918                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1919                     U16 maxid=minid;
1920                     U16 idx;
1921
1922                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1923                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1924                         if ( forid < minid ) {
1925                             minid=forid;
1926                         } else if ( forid > maxid ) {
1927                             maxid=forid;
1928                         }
1929                     }
1930                     if ( transcount < tp + maxid - minid + 1) {
1931                         transcount *= 2;
1932                         trie->trans = (reg_trie_trans *)
1933                             PerlMemShared_realloc( trie->trans,
1934                                                      transcount
1935                                                      * sizeof(reg_trie_trans) );
1936                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1937                     }
1938                     base = trie->uniquecharcount + tp - minid;
1939                     if ( maxid == minid ) {
1940                         U32 set = 0;
1941                         for ( ; zp < tp ; zp++ ) {
1942                             if ( ! trie->trans[ zp ].next ) {
1943                                 base = trie->uniquecharcount + zp - minid;
1944                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1945                                 trie->trans[ zp ].check = state;
1946                                 set = 1;
1947                                 break;
1948                             }
1949                         }
1950                         if ( !set ) {
1951                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1952                             trie->trans[ tp ].check = state;
1953                             tp++;
1954                             zp = tp;
1955                         }
1956                     } else {
1957                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1958                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1959                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1960                             trie->trans[ tid ].check = state;
1961                         }
1962                         tp += ( maxid - minid + 1 );
1963                     }
1964                     Safefree(trie->states[ state ].trans.list);
1965                 }
1966                 /*
1967                 DEBUG_TRIE_COMPILE_MORE_r(
1968                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1969                 );
1970                 */
1971                 trie->states[ state ].trans.base=base;
1972             }
1973             trie->lasttrans = tp + 1;
1974         }
1975     } else {
1976         /*
1977            Second Pass -- Flat Table Representation.
1978
1979            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1980            We know that we will need Charcount+1 trans at most to store the data
1981            (one row per char at worst case) So we preallocate both structures
1982            assuming worst case.
1983
1984            We then construct the trie using only the .next slots of the entry
1985            structs.
1986
1987            We use the .check field of the first entry of the node temporarily to
1988            make compression both faster and easier by keeping track of how many non
1989            zero fields are in the node.
1990
1991            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1992            transition.
1993
1994            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1995            number representing the first entry of the node, and state as a
1996            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1997            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1998            are 2 entrys per node. eg:
1999
2000              A B       A B
2001           1. 2 4    1. 3 7
2002           2. 0 3    3. 0 5
2003           3. 0 0    5. 0 0
2004           4. 0 0    7. 0 0
2005
2006            The table is internally in the right hand, idx form. However as we also
2007            have to deal with the states array which is indexed by nodenum we have to
2008            use TRIE_NODENUM() to convert.
2009
2010         */
2011         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2012             "%*sCompiling trie using table compiler\n",
2013             (int)depth * 2 + 2, ""));
2014
2015         trie->trans = (reg_trie_trans *)
2016             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2017                                   * trie->uniquecharcount + 1,
2018                                   sizeof(reg_trie_trans) );
2019         trie->states = (reg_trie_state *)
2020             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2021                                   sizeof(reg_trie_state) );
2022         next_alloc = trie->uniquecharcount + 1;
2023
2024
2025         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2026
2027             regnode *noper   = NEXTOPER( cur );
2028             const U8 *uc     = (U8*)STRING( noper );
2029             const U8 *e      = uc + STR_LEN( noper );
2030
2031             U32 state        = 1;         /* required init */
2032
2033             U16 charid       = 0;         /* sanity init */
2034             U32 accept_state = 0;         /* sanity init */
2035             U8 *scan         = (U8*)NULL; /* sanity init */
2036
2037             STRLEN foldlen   = 0;         /* required init */
2038             U32 wordlen      = 0;         /* required init */
2039             STRLEN skiplen   = 0;
2040             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2041
2042             if (OP(noper) == NOTHING) {
2043                 regnode *noper_next= regnext(noper);
2044                 if (noper_next != tail && OP(noper_next) == flags) {
2045                     noper = noper_next;
2046                     uc= (U8*)STRING(noper);
2047                     e= uc + STR_LEN(noper);
2048                 }
2049             }
2050
2051             if ( OP(noper) != NOTHING ) {
2052                 for ( ; uc < e ; uc += len ) {
2053
2054                     TRIE_READ_CHAR;
2055
2056                     if ( uvc < 256 ) {
2057                         charid = trie->charmap[ uvc ];
2058                     } else {
2059                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2060                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2061                     }
2062                     if ( charid ) {
2063                         charid--;
2064                         if ( !trie->trans[ state + charid ].next ) {
2065                             trie->trans[ state + charid ].next = next_alloc;
2066                             trie->trans[ state ].check++;
2067                             prev_states[TRIE_NODENUM(next_alloc)]
2068                                     = TRIE_NODENUM(state);
2069                             next_alloc += trie->uniquecharcount;
2070                         }
2071                         state = trie->trans[ state + charid ].next;
2072                     } else {
2073                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2074                     }
2075                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2076                 }
2077             }
2078             accept_state = TRIE_NODENUM( state );
2079             TRIE_HANDLE_WORD(accept_state);
2080
2081         } /* end second pass */
2082
2083         /* and now dump it out before we compress it */
2084         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2085                                                           revcharmap,
2086                                                           next_alloc, depth+1));
2087
2088         {
2089         /*
2090            * Inplace compress the table.*
2091
2092            For sparse data sets the table constructed by the trie algorithm will
2093            be mostly 0/FAIL transitions or to put it another way mostly empty.
2094            (Note that leaf nodes will not contain any transitions.)
2095
2096            This algorithm compresses the tables by eliminating most such
2097            transitions, at the cost of a modest bit of extra work during lookup:
2098
2099            - Each states[] entry contains a .base field which indicates the
2100            index in the state[] array wheres its transition data is stored.
2101
2102            - If .base is 0 there are no valid transitions from that node.
2103
2104            - If .base is nonzero then charid is added to it to find an entry in
2105            the trans array.
2106
2107            -If trans[states[state].base+charid].check!=state then the
2108            transition is taken to be a 0/Fail transition. Thus if there are fail
2109            transitions at the front of the node then the .base offset will point
2110            somewhere inside the previous nodes data (or maybe even into a node
2111            even earlier), but the .check field determines if the transition is
2112            valid.
2113
2114            XXX - wrong maybe?
2115            The following process inplace converts the table to the compressed
2116            table: We first do not compress the root node 1,and mark all its
2117            .check pointers as 1 and set its .base pointer as 1 as well. This
2118            allows us to do a DFA construction from the compressed table later,
2119            and ensures that any .base pointers we calculate later are greater
2120            than 0.
2121
2122            - We set 'pos' to indicate the first entry of the second node.
2123
2124            - We then iterate over the columns of the node, finding the first and
2125            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2126            and set the .check pointers accordingly, and advance pos
2127            appropriately and repreat for the next node. Note that when we copy
2128            the next pointers we have to convert them from the original
2129            NODEIDX form to NODENUM form as the former is not valid post
2130            compression.
2131
2132            - If a node has no transitions used we mark its base as 0 and do not
2133            advance the pos pointer.
2134
2135            - If a node only has one transition we use a second pointer into the
2136            structure to fill in allocated fail transitions from other states.
2137            This pointer is independent of the main pointer and scans forward
2138            looking for null transitions that are allocated to a state. When it
2139            finds one it writes the single transition into the "hole".  If the
2140            pointer doesnt find one the single transition is appended as normal.
2141
2142            - Once compressed we can Renew/realloc the structures to release the
2143            excess space.
2144
2145            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2146            specifically Fig 3.47 and the associated pseudocode.
2147
2148            demq
2149         */
2150         const U32 laststate = TRIE_NODENUM( next_alloc );
2151         U32 state, charid;
2152         U32 pos = 0, zp=0;
2153         trie->statecount = laststate;
2154
2155         for ( state = 1 ; state < laststate ; state++ ) {
2156             U8 flag = 0;
2157             const U32 stateidx = TRIE_NODEIDX( state );
2158             const U32 o_used = trie->trans[ stateidx ].check;
2159             U32 used = trie->trans[ stateidx ].check;
2160             trie->trans[ stateidx ].check = 0;
2161
2162             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2163                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2164                     if ( trie->trans[ stateidx + charid ].next ) {
2165                         if (o_used == 1) {
2166                             for ( ; zp < pos ; zp++ ) {
2167                                 if ( ! trie->trans[ zp ].next ) {
2168                                     break;
2169                                 }
2170                             }
2171                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2172                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2173                             trie->trans[ zp ].check = state;
2174                             if ( ++zp > pos ) pos = zp;
2175                             break;
2176                         }
2177                         used--;
2178                     }
2179                     if ( !flag ) {
2180                         flag = 1;
2181                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2182                     }
2183                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2184                     trie->trans[ pos ].check = state;
2185                     pos++;
2186                 }
2187             }
2188         }
2189         trie->lasttrans = pos + 1;
2190         trie->states = (reg_trie_state *)
2191             PerlMemShared_realloc( trie->states, laststate
2192                                    * sizeof(reg_trie_state) );
2193         DEBUG_TRIE_COMPILE_MORE_r(
2194                 PerlIO_printf( Perl_debug_log,
2195                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2196                     (int)depth * 2 + 2,"",
2197                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2198                     (IV)next_alloc,
2199                     (IV)pos,
2200                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2201             );
2202
2203         } /* end table compress */
2204     }
2205     DEBUG_TRIE_COMPILE_MORE_r(
2206             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2207                 (int)depth * 2 + 2, "",
2208                 (UV)trie->statecount,
2209                 (UV)trie->lasttrans)
2210     );
2211     /* resize the trans array to remove unused space */
2212     trie->trans = (reg_trie_trans *)
2213         PerlMemShared_realloc( trie->trans, trie->lasttrans
2214                                * sizeof(reg_trie_trans) );
2215
2216     {   /* Modify the program and insert the new TRIE node */
2217         U8 nodetype =(U8)(flags & 0xFF);
2218         char *str=NULL;
2219
2220 #ifdef DEBUGGING
2221         regnode *optimize = NULL;
2222 #ifdef RE_TRACK_PATTERN_OFFSETS
2223
2224         U32 mjd_offset = 0;
2225         U32 mjd_nodelen = 0;
2226 #endif /* RE_TRACK_PATTERN_OFFSETS */
2227 #endif /* DEBUGGING */
2228         /*
2229            This means we convert either the first branch or the first Exact,
2230            depending on whether the thing following (in 'last') is a branch
2231            or not and whther first is the startbranch (ie is it a sub part of
2232            the alternation or is it the whole thing.)
2233            Assuming its a sub part we convert the EXACT otherwise we convert
2234            the whole branch sequence, including the first.
2235          */
2236         /* Find the node we are going to overwrite */
2237         if ( first != startbranch || OP( last ) == BRANCH ) {
2238             /* branch sub-chain */
2239             NEXT_OFF( first ) = (U16)(last - first);
2240 #ifdef RE_TRACK_PATTERN_OFFSETS
2241             DEBUG_r({
2242                 mjd_offset= Node_Offset((convert));
2243                 mjd_nodelen= Node_Length((convert));
2244             });
2245 #endif
2246             /* whole branch chain */
2247         }
2248 #ifdef RE_TRACK_PATTERN_OFFSETS
2249         else {
2250             DEBUG_r({
2251                 const  regnode *nop = NEXTOPER( convert );
2252                 mjd_offset= Node_Offset((nop));
2253                 mjd_nodelen= Node_Length((nop));
2254             });
2255         }
2256         DEBUG_OPTIMISE_r(
2257             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2258                 (int)depth * 2 + 2, "",
2259                 (UV)mjd_offset, (UV)mjd_nodelen)
2260         );
2261 #endif
2262         /* But first we check to see if there is a common prefix we can
2263            split out as an EXACT and put in front of the TRIE node.  */
2264         trie->startstate= 1;
2265         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2266             U32 state;
2267             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2268                 U32 ofs = 0;
2269                 I32 idx = -1;
2270                 U32 count = 0;
2271                 const U32 base = trie->states[ state ].trans.base;
2272
2273                 if ( trie->states[state].wordnum )
2274                         count = 1;
2275
2276                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2277                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2278                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2279                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2280                     {
2281                         if ( ++count > 1 ) {
2282                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2283                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2284                             if ( state == 1 ) break;
2285                             if ( count == 2 ) {
2286                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2287                                 DEBUG_OPTIMISE_r(
2288                                     PerlIO_printf(Perl_debug_log,
2289                                         "%*sNew Start State=%"UVuf" Class: [",
2290                                         (int)depth * 2 + 2, "",
2291                                         (UV)state));
2292                                 if (idx >= 0) {
2293                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2294                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2295
2296                                     TRIE_BITMAP_SET(trie,*ch);
2297                                     if ( folder )
2298                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2299                                     DEBUG_OPTIMISE_r(
2300                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2301                                     );
2302                                 }
2303                             }
2304                             TRIE_BITMAP_SET(trie,*ch);
2305                             if ( folder )
2306                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2307                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2308                         }
2309                         idx = ofs;
2310                     }
2311                 }
2312                 if ( count == 1 ) {
2313                     SV **tmp = av_fetch( revcharmap, idx, 0);
2314                     STRLEN len;
2315                     char *ch = SvPV( *tmp, len );
2316                     DEBUG_OPTIMISE_r({
2317                         SV *sv=sv_newmortal();
2318                         PerlIO_printf( Perl_debug_log,
2319                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2320                             (int)depth * 2 + 2, "",
2321                             (UV)state, (UV)idx,
2322                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2323                                 PL_colors[0], PL_colors[1],
2324                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2325                                 PERL_PV_ESCAPE_FIRSTCHAR
2326                             )
2327                         );
2328                     });
2329                     if ( state==1 ) {
2330                         OP( convert ) = nodetype;
2331                         str=STRING(convert);
2332                         STR_LEN(convert)=0;
2333                     }
2334                     STR_LEN(convert) += len;
2335                     while (len--)
2336                         *str++ = *ch++;
2337                 } else {
2338 #ifdef DEBUGGING
2339                     if (state>1)
2340                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2341 #endif
2342                     break;
2343                 }
2344             }
2345             trie->prefixlen = (state-1);
2346             if (str) {
2347                 regnode *n = convert+NODE_SZ_STR(convert);
2348                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2349                 trie->startstate = state;
2350                 trie->minlen -= (state - 1);
2351                 trie->maxlen -= (state - 1);
2352 #ifdef DEBUGGING
2353                /* At least the UNICOS C compiler choked on this
2354                 * being argument to DEBUG_r(), so let's just have
2355                 * it right here. */
2356                if (
2357 #ifdef PERL_EXT_RE_BUILD
2358                    1
2359 #else
2360                    DEBUG_r_TEST
2361 #endif
2362                    ) {
2363                    regnode *fix = convert;
2364                    U32 word = trie->wordcount;
2365                    mjd_nodelen++;
2366                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2367                    while( ++fix < n ) {
2368                        Set_Node_Offset_Length(fix, 0, 0);
2369                    }
2370                    while (word--) {
2371                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2372                        if (tmp) {
2373                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2374                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2375                            else
2376                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2377                        }
2378                    }
2379                }
2380 #endif
2381                 if (trie->maxlen) {
2382                     convert = n;
2383                 } else {
2384                     NEXT_OFF(convert) = (U16)(tail - convert);
2385                     DEBUG_r(optimize= n);
2386                 }
2387             }
2388         }
2389         if (!jumper)
2390             jumper = last;
2391         if ( trie->maxlen ) {
2392             NEXT_OFF( convert ) = (U16)(tail - convert);
2393             ARG_SET( convert, data_slot );
2394             /* Store the offset to the first unabsorbed branch in
2395                jump[0], which is otherwise unused by the jump logic.
2396                We use this when dumping a trie and during optimisation. */
2397             if (trie->jump)
2398                 trie->jump[0] = (U16)(nextbranch - convert);
2399
2400             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2401              *   and there is a bitmap
2402              *   and the first "jump target" node we found leaves enough room
2403              * then convert the TRIE node into a TRIEC node, with the bitmap
2404              * embedded inline in the opcode - this is hypothetically faster.
2405              */
2406             if ( !trie->states[trie->startstate].wordnum
2407                  && trie->bitmap
2408                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2409             {
2410                 OP( convert ) = TRIEC;
2411                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2412                 PerlMemShared_free(trie->bitmap);
2413                 trie->bitmap= NULL;
2414             } else
2415                 OP( convert ) = TRIE;
2416
2417             /* store the type in the flags */
2418             convert->flags = nodetype;
2419             DEBUG_r({
2420             optimize = convert
2421                       + NODE_STEP_REGNODE
2422                       + regarglen[ OP( convert ) ];
2423             });
2424             /* XXX We really should free up the resource in trie now,
2425                    as we won't use them - (which resources?) dmq */
2426         }
2427         /* needed for dumping*/
2428         DEBUG_r(if (optimize) {
2429             regnode *opt = convert;
2430
2431             while ( ++opt < optimize) {
2432                 Set_Node_Offset_Length(opt,0,0);
2433             }
2434             /*
2435                 Try to clean up some of the debris left after the
2436                 optimisation.
2437              */
2438             while( optimize < jumper ) {
2439                 mjd_nodelen += Node_Length((optimize));
2440                 OP( optimize ) = OPTIMIZED;
2441                 Set_Node_Offset_Length(optimize,0,0);
2442                 optimize++;
2443             }
2444             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2445         });
2446     } /* end node insert */
2447
2448     /*  Finish populating the prev field of the wordinfo array.  Walk back
2449      *  from each accept state until we find another accept state, and if
2450      *  so, point the first word's .prev field at the second word. If the
2451      *  second already has a .prev field set, stop now. This will be the
2452      *  case either if we've already processed that word's accept state,
2453      *  or that state had multiple words, and the overspill words were
2454      *  already linked up earlier.
2455      */
2456     {
2457         U16 word;
2458         U32 state;
2459         U16 prev;
2460
2461         for (word=1; word <= trie->wordcount; word++) {
2462             prev = 0;
2463             if (trie->wordinfo[word].prev)
2464                 continue;
2465             state = trie->wordinfo[word].accept;
2466             while (state) {
2467                 state = prev_states[state];
2468                 if (!state)
2469                     break;
2470                 prev = trie->states[state].wordnum;
2471                 if (prev)
2472                     break;
2473             }
2474             trie->wordinfo[word].prev = prev;
2475         }
2476         Safefree(prev_states);
2477     }
2478
2479
2480     /* and now dump out the compressed format */
2481     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2482
2483     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2484 #ifdef DEBUGGING
2485     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2486     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2487 #else
2488     SvREFCNT_dec_NN(revcharmap);
2489 #endif
2490     return trie->jump
2491            ? MADE_JUMP_TRIE
2492            : trie->startstate>1
2493              ? MADE_EXACT_TRIE
2494              : MADE_TRIE;
2495 }
2496
2497 STATIC void
2498 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2499 {
2500 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2501
2502    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2503    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2504    ISBN 0-201-10088-6
2505
2506    We find the fail state for each state in the trie, this state is the longest proper
2507    suffix of the current state's 'word' that is also a proper prefix of another word in our
2508    trie. State 1 represents the word '' and is thus the default fail state. This allows
2509    the DFA not to have to restart after its tried and failed a word at a given point, it
2510    simply continues as though it had been matching the other word in the first place.
2511    Consider
2512       'abcdgu'=~/abcdefg|cdgu/
2513    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2514    fail, which would bring us to the state representing 'd' in the second word where we would
2515    try 'g' and succeed, proceeding to match 'cdgu'.
2516  */
2517  /* add a fail transition */
2518     const U32 trie_offset = ARG(source);
2519     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2520     U32 *q;
2521     const U32 ucharcount = trie->uniquecharcount;
2522     const U32 numstates = trie->statecount;
2523     const U32 ubound = trie->lasttrans + ucharcount;
2524     U32 q_read = 0;
2525     U32 q_write = 0;
2526     U32 charid;
2527     U32 base = trie->states[ 1 ].trans.base;
2528     U32 *fail;
2529     reg_ac_data *aho;
2530     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2531     GET_RE_DEBUG_FLAGS_DECL;
2532
2533     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2534 #ifndef DEBUGGING
2535     PERL_UNUSED_ARG(depth);
2536 #endif
2537
2538
2539     ARG_SET( stclass, data_slot );
2540     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2541     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2542     aho->trie=trie_offset;
2543     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2544     Copy( trie->states, aho->states, numstates, reg_trie_state );
2545     Newxz( q, numstates, U32);
2546     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2547     aho->refcount = 1;
2548     fail = aho->fail;
2549     /* initialize fail[0..1] to be 1 so that we always have
2550        a valid final fail state */
2551     fail[ 0 ] = fail[ 1 ] = 1;
2552
2553     for ( charid = 0; charid < ucharcount ; charid++ ) {
2554         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2555         if ( newstate ) {
2556             q[ q_write ] = newstate;
2557             /* set to point at the root */
2558             fail[ q[ q_write++ ] ]=1;
2559         }
2560     }
2561     while ( q_read < q_write) {
2562         const U32 cur = q[ q_read++ % numstates ];
2563         base = trie->states[ cur ].trans.base;
2564
2565         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2566             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2567             if (ch_state) {
2568                 U32 fail_state = cur;
2569                 U32 fail_base;
2570                 do {
2571                     fail_state = fail[ fail_state ];
2572                     fail_base = aho->states[ fail_state ].trans.base;
2573                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2574
2575                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2576                 fail[ ch_state ] = fail_state;
2577                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2578                 {
2579                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2580                 }
2581                 q[ q_write++ % numstates] = ch_state;
2582             }
2583         }
2584     }
2585     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2586        when we fail in state 1, this allows us to use the
2587        charclass scan to find a valid start char. This is based on the principle
2588        that theres a good chance the string being searched contains lots of stuff
2589        that cant be a start char.
2590      */
2591     fail[ 0 ] = fail[ 1 ] = 0;
2592     DEBUG_TRIE_COMPILE_r({
2593         PerlIO_printf(Perl_debug_log,
2594                       "%*sStclass Failtable (%"UVuf" states): 0",
2595                       (int)(depth * 2), "", (UV)numstates
2596         );
2597         for( q_read=1; q_read<numstates; q_read++ ) {
2598             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2599         }
2600         PerlIO_printf(Perl_debug_log, "\n");
2601     });
2602     Safefree(q);
2603     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2604 }
2605
2606
2607 /*
2608  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2609  * These need to be revisited when a newer toolchain becomes available.
2610  */
2611 #if defined(__sparc64__) && defined(__GNUC__)
2612 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2613 #       undef  SPARC64_GCC_WORKAROUND
2614 #       define SPARC64_GCC_WORKAROUND 1
2615 #   endif
2616 #endif
2617
2618 #define DEBUG_PEEP(str,scan,depth) \
2619     DEBUG_OPTIMISE_r({if (scan){ \
2620        SV * const mysv=sv_newmortal(); \
2621        regnode *Next = regnext(scan); \
2622        regprop(RExC_rx, mysv, scan); \
2623        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2624        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2625        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2626    }});
2627
2628
2629 /* The below joins as many adjacent EXACTish nodes as possible into a single
2630  * one.  The regop may be changed if the node(s) contain certain sequences that
2631  * require special handling.  The joining is only done if:
2632  * 1) there is room in the current conglomerated node to entirely contain the
2633  *    next one.
2634  * 2) they are the exact same node type
2635  *
2636  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2637  * these get optimized out
2638  *
2639  * If a node is to match under /i (folded), the number of characters it matches
2640  * can be different than its character length if it contains a multi-character
2641  * fold.  *min_subtract is set to the total delta of the input nodes.
2642  *
2643  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2644  * and contains LATIN SMALL LETTER SHARP S
2645  *
2646  * This is as good a place as any to discuss the design of handling these
2647  * multi-character fold sequences.  It's been wrong in Perl for a very long
2648  * time.  There are three code points in Unicode whose multi-character folds
2649  * were long ago discovered to mess things up.  The previous designs for
2650  * dealing with these involved assigning a special node for them.  This
2651  * approach doesn't work, as evidenced by this example:
2652  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2653  * Both these fold to "sss", but if the pattern is parsed to create a node that
2654  * would match just the \xDF, it won't be able to handle the case where a
2655  * successful match would have to cross the node's boundary.  The new approach
2656  * that hopefully generally solves the problem generates an EXACTFU_SS node
2657  * that is "sss".
2658  *
2659  * It turns out that there are problems with all multi-character folds, and not
2660  * just these three.  Now the code is general, for all such cases, but the
2661  * three still have some special handling.  The approach taken is:
2662  * 1)   This routine examines each EXACTFish node that could contain multi-
2663  *      character fold sequences.  It returns in *min_subtract how much to
2664  *      subtract from the the actual length of the string to get a real minimum
2665  *      match length; it is 0 if there are no multi-char folds.  This delta is
2666  *      used by the caller to adjust the min length of the match, and the delta
2667  *      between min and max, so that the optimizer doesn't reject these
2668  *      possibilities based on size constraints.
2669  * 2)   Certain of these sequences require special handling by the trie code,
2670  *      so, if found, this code changes the joined node type to special ops:
2671  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2672  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2673  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2674  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2675  *      there is a possible fold length change.  That means that a regular
2676  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2677  *      with length changes, and so can be processed faster.  regexec.c takes
2678  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2679  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2680  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2681  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2682  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2683  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2684  *      possibilities for the non-UTF8 patterns are quite simple, except for
2685  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2686  *      members of a fold-pair, and arrays are set up for all of them so that
2687  *      the other member of the pair can be found quickly.  Code elsewhere in
2688  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2689  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2690  *      described in the next item.
2691  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2692  *      'ss' or not is not knowable at compile time.  It will match iff the
2693  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2694  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2695  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2696  *      described in item 3).  An assumption that the optimizer part of
2697  *      regexec.c (probably unwittingly) makes is that a character in the
2698  *      pattern corresponds to at most a single character in the target string.
2699  *      (And I do mean character, and not byte here, unlike other parts of the
2700  *      documentation that have never been updated to account for multibyte
2701  *      Unicode.)  This assumption is wrong only in this case, as all other
2702  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2703  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2704  *      reluctant to try to change this assumption, so instead the code punts.
2705  *      This routine examines EXACTF nodes for the sharp s, and returns a
2706  *      boolean indicating whether or not the node is an EXACTF node that
2707  *      contains a sharp s.  When it is true, the caller sets a flag that later
2708  *      causes the optimizer in this file to not set values for the floating
2709  *      and fixed string lengths, and thus avoids the optimizer code in
2710  *      regexec.c that makes the invalid assumption.  Thus, there is no
2711  *      optimization based on string lengths for EXACTF nodes that contain the
2712  *      sharp s.  This only happens for /id rules (which means the pattern
2713  *      isn't in UTF-8).
2714  */
2715
2716 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2717     if (PL_regkind[OP(scan)] == EXACT) \
2718         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2719
2720 STATIC U32
2721 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2722     /* Merge several consecutive EXACTish nodes into one. */
2723     regnode *n = regnext(scan);
2724     U32 stringok = 1;
2725     regnode *next = scan + NODE_SZ_STR(scan);
2726     U32 merged = 0;
2727     U32 stopnow = 0;
2728 #ifdef DEBUGGING
2729     regnode *stop = scan;
2730     GET_RE_DEBUG_FLAGS_DECL;
2731 #else
2732     PERL_UNUSED_ARG(depth);
2733 #endif
2734
2735     PERL_ARGS_ASSERT_JOIN_EXACT;
2736 #ifndef EXPERIMENTAL_INPLACESCAN
2737     PERL_UNUSED_ARG(flags);
2738     PERL_UNUSED_ARG(val);
2739 #endif
2740     DEBUG_PEEP("join",scan,depth);
2741
2742     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2743      * EXACT ones that are mergeable to the current one. */
2744     while (n
2745            && (PL_regkind[OP(n)] == NOTHING
2746                || (stringok && OP(n) == OP(scan)))
2747            && NEXT_OFF(n)
2748            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2749     {
2750
2751         if (OP(n) == TAIL || n > next)
2752             stringok = 0;
2753         if (PL_regkind[OP(n)] == NOTHING) {
2754             DEBUG_PEEP("skip:",n,depth);
2755             NEXT_OFF(scan) += NEXT_OFF(n);
2756             next = n + NODE_STEP_REGNODE;
2757 #ifdef DEBUGGING
2758             if (stringok)
2759                 stop = n;
2760 #endif
2761             n = regnext(n);
2762         }
2763         else if (stringok) {
2764             const unsigned int oldl = STR_LEN(scan);
2765             regnode * const nnext = regnext(n);
2766
2767             /* XXX I (khw) kind of doubt that this works on platforms where
2768              * U8_MAX is above 255 because of lots of other assumptions */
2769             /* Don't join if the sum can't fit into a single node */
2770             if (oldl + STR_LEN(n) > U8_MAX)
2771                 break;
2772
2773             DEBUG_PEEP("merg",n,depth);
2774             merged++;
2775
2776             NEXT_OFF(scan) += NEXT_OFF(n);
2777             STR_LEN(scan) += STR_LEN(n);
2778             next = n + NODE_SZ_STR(n);
2779             /* Now we can overwrite *n : */
2780             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2781 #ifdef DEBUGGING
2782             stop = next - 1;
2783 #endif
2784             n = nnext;
2785             if (stopnow) break;
2786         }
2787
2788 #ifdef EXPERIMENTAL_INPLACESCAN
2789         if (flags && !NEXT_OFF(n)) {
2790             DEBUG_PEEP("atch", val, depth);
2791             if (reg_off_by_arg[OP(n)]) {
2792                 ARG_SET(n, val - n);
2793             }
2794             else {
2795                 NEXT_OFF(n) = val - n;
2796             }
2797             stopnow = 1;
2798         }
2799 #endif
2800     }
2801
2802     *min_subtract = 0;
2803     *has_exactf_sharp_s = FALSE;
2804
2805     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2806      * can now analyze for sequences of problematic code points.  (Prior to
2807      * this final joining, sequences could have been split over boundaries, and
2808      * hence missed).  The sequences only happen in folding, hence for any
2809      * non-EXACT EXACTish node */
2810     if (OP(scan) != EXACT) {
2811         const U8 * const s0 = (U8*) STRING(scan);
2812         const U8 * s = s0;
2813         const U8 * const s_end = s0 + STR_LEN(scan);
2814
2815         /* One pass is made over the node's string looking for all the
2816          * possibilities.  to avoid some tests in the loop, there are two main
2817          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2818          * non-UTF-8 */
2819         if (UTF) {
2820
2821             /* Examine the string for a multi-character fold sequence.  UTF-8
2822              * patterns have all characters pre-folded by the time this code is
2823              * executed */
2824             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2825                                      length sequence we are looking for is 2 */
2826             {
2827                 int count = 0;
2828                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2829                 if (! len) {    /* Not a multi-char fold: get next char */
2830                     s += UTF8SKIP(s);
2831                     continue;
2832                 }
2833
2834                 /* Nodes with 'ss' require special handling, except for EXACTFL
2835                  * and EXACTFA for which there is no multi-char fold to this */
2836                 if (len == 2 && *s == 's' && *(s+1) == 's'
2837                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2838                 {
2839                     count = 2;
2840                     OP(scan) = EXACTFU_SS;
2841                     s += 2;
2842                 }
2843                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2844                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2845                                       COMBINING_DIAERESIS_UTF8
2846                                       COMBINING_ACUTE_ACCENT_UTF8,
2847                                    6)
2848                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2849                                          COMBINING_DIAERESIS_UTF8
2850                                          COMBINING_ACUTE_ACCENT_UTF8,
2851                                      6)))
2852                 {
2853                     count = 3;
2854
2855                     /* These two folds require special handling by trie's, so
2856                      * change the node type to indicate this.  If EXACTFA and
2857                      * EXACTFL were ever to be handled by trie's, this would
2858                      * have to be changed.  If this node has already been
2859                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2860                      * (khw) think it doesn't matter in regexec.c for UTF
2861                      * patterns, but no need to change it */
2862                     if (OP(scan) == EXACTFU) {
2863                         OP(scan) = EXACTFU_TRICKYFOLD;
2864                     }
2865                     s += 6;
2866                 }
2867                 else { /* Here is a generic multi-char fold. */
2868                     const U8* multi_end  = s + len;
2869
2870                     /* Count how many characters in it.  In the case of /l and
2871                      * /aa, no folds which contain ASCII code points are
2872                      * allowed, so check for those, and skip if found.  (In
2873                      * EXACTFL, no folds are allowed to any Latin1 code point,
2874                      * not just ASCII.  But there aren't any of these
2875                      * currently, nor ever likely, so don't take the time to
2876                      * test for them.  The code that generates the
2877                      * is_MULTI_foo() macros croaks should one actually get put
2878                      * into Unicode .) */
2879                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2880                         count = utf8_length(s, multi_end);
2881                         s = multi_end;
2882                     }
2883                     else {
2884                         while (s < multi_end) {
2885                             if (isASCII(*s)) {
2886                                 s++;
2887                                 goto next_iteration;
2888                             }
2889                             else {
2890                                 s += UTF8SKIP(s);
2891                             }
2892                             count++;
2893                         }
2894                     }
2895                 }
2896
2897                 /* The delta is how long the sequence is minus 1 (1 is how long
2898                  * the character that folds to the sequence is) */
2899                 *min_subtract += count - 1;
2900             next_iteration: ;
2901             }
2902         }
2903         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2904
2905             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2906              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2907              * nodes can't have multi-char folds to this range (and there are
2908              * no existing ones in the upper latin1 range).  In the EXACTF
2909              * case we look also for the sharp s, which can be in the final
2910              * position.  Otherwise we can stop looking 1 byte earlier because
2911              * have to find at least two characters for a multi-fold */
2912             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2913
2914             /* The below is perhaps overboard, but this allows us to save a
2915              * test each time through the loop at the expense of a mask.  This
2916              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2917              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2918              * are 64.  This uses an exclusive 'or' to find that bit and then
2919              * inverts it to form a mask, with just a single 0, in the bit
2920              * position where 'S' and 's' differ. */
2921             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2922             const U8 s_masked = 's' & S_or_s_mask;
2923
2924             while (s < upper) {
2925                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2926                 if (! len) {    /* Not a multi-char fold. */
2927                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2928                     {
2929                         *has_exactf_sharp_s = TRUE;
2930                     }
2931                     s++;
2932                     continue;
2933                 }
2934
2935                 if (len == 2
2936                     && ((*s & S_or_s_mask) == s_masked)
2937                     && ((*(s+1) & S_or_s_mask) == s_masked))
2938                 {
2939
2940                     /* EXACTF nodes need to know that the minimum length
2941                      * changed so that a sharp s in the string can match this
2942                      * ss in the pattern, but they remain EXACTF nodes, as they
2943                      * won't match this unless the target string is is UTF-8,
2944                      * which we don't know until runtime */
2945                     if (OP(scan) != EXACTF) {
2946                         OP(scan) = EXACTFU_SS;
2947                     }
2948                 }
2949
2950                 *min_subtract += len - 1;
2951                 s += len;
2952             }
2953         }
2954     }
2955
2956 #ifdef DEBUGGING
2957     /* Allow dumping but overwriting the collection of skipped
2958      * ops and/or strings with fake optimized ops */
2959     n = scan + NODE_SZ_STR(scan);
2960     while (n <= stop) {
2961         OP(n) = OPTIMIZED;
2962         FLAGS(n) = 0;
2963         NEXT_OFF(n) = 0;
2964         n++;
2965     }
2966 #endif
2967     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2968     return stopnow;
2969 }
2970
2971 /* REx optimizer.  Converts nodes into quicker variants "in place".
2972    Finds fixed substrings.  */
2973
2974 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2975    to the position after last scanned or to NULL. */
2976
2977 #define INIT_AND_WITHP \
2978     assert(!and_withp); \
2979     Newx(and_withp,1,struct regnode_charclass_class); \
2980     SAVEFREEPV(and_withp)
2981
2982 /* this is a chain of data about sub patterns we are processing that
2983    need to be handled separately/specially in study_chunk. Its so
2984    we can simulate recursion without losing state.  */
2985 struct scan_frame;
2986 typedef struct scan_frame {
2987     regnode *last;  /* last node to process in this frame */
2988     regnode *next;  /* next node to process when last is reached */
2989     struct scan_frame *prev; /*previous frame*/
2990     I32 stop; /* what stopparen do we use */
2991 } scan_frame;
2992
2993
2994 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2995
2996 STATIC I32
2997 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2998                         I32 *minlenp, I32 *deltap,
2999                         regnode *last,
3000                         scan_data_t *data,
3001                         I32 stopparen,
3002                         U8* recursed,
3003                         struct regnode_charclass_class *and_withp,
3004                         U32 flags, U32 depth)
3005                         /* scanp: Start here (read-write). */
3006                         /* deltap: Write maxlen-minlen here. */
3007                         /* last: Stop before this one. */
3008                         /* data: string data about the pattern */
3009                         /* stopparen: treat close N as END */
3010                         /* recursed: which subroutines have we recursed into */
3011                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3012 {
3013     dVAR;
3014     I32 min = 0;    /* There must be at least this number of characters to match */
3015     I32 pars = 0, code;
3016     regnode *scan = *scanp, *next;
3017     I32 delta = 0;
3018     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3019     int is_inf_internal = 0;            /* The studied chunk is infinite */
3020     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3021     scan_data_t data_fake;
3022     SV *re_trie_maxbuff = NULL;
3023     regnode *first_non_open = scan;
3024     I32 stopmin = I32_MAX;
3025     scan_frame *frame = NULL;
3026     GET_RE_DEBUG_FLAGS_DECL;
3027
3028     PERL_ARGS_ASSERT_STUDY_CHUNK;
3029
3030 #ifdef DEBUGGING
3031     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3032 #endif
3033
3034     if ( depth == 0 ) {
3035         while (first_non_open && OP(first_non_open) == OPEN)
3036             first_non_open=regnext(first_non_open);
3037     }
3038
3039
3040   fake_study_recurse:
3041     while ( scan && OP(scan) != END && scan < last ){
3042         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3043                                    node length to get a real minimum (because
3044                                    the folded version may be shorter) */
3045         bool has_exactf_sharp_s = FALSE;
3046         /* Peephole optimizer: */
3047         DEBUG_STUDYDATA("Peep:", data,depth);
3048         DEBUG_PEEP("Peep",scan,depth);
3049
3050         /* Its not clear to khw or hv why this is done here, and not in the
3051          * clauses that deal with EXACT nodes.  khw's guess is that it's
3052          * because of a previous design */
3053         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3054
3055         /* Follow the next-chain of the current node and optimize
3056            away all the NOTHINGs from it.  */
3057         if (OP(scan) != CURLYX) {
3058             const int max = (reg_off_by_arg[OP(scan)]
3059                        ? I32_MAX
3060                        /* I32 may be smaller than U16 on CRAYs! */
3061                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3062             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3063             int noff;
3064             regnode *n = scan;
3065
3066             /* Skip NOTHING and LONGJMP. */
3067             while ((n = regnext(n))
3068                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3069                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3070                    && off + noff < max)
3071                 off += noff;
3072             if (reg_off_by_arg[OP(scan)])
3073                 ARG(scan) = off;
3074             else
3075                 NEXT_OFF(scan) = off;
3076         }
3077
3078
3079
3080         /* The principal pseudo-switch.  Cannot be a switch, since we
3081            look into several different things.  */
3082         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3083                    || OP(scan) == IFTHEN) {
3084             next = regnext(scan);
3085             code = OP(scan);
3086             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3087
3088             if (OP(next) == code || code == IFTHEN) {
3089                 /* NOTE - There is similar code to this block below for handling
3090                    TRIE nodes on a re-study.  If you change stuff here check there
3091                    too. */
3092                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3093                 struct regnode_charclass_class accum;
3094                 regnode * const startbranch=scan;
3095
3096                 if (flags & SCF_DO_SUBSTR)
3097                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3098                 if (flags & SCF_DO_STCLASS)
3099                     cl_init_zero(pRExC_state, &accum);
3100
3101                 while (OP(scan) == code) {
3102                     I32 deltanext, minnext, f = 0, fake;
3103                     struct regnode_charclass_class this_class;
3104
3105                     num++;
3106                     data_fake.flags = 0;
3107                     if (data) {
3108                         data_fake.whilem_c = data->whilem_c;
3109                         data_fake.last_closep = data->last_closep;
3110                     }
3111                     else
3112                         data_fake.last_closep = &fake;
3113
3114                     data_fake.pos_delta = delta;
3115                     next = regnext(scan);
3116                     scan = NEXTOPER(scan);
3117                     if (code != BRANCH)
3118                         scan = NEXTOPER(scan);
3119                     if (flags & SCF_DO_STCLASS) {
3120                         cl_init(pRExC_state, &this_class);
3121                         data_fake.start_class = &this_class;
3122                         f = SCF_DO_STCLASS_AND;
3123                     }
3124                     if (flags & SCF_WHILEM_VISITED_POS)
3125                         f |= SCF_WHILEM_VISITED_POS;
3126
3127                     /* we suppose the run is continuous, last=next...*/
3128                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3129                                           next, &data_fake,
3130                                           stopparen, recursed, NULL, f,depth+1);
3131                     if (min1 > minnext)
3132                         min1 = minnext;
3133                     if (deltanext == I32_MAX) {
3134                         is_inf = is_inf_internal = 1;
3135                         max1 = I32_MAX;
3136                     } else if (max1 < minnext + deltanext)
3137                         max1 = minnext + deltanext;
3138                     scan = next;
3139                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3140                         pars++;
3141                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3142                         if ( stopmin > minnext)
3143                             stopmin = min + min1;
3144                         flags &= ~SCF_DO_SUBSTR;
3145                         if (data)
3146                             data->flags |= SCF_SEEN_ACCEPT;
3147                     }
3148                     if (data) {
3149                         if (data_fake.flags & SF_HAS_EVAL)
3150                             data->flags |= SF_HAS_EVAL;
3151                         data->whilem_c = data_fake.whilem_c;
3152                     }
3153                     if (flags & SCF_DO_STCLASS)
3154                         cl_or(pRExC_state, &accum, &this_class);
3155                 }
3156                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3157                     min1 = 0;
3158                 if (flags & SCF_DO_SUBSTR) {
3159                     data->pos_min += min1;
3160                     if (data->pos_delta >= I32_MAX - (max1 - min1))
3161                         data->pos_delta = I32_MAX;
3162                     else
3163                         data->pos_delta += max1 - min1;
3164                     if (max1 != min1 || is_inf)
3165                         data->longest = &(data->longest_float);
3166                 }
3167                 min += min1;
3168                 if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
3169                     delta = I32_MAX;
3170                 else
3171                     delta += max1 - min1;
3172                 if (flags & SCF_DO_STCLASS_OR) {
3173                     cl_or(pRExC_state, data->start_class, &accum);
3174                     if (min1) {
3175                         cl_and(data->start_class, and_withp);
3176                         flags &= ~SCF_DO_STCLASS;
3177                     }
3178                 }
3179                 else if (flags & SCF_DO_STCLASS_AND) {
3180                     if (min1) {
3181                         cl_and(data->start_class, &accum);
3182                         flags &= ~SCF_DO_STCLASS;
3183                     }
3184                     else {
3185                         /* Switch to OR mode: cache the old value of
3186                          * data->start_class */
3187                         INIT_AND_WITHP;
3188                         StructCopy(data->start_class, and_withp,
3189                                    struct regnode_charclass_class);
3190                         flags &= ~SCF_DO_STCLASS_AND;
3191                         StructCopy(&accum, data->start_class,
3192                                    struct regnode_charclass_class);
3193                         flags |= SCF_DO_STCLASS_OR;
3194                         SET_SSC_EOS(data->start_class);
3195                     }
3196                 }
3197
3198                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3199                 /* demq.
3200
3201                    Assuming this was/is a branch we are dealing with: 'scan' now
3202                    points at the item that follows the branch sequence, whatever
3203                    it is. We now start at the beginning of the sequence and look
3204                    for subsequences of
3205
3206                    BRANCH->EXACT=>x1
3207                    BRANCH->EXACT=>x2
3208                    tail
3209
3210                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3211
3212                    If we can find such a subsequence we need to turn the first
3213                    element into a trie and then add the subsequent branch exact
3214                    strings to the trie.
3215
3216                    We have two cases
3217
3218                      1. patterns where the whole set of branches can be converted.
3219
3220                      2. patterns where only a subset can be converted.
3221
3222                    In case 1 we can replace the whole set with a single regop
3223                    for the trie. In case 2 we need to keep the start and end
3224                    branches so
3225
3226                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3227                      becomes BRANCH TRIE; BRANCH X;
3228
3229                   There is an additional case, that being where there is a
3230                   common prefix, which gets split out into an EXACT like node
3231                   preceding the TRIE node.
3232
3233                   If x(1..n)==tail then we can do a simple trie, if not we make
3234                   a "jump" trie, such that when we match the appropriate word
3235                   we "jump" to the appropriate tail node. Essentially we turn
3236                   a nested if into a case structure of sorts.
3237
3238                 */
3239
3240                     int made=0;
3241                     if (!re_trie_maxbuff) {
3242                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3243                         if (!SvIOK(re_trie_maxbuff))
3244                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3245                     }
3246                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3247                         regnode *cur;
3248                         regnode *first = (regnode *)NULL;
3249                         regnode *last = (regnode *)NULL;
3250                         regnode *tail = scan;
3251                         U8 trietype = 0;
3252                         U32 count=0;
3253
3254 #ifdef DEBUGGING
3255                         SV * const mysv = sv_newmortal();       /* for dumping */
3256 #endif
3257                         /* var tail is used because there may be a TAIL
3258                            regop in the way. Ie, the exacts will point to the
3259                            thing following the TAIL, but the last branch will
3260                            point at the TAIL. So we advance tail. If we
3261                            have nested (?:) we may have to move through several
3262                            tails.
3263                          */
3264
3265                         while ( OP( tail ) == TAIL ) {
3266                             /* this is the TAIL generated by (?:) */
3267                             tail = regnext( tail );
3268                         }
3269
3270
3271                         DEBUG_TRIE_COMPILE_r({
3272                             regprop(RExC_rx, mysv, tail );
3273                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3274                                 (int)depth * 2 + 2, "",
3275                                 "Looking for TRIE'able sequences. Tail node is: ",
3276                                 SvPV_nolen_const( mysv )
3277                             );
3278                         });
3279
3280                         /*
3281
3282                             Step through the branches
3283                                 cur represents each branch,
3284                                 noper is the first thing to be matched as part of that branch
3285                                 noper_next is the regnext() of that node.
3286
3287                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3288                             via a "jump trie" but we also support building with NOJUMPTRIE,
3289                             which restricts the trie logic to structures like /FOO|BAR/.
3290
3291                             If noper is a trieable nodetype then the branch is a possible optimization
3292                             target. If we are building under NOJUMPTRIE then we require that noper_next
3293                             is the same as scan (our current position in the regex program).
3294
3295                             Once we have two or more consecutive such branches we can create a
3296                             trie of the EXACT's contents and stitch it in place into the program.
3297
3298                             If the sequence represents all of the branches in the alternation we
3299                             replace the entire thing with a single TRIE node.
3300
3301                             Otherwise when it is a subsequence we need to stitch it in place and
3302                             replace only the relevant branches. This means the first branch has
3303                             to remain as it is used by the alternation logic, and its next pointer,
3304                             and needs to be repointed at the item on the branch chain following
3305                             the last branch we have optimized away.
3306
3307                             This could be either a BRANCH, in which case the subsequence is internal,
3308                             or it could be the item following the branch sequence in which case the
3309                             subsequence is at the end (which does not necessarily mean the first node
3310                             is the start of the alternation).
3311
3312                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3313
3314                                 optype          |  trietype
3315                                 ----------------+-----------
3316                                 NOTHING         | NOTHING
3317                                 EXACT           | EXACT
3318                                 EXACTFU         | EXACTFU
3319                                 EXACTFU_SS      | EXACTFU
3320                                 EXACTFU_TRICKYFOLD | EXACTFU
3321                                 EXACTFA         | 0
3322
3323
3324                         */
3325 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3326                        ( EXACT == (X) )   ? EXACT :        \
3327                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3328                        0 )
3329
3330                         /* dont use tail as the end marker for this traverse */
3331                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3332                             regnode * const noper = NEXTOPER( cur );
3333                             U8 noper_type = OP( noper );
3334                             U8 noper_trietype = TRIE_TYPE( noper_type );
3335 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3336                             regnode * const noper_next = regnext( noper );
3337                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3338                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3339 #endif
3340
3341                             DEBUG_TRIE_COMPILE_r({
3342                                 regprop(RExC_rx, mysv, cur);
3343                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3344                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3345
3346                                 regprop(RExC_rx, mysv, noper);
3347                                 PerlIO_printf( Perl_debug_log, " -> %s",
3348                                     SvPV_nolen_const(mysv));
3349
3350                                 if ( noper_next ) {
3351                                   regprop(RExC_rx, mysv, noper_next );
3352                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3353                                     SvPV_nolen_const(mysv));
3354                                 }
3355                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3356                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3357                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3358                                 );
3359                             });
3360
3361                             /* Is noper a trieable nodetype that can be merged with the
3362                              * current trie (if there is one)? */
3363                             if ( noper_trietype
3364                                   &&
3365                                   (
3366                                         ( noper_trietype == NOTHING)
3367                                         || ( trietype == NOTHING )
3368                                         || ( trietype == noper_trietype )
3369                                   )
3370 #ifdef NOJUMPTRIE
3371                                   && noper_next == tail
3372 #endif
3373                                   && count < U16_MAX)
3374                             {
3375                                 /* Handle mergable triable node
3376                                  * Either we are the first node in a new trieable sequence,
3377                                  * in which case we do some bookkeeping, otherwise we update
3378                                  * the end pointer. */
3379                                 if ( !first ) {
3380                                     first = cur;
3381                                     if ( noper_trietype == NOTHING ) {
3382 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3383                                         regnode * const noper_next = regnext( noper );
3384                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3385                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3386 #endif
3387
3388                                         if ( noper_next_trietype ) {
3389                                             trietype = noper_next_trietype;
3390                                         } else if (noper_next_type)  {
3391                                             /* a NOTHING regop is 1 regop wide. We need at least two
3392                                              * for a trie so we can't merge this in */
3393                                             first = NULL;
3394                                         }
3395                                     } else {
3396                                         trietype = noper_trietype;
3397                                     }
3398                                 } else {
3399                                     if ( trietype == NOTHING )
3400                                         trietype = noper_trietype;
3401                                     last = cur;
3402                                 }
3403                                 if (first)
3404                                     count++;
3405                             } /* end handle mergable triable node */
3406                             else {
3407                                 /* handle unmergable node -
3408                                  * noper may either be a triable node which can not be tried
3409                                  * together with the current trie, or a non triable node */
3410                                 if ( last ) {
3411                                     /* If last is set and trietype is not NOTHING then we have found
3412                                      * at least two triable branch sequences in a row of a similar
3413                                      * trietype so we can turn them into a trie. If/when we
3414                                      * allow NOTHING to start a trie sequence this condition will be
3415                                      * required, and it isn't expensive so we leave it in for now. */
3416                                     if ( trietype && trietype != NOTHING )
3417                                         make_trie( pRExC_state,
3418                                                 startbranch, first, cur, tail, count,
3419                                                 trietype, depth+1 );
3420                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3421                                 }
3422                                 if ( noper_trietype
3423 #ifdef NOJUMPTRIE
3424                                      && noper_next == tail
3425 #endif
3426                                 ){
3427                                     /* noper is triable, so we can start a new trie sequence */
3428                                     count = 1;
3429                                     first = cur;
3430                                     trietype = noper_trietype;
3431                                 } else if (first) {
3432                                     /* if we already saw a first but the current node is not triable then we have
3433                                      * to reset the first information. */
3434                                     count = 0;
3435                                     first = NULL;
3436                                     trietype = 0;
3437                                 }
3438                             } /* end handle unmergable node */
3439                         } /* loop over branches */
3440                         DEBUG_TRIE_COMPILE_r({
3441                             regprop(RExC_rx, mysv, cur);
3442                             PerlIO_printf( Perl_debug_log,
3443                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3444                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3445
3446                         });
3447                         if ( last && trietype ) {
3448                             if ( trietype != NOTHING ) {
3449                                 /* the last branch of the sequence was part of a trie,
3450                                  * so we have to construct it here outside of the loop
3451                                  */
3452                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3453 #ifdef TRIE_STUDY_OPT
3454                                 if ( ((made == MADE_EXACT_TRIE &&
3455                                      startbranch == first)
3456                                      || ( first_non_open == first )) &&
3457                                      depth==0 ) {
3458                                     flags |= SCF_TRIE_RESTUDY;
3459                                     if ( startbranch == first
3460                                          && scan == tail )
3461                                     {
3462                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3463                                     }
3464                                 }
3465 #endif
3466                             } else {
3467                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3468                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3469                                  */
3470                                 if ( startbranch == first ) {
3471                                     regnode *opt;
3472                                     /* the entire thing is a NOTHING sequence, something like this:
3473                                      * (?:|) So we can turn it into a plain NOTHING op. */
3474                                     DEBUG_TRIE_COMPILE_r({
3475                                         regprop(RExC_rx, mysv, cur);
3476                                         PerlIO_printf( Perl_debug_log,
3477                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3478                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3479
3480                                     });
3481                                     OP(startbranch)= NOTHING;
3482                                     NEXT_OFF(startbranch)= tail - startbranch;
3483                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3484                                         OP(opt)= OPTIMIZED;
3485                                 }
3486                             }
3487                         } /* end if ( last) */
3488                     } /* TRIE_MAXBUF is non zero */
3489
3490                 } /* do trie */
3491
3492             }
3493             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3494                 scan = NEXTOPER(NEXTOPER(scan));
3495             } else                      /* single branch is optimized. */
3496                 scan = NEXTOPER(scan);
3497             continue;
3498         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3499             scan_frame *newframe = NULL;
3500             I32 paren;
3501             regnode *start;
3502             regnode *end;
3503
3504             if (OP(scan) != SUSPEND) {
3505             /* set the pointer */
3506                 if (OP(scan) == GOSUB) {
3507                     paren = ARG(scan);
3508                     RExC_recurse[ARG2L(scan)] = scan;
3509                     start = RExC_open_parens[paren-1];
3510                     end   = RExC_close_parens[paren-1];
3511                 } else {
3512                     paren = 0;
3513                     start = RExC_rxi->program + 1;
3514                     end   = RExC_opend;
3515                 }
3516                 if (!recursed) {
3517                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3518                     SAVEFREEPV(recursed);
3519                 }
3520                 if (!PAREN_TEST(recursed,paren+1)) {
3521                     PAREN_SET(recursed,paren+1);
3522                     Newx(newframe,1,scan_frame);
3523                 } else {
3524                     if (flags & SCF_DO_SUBSTR) {
3525                         SCAN_COMMIT(pRExC_state,data,minlenp);
3526                         data->longest = &(data->longest_float);
3527                     }
3528                     is_inf = is_inf_internal = 1;
3529                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3530                         cl_anything(pRExC_state, data->start_class);
3531                     flags &= ~SCF_DO_STCLASS;
3532                 }
3533             } else {
3534                 Newx(newframe,1,scan_frame);
3535                 paren = stopparen;
3536                 start = scan+2;
3537                 end = regnext(scan);
3538             }
3539             if (newframe) {
3540                 assert(start);
3541                 assert(end);
3542                 SAVEFREEPV(newframe);
3543                 newframe->next = regnext(scan);
3544                 newframe->last = last;
3545                 newframe->stop = stopparen;
3546                 newframe->prev = frame;
3547
3548                 frame = newframe;
3549                 scan =  start;
3550                 stopparen = paren;
3551                 last = end;
3552
3553                 continue;
3554             }
3555         }
3556         else if (OP(scan) == EXACT) {
3557             I32 l = STR_LEN(scan);
3558             UV uc;
3559             if (UTF) {
3560                 const U8 * const s = (U8*)STRING(scan);
3561                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3562                 l = utf8_length(s, s + l);
3563             } else {
3564                 uc = *((U8*)STRING(scan));
3565             }
3566             min += l;
3567             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3568                 /* The code below prefers earlier match for fixed
3569                    offset, later match for variable offset.  */
3570                 if (data->last_end == -1) { /* Update the start info. */
3571                     data->last_start_min = data->pos_min;
3572                     data->last_start_max = is_inf
3573                         ? I32_MAX : data->pos_min + data->pos_delta;
3574                 }
3575                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3576                 if (UTF)
3577                     SvUTF8_on(data->last_found);
3578                 {
3579                     SV * const sv = data->last_found;
3580                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3581                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3582                     if (mg && mg->mg_len >= 0)
3583                         mg->mg_len += utf8_length((U8*)STRING(scan),
3584                                                   (U8*)STRING(scan)+STR_LEN(scan));
3585                 }
3586                 data->last_end = data->pos_min + l;
3587                 data->pos_min += l; /* As in the first entry. */
3588                 data->flags &= ~SF_BEFORE_EOL;
3589             }
3590             if (flags & SCF_DO_STCLASS_AND) {
3591                 /* Check whether it is compatible with what we know already! */
3592                 int compat = 1;
3593
3594
3595                 /* If compatible, we or it in below.  It is compatible if is
3596                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3597                  * it's for a locale.  Even if there isn't unicode semantics
3598                  * here, at runtime there may be because of matching against a
3599                  * utf8 string, so accept a possible false positive for
3600                  * latin1-range folds */
3601                 if (uc >= 0x100 ||
3602                     (!(data->start_class->flags & ANYOF_LOCALE)
3603                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3604                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3605                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3606                     )
3607                 {
3608                     compat = 0;
3609                 }
3610                 ANYOF_CLASS_ZERO(data->start_class);
3611                 ANYOF_BITMAP_ZERO(data->start_class);
3612                 if (compat)
3613                     ANYOF_BITMAP_SET(data->start_class, uc);
3614                 else if (uc >= 0x100) {
3615                     int i;
3616
3617                     /* Some Unicode code points fold to the Latin1 range; as
3618                      * XXX temporary code, instead of figuring out if this is
3619                      * one, just assume it is and set all the start class bits
3620                      * that could be some such above 255 code point's fold
3621                      * which will generate fals positives.  As the code
3622                      * elsewhere that does compute the fold settles down, it
3623                      * can be extracted out and re-used here */
3624                     for (i = 0; i < 256; i++){
3625                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3626                             ANYOF_BITMAP_SET(data->start_class, i);
3627                         }
3628                     }
3629                 }
3630                 CLEAR_SSC_EOS(data->start_class);
3631                 if (uc < 0x100)
3632                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3633             }
3634             else if (flags & SCF_DO_STCLASS_OR) {
3635                 /* false positive possible if the class is case-folded */
3636                 if (uc < 0x100)
3637                     ANYOF_BITMAP_SET(data->start_class, uc);
3638                 else
3639                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3640                 CLEAR_SSC_EOS(data->start_class);
3641                 cl_and(data->start_class, and_withp);
3642             }
3643             flags &= ~SCF_DO_STCLASS;
3644         }
3645         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3646             I32 l = STR_LEN(scan);
3647             UV uc = *((U8*)STRING(scan));
3648
3649             /* Search for fixed substrings supports EXACT only. */
3650             if (flags & SCF_DO_SUBSTR) {
3651                 assert(data);
3652                 SCAN_COMMIT(pRExC_state, data, minlenp);
3653             }
3654             if (UTF) {
3655                 const U8 * const s = (U8 *)STRING(scan);
3656                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3657                 l = utf8_length(s, s + l);
3658             }
3659             if (has_exactf_sharp_s) {
3660                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3661             }
3662             min += l - min_subtract;
3663             assert (min >= 0);
3664             delta += min_subtract;
3665             if (flags & SCF_DO_SUBSTR) {
3666                 data->pos_min += l - min_subtract;
3667                 if (data->pos_min < 0) {
3668                     data->pos_min = 0;
3669                 }
3670                 data->pos_delta += min_subtract;
3671                 if (min_subtract) {
3672                     data->longest = &(data->longest_float);
3673                 }
3674             }
3675             if (flags & SCF_DO_STCLASS_AND) {
3676                 /* Check whether it is compatible with what we know already! */
3677                 int compat = 1;
3678                 if (uc >= 0x100 ||
3679                  (!(data->start_class->flags & ANYOF_LOCALE)
3680                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3681                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3682                 {
3683                     compat = 0;
3684                 }
3685                 ANYOF_CLASS_ZERO(data->start_class);
3686                 ANYOF_BITMAP_ZERO(data->start_class);
3687                 if (compat) {
3688                     ANYOF_BITMAP_SET(data->start_class, uc);
3689                     CLEAR_SSC_EOS(data->start_class);
3690                     if (OP(scan) == EXACTFL) {
3691                         /* XXX This set is probably no longer necessary, and
3692                          * probably wrong as LOCALE now is on in the initial
3693                          * state */
3694                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3695                     }
3696                     else {
3697
3698                         /* Also set the other member of the fold pair.  In case
3699                          * that unicode semantics is called for at runtime, use
3700                          * the full latin1 fold.  (Can't do this for locale,
3701                          * because not known until runtime) */
3702                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3703
3704                         /* All other (EXACTFL handled above) folds except under
3705                          * /iaa that include s, S, and sharp_s also may include
3706                          * the others */
3707                         if (OP(scan) != EXACTFA) {
3708                             if (uc == 's' || uc == 'S') {
3709                                 ANYOF_BITMAP_SET(data->start_class,
3710                                                  LATIN_SMALL_LETTER_SHARP_S);
3711                             }
3712                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3713                                 ANYOF_BITMAP_SET(data->start_class, 's');
3714                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3715                             }
3716                         }
3717                     }
3718                 }
3719                 else if (uc >= 0x100) {
3720                     int i;
3721                     for (i = 0; i < 256; i++){
3722                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3723                             ANYOF_BITMAP_SET(data->start_class, i);
3724                         }
3725                     }
3726                 }
3727             }
3728             else if (flags & SCF_DO_STCLASS_OR) {
3729                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3730                     /* false positive possible if the class is case-folded.
3731                        Assume that the locale settings are the same... */
3732                     if (uc < 0x100) {
3733                         ANYOF_BITMAP_SET(data->start_class, uc);
3734                         if (OP(scan) != EXACTFL) {
3735
3736                             /* And set the other member of the fold pair, but
3737                              * can't do that in locale because not known until
3738                              * run-time */
3739                             ANYOF_BITMAP_SET(data->start_class,
3740                                              PL_fold_latin1[uc]);
3741
3742                             /* All folds except under /iaa that include s, S,
3743                              * and sharp_s also may include the others */
3744                             if (OP(scan) != EXACTFA) {
3745                                 if (uc == 's' || uc == 'S') {
3746                                     ANYOF_BITMAP_SET(data->start_class,
3747                                                    LATIN_SMALL_LETTER_SHARP_S);
3748                                 }
3749                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3750                                     ANYOF_BITMAP_SET(data->start_class, 's');
3751                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3752                                 }
3753                             }
3754                         }
3755                     }
3756                     CLEAR_SSC_EOS(data->start_class);
3757                 }
3758                 cl_and(data->start_class, and_withp);
3759             }
3760             flags &= ~SCF_DO_STCLASS;
3761         }
3762         else if (REGNODE_VARIES(OP(scan))) {
3763             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3764             I32 f = flags, pos_before = 0;
3765             regnode * const oscan = scan;
3766             struct regnode_charclass_class this_class;
3767             struct regnode_charclass_class *oclass = NULL;
3768             I32 next_is_eval = 0;
3769
3770             switch (PL_regkind[OP(scan)]) {
3771             case WHILEM:                /* End of (?:...)* . */
3772                 scan = NEXTOPER(scan);
3773                 goto finish;
3774             case PLUS:
3775                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3776                     next = NEXTOPER(scan);
3777                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3778                         mincount = 1;
3779                         maxcount = REG_INFTY;
3780                         next = regnext(scan);
3781                         scan = NEXTOPER(scan);
3782                         goto do_curly;
3783                     }
3784                 }
3785                 if (flags & SCF_DO_SUBSTR)
3786                     data->pos_min++;
3787                 min++;
3788                 /* Fall through. */
3789             case STAR:
3790                 if (flags & SCF_DO_STCLASS) {
3791                     mincount = 0;
3792                     maxcount = REG_INFTY;
3793                     next = regnext(scan);
3794                     scan = NEXTOPER(scan);
3795                     goto do_curly;
3796                 }
3797                 is_inf = is_inf_internal = 1;
3798                 scan = regnext(scan);
3799                 if (flags & SCF_DO_SUBSTR) {
3800                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3801                     data->longest = &(data->longest_float);
3802                 }
3803                 goto optimize_curly_tail;
3804             case CURLY:
3805                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3806                     && (scan->flags == stopparen))
3807                 {
3808                     mincount = 1;
3809                     maxcount = 1;
3810                 } else {
3811                     mincount = ARG1(scan);
3812                     maxcount = ARG2(scan);
3813                 }
3814                 next = regnext(scan);
3815                 if (OP(scan) == CURLYX) {
3816                     I32 lp = (data ? *(data->last_closep) : 0);
3817                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3818                 }
3819                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3820                 next_is_eval = (OP(scan) == EVAL);
3821               do_curly:
3822                 if (flags & SCF_DO_SUBSTR) {
3823                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3824                     pos_before = data->pos_min;
3825                 }
3826                 if (data) {
3827                     fl = data->flags;
3828                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3829                     if (is_inf)
3830                         data->flags |= SF_IS_INF;
3831                 }
3832                 if (flags & SCF_DO_STCLASS) {
3833                     cl_init(pRExC_state, &this_class);
3834                     oclass = data->start_class;
3835                     data->start_class = &this_class;
3836                     f |= SCF_DO_STCLASS_AND;
3837                     f &= ~SCF_DO_STCLASS_OR;
3838                 }
3839                 /* Exclude from super-linear cache processing any {n,m}
3840                    regops for which the combination of input pos and regex
3841                    pos is not enough information to determine if a match
3842                    will be possible.
3843
3844                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3845                    regex pos at the \s*, the prospects for a match depend not
3846                    only on the input position but also on how many (bar\s*)
3847                    repeats into the {4,8} we are. */
3848                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3849                     f &= ~SCF_WHILEM_VISITED_POS;
3850
3851                 /* This will finish on WHILEM, setting scan, or on NULL: */
3852                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3853                                       last, data, stopparen, recursed, NULL,
3854                                       (mincount == 0
3855                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3856
3857                 if (flags & SCF_DO_STCLASS)
3858                     data->start_class = oclass;
3859                 if (mincount == 0 || minnext == 0) {
3860                     if (flags & SCF_DO_STCLASS_OR) {
3861                         cl_or(pRExC_state, data->start_class, &this_class);
3862                     }
3863                     else if (flags & SCF_DO_STCLASS_AND) {
3864                         /* Switch to OR mode: cache the old value of
3865                          * data->start_class */
3866                         INIT_AND_WITHP;
3867                         StructCopy(data->start_class, and_withp,
3868                                    struct regnode_charclass_class);
3869                         flags &= ~SCF_DO_STCLASS_AND;
3870                         StructCopy(&this_class, data->start_class,
3871                                    struct regnode_charclass_class);
3872                         flags |= SCF_DO_STCLASS_OR;
3873                         SET_SSC_EOS(data->start_class);
3874                     }
3875                 } else {                /* Non-zero len */
3876                     if (flags & SCF_DO_STCLASS_OR) {
3877                         cl_or(pRExC_state, data->start_class, &this_class);
3878                         cl_and(data->start_class, and_withp);
3879                     }
3880                     else if (flags & SCF_DO_STCLASS_AND)
3881                         cl_and(data->start_class, &this_class);
3882                     flags &= ~SCF_DO_STCLASS;
3883                 }
3884                 if (!scan)              /* It was not CURLYX, but CURLY. */
3885                     scan = next;
3886                 if ( /* ? quantifier ok, except for (?{ ... }) */
3887                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3888                     && (minnext == 0) && (deltanext == 0)
3889                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3890                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3891                 {
3892                     /* Fatal warnings may leak the regexp without this: */
3893                     SAVEFREESV(RExC_rx_sv);
3894                     ckWARNreg(RExC_parse,
3895                               "Quantifier unexpected on zero-length expression");
3896                     (void)ReREFCNT_inc(RExC_rx_sv);
3897                 }
3898
3899                 min += minnext * mincount;
3900                 is_inf_internal |= deltanext == I32_MAX
3901                                      || (maxcount == REG_INFTY && minnext + deltanext > 0);
3902                 is_inf |= is_inf_internal;
3903                 if (is_inf)
3904                     delta = I32_MAX;
3905                 else
3906                     delta += (minnext + deltanext) * maxcount - minnext * mincount;
3907
3908                 /* Try powerful optimization CURLYX => CURLYN. */
3909                 if (  OP(oscan) == CURLYX && data
3910                       && data->flags & SF_IN_PAR
3911                       && !(data->flags & SF_HAS_EVAL)
3912                       && !deltanext && minnext == 1 ) {
3913                     /* Try to optimize to CURLYN.  */
3914                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3915                     regnode * const nxt1 = nxt;
3916 #ifdef DEBUGGING
3917                     regnode *nxt2;
3918 #endif
3919
3920                     /* Skip open. */
3921                     nxt = regnext(nxt);
3922                     if (!REGNODE_SIMPLE(OP(nxt))
3923                         && !(PL_regkind[OP(nxt)] == EXACT
3924                              && STR_LEN(nxt) == 1))
3925                         goto nogo;
3926 #ifdef DEBUGGING
3927                     nxt2 = nxt;
3928 #endif
3929                     nxt = regnext(nxt);
3930                     if (OP(nxt) != CLOSE)
3931                         goto nogo;
3932                     if (RExC_open_parens) {
3933                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3934                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3935                     }
3936                     /* Now we know that nxt2 is the only contents: */
3937                     oscan->flags = (U8)ARG(nxt);
3938                     OP(oscan) = CURLYN;
3939                     OP(nxt1) = NOTHING; /* was OPEN. */
3940
3941 #ifdef DEBUGGING
3942                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3943                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3944                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3945                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3946                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3947                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3948 #endif
3949                 }
3950               nogo:
3951
3952                 /* Try optimization CURLYX => CURLYM. */
3953                 if (  OP(oscan) == CURLYX && data
3954                       && !(data->flags & SF_HAS_PAR)
3955                       && !(data->flags & SF_HAS_EVAL)
3956                       && !deltanext     /* atom is fixed width */
3957                       && minnext != 0   /* CURLYM can't handle zero width */
3958                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3959                 ) {
3960                     /* XXXX How to optimize if data == 0? */
3961                     /* Optimize to a simpler form.  */
3962                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3963                     regnode *nxt2;
3964
3965                     OP(oscan) = CURLYM;
3966                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3967                             && (OP(nxt2) != WHILEM))
3968                         nxt = nxt2;
3969                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3970                     /* Need to optimize away parenths. */
3971                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3972                         /* Set the parenth number.  */
3973                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3974
3975                         oscan->flags = (U8)ARG(nxt);
3976                         if (RExC_open_parens) {
3977                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3978                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3979                         }
3980                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3981                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3982
3983 #ifdef DEBUGGING
3984                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3985                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3986                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3987                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3988 #endif
3989 #if 0
3990                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3991                             regnode *nnxt = regnext(nxt1);
3992                             if (nnxt == nxt) {
3993                                 if (reg_off_by_arg[OP(nxt1)])
3994                                     ARG_SET(nxt1, nxt2 - nxt1);
3995                                 else if (nxt2 - nxt1 < U16_MAX)
3996                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3997                                 else
3998                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3999                             }
4000                             nxt1 = nnxt;
4001                         }
4002 #endif
4003                         /* Optimize again: */
4004                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4005                                     NULL, stopparen, recursed, NULL, 0,depth+1);
4006                     }
4007                     else
4008                         oscan->flags = 0;
4009                 }
4010                 else if ((OP(oscan) == CURLYX)
4011                          && (flags & SCF_WHILEM_VISITED_POS)
4012                          /* See the comment on a similar expression above.
4013                             However, this time it's not a subexpression
4014                             we care about, but the expression itself. */
4015                          && (maxcount == REG_INFTY)
4016                          && data && ++data->whilem_c < 16) {
4017                     /* This stays as CURLYX, we can put the count/of pair. */
4018                     /* Find WHILEM (as in regexec.c) */
4019                     regnode *nxt = oscan + NEXT_OFF(oscan);
4020
4021                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4022                         nxt += ARG(nxt);
4023                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4024                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4025                 }
4026                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4027                     pars++;
4028                 if (flags & SCF_DO_SUBSTR) {
4029                     SV *last_str = NULL;
4030                     int counted = mincount != 0;
4031
4032                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4033 #if defined(SPARC64_GCC_WORKAROUND)
4034                         I32 b = 0;
4035                         STRLEN l = 0;
4036                         const char *s = NULL;
4037                         I32 old = 0;
4038
4039                         if (pos_before >= data->last_start_min)
4040                             b = pos_before;
4041                         else
4042                             b = data->last_start_min;
4043
4044                         l = 0;
4045                         s = SvPV_const(data->last_found, l);
4046                         old = b - data->last_start_min;
4047
4048 #else
4049                         I32 b = pos_before >= data->last_start_min
4050                             ? pos_before : data->last_start_min;
4051                         STRLEN l;
4052                         const char * const s = SvPV_const(data->last_found, l);
4053                         I32 old = b - data->last_start_min;
4054 #endif
4055
4056                         if (UTF)
4057                             old = utf8_hop((U8*)s, old) - (U8*)s;
4058                         l -= old;
4059                         /* Get the added string: */
4060                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4061                         if (deltanext == 0 && pos_before == b) {
4062                             /* What was added is a constant string */
4063                             if (mincount > 1) {
4064                                 SvGROW(last_str, (mincount * l) + 1);
4065                                 repeatcpy(SvPVX(last_str) + l,
4066                                           SvPVX_const(last_str), l, mincount - 1);
4067                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4068                                 /* Add additional parts. */
4069                                 SvCUR_set(data->last_found,
4070                                           SvCUR(data->last_found) - l);
4071                                 sv_catsv(data->last_found, last_str);
4072                                 {
4073                                     SV * sv = data->last_found;
4074                                     MAGIC *mg =
4075                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4076                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4077                                     if (mg && mg->mg_len >= 0)
4078                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4079                                 }
4080                                 data->last_end += l * (mincount - 1);
4081                             }
4082                         } else {
4083                             /* start offset must point into the last copy */
4084                             data->last_start_min += minnext * (mincount - 1);
4085                             data->last_start_max += is_inf ? I32_MAX
4086                                 : (maxcount - 1) * (minnext + data->pos_delta);
4087                         }
4088                     }
4089                     /* It is counted once already... */
4090                     data->pos_min += minnext * (mincount - counted);
4091 #if 0
4092 PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
4093     counted, deltanext, I32_MAX, minnext, maxcount, mincount);
4094 if (deltanext != I32_MAX)
4095 PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
4096 #endif
4097                     if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
4098                         data->pos_delta = I32_MAX;
4099                     else
4100                         data->pos_delta += - counted * deltanext +
4101                         (minnext + deltanext) * maxcount - minnext * mincount;
4102                     if (mincount != maxcount) {
4103                          /* Cannot extend fixed substrings found inside
4104                             the group.  */
4105                         SCAN_COMMIT(pRExC_state,data,minlenp);
4106                         if (mincount && last_str) {
4107                             SV * const sv = data->last_found;
4108                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4109                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4110
4111                             if (mg)
4112                                 mg->mg_len = -1;
4113                             sv_setsv(sv, last_str);
4114                             data->last_end = data->pos_min;
4115                             data->last_start_min =
4116                                 data->pos_min - CHR_SVLEN(last_str);
4117                             data->last_start_max = is_inf
4118                                 ? I32_MAX
4119                                 : data->pos_min + data->pos_delta
4120                                 - CHR_SVLEN(last_str);
4121                         }
4122                         data->longest = &(data->longest_float);
4123                     }
4124                     SvREFCNT_dec(last_str);
4125                 }
4126                 if (data && (fl & SF_HAS_EVAL))
4127                     data->flags |= SF_HAS_EVAL;
4128               optimize_curly_tail:
4129                 if (OP(oscan) != CURLYX) {
4130                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4131                            && NEXT_OFF(next))
4132                         NEXT_OFF(oscan) += NEXT_OFF(next);
4133                 }
4134                 continue;
4135             default:                    /* REF, and CLUMP only? */
4136                 if (flags & SCF_DO_SUBSTR) {
4137                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4138                     data->longest = &(data->longest_float);
4139                 }
4140                 is_inf = is_inf_internal = 1;
4141                 if (flags & SCF_DO_STCLASS_OR)
4142                     cl_anything(pRExC_state, data->start_class);
4143                 flags &= ~SCF_DO_STCLASS;
4144                 break;
4145             }
4146         }
4147         else if (OP(scan) == LNBREAK) {
4148             if (flags & SCF_DO_STCLASS) {
4149                 int value = 0;
4150                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4151                 if (flags & SCF_DO_STCLASS_AND) {
4152                     for (value = 0; value < 256; value++)
4153                         if (!is_VERTWS_cp(value))
4154                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4155                 }
4156                 else {
4157                     for (value = 0; value < 256; value++)
4158                         if (is_VERTWS_cp(value))
4159                             ANYOF_BITMAP_SET(data->start_class, value);
4160                 }
4161                 if (flags & SCF_DO_STCLASS_OR)
4162                     cl_and(data->start_class, and_withp);
4163                 flags &= ~SCF_DO_STCLASS;
4164             }
4165             min++;
4166             delta++;    /* Because of the 2 char string cr-lf */
4167             if (flags & SCF_DO_SUBSTR) {
4168                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4169                 data->pos_min += 1;
4170                 data->pos_delta += 1;
4171                 data->longest = &(data->longest_float);
4172             }
4173         }
4174         else if (REGNODE_SIMPLE(OP(scan))) {
4175             int value = 0;
4176
4177             if (flags & SCF_DO_SUBSTR) {
4178                 SCAN_COMMIT(pRExC_state,data,minlenp);
4179                 data->pos_min++;
4180             }
4181             min++;
4182             if (flags & SCF_DO_STCLASS) {
4183                 int loop_max = 256;
4184                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4185
4186                 /* Some of the logic below assumes that switching
4187                    locale on will only add false positives. */
4188                 switch (PL_regkind[OP(scan)]) {
4189                     U8 classnum;
4190
4191                 case SANY:
4192                 default:
4193 #ifdef DEBUGGING
4194                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4195 #endif
4196                  do_default:
4197                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4198                         cl_anything(pRExC_state, data->start_class);
4199                     break;
4200                 case REG_ANY:
4201                     if (OP(scan) == SANY)
4202                         goto do_default;
4203                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4204                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4205                                 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4206                         cl_anything(pRExC_state, data->start_class);
4207                     }
4208                     if (flags & SCF_DO_STCLASS_AND || !value)
4209                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4210                     break;
4211                 case ANYOF:
4212                     if (flags & SCF_DO_STCLASS_AND)
4213                         cl_and(data->start_class,
4214                                (struct regnode_charclass_class*)scan);
4215                     else
4216                         cl_or(pRExC_state, data->start_class,
4217                               (struct regnode_charclass_class*)scan);
4218                     break;
4219                 case POSIXA:
4220                     loop_max = 128;
4221                     /* FALL THROUGH */
4222                 case POSIXL:
4223                 case POSIXD:
4224                 case POSIXU:
4225                     classnum = FLAGS(scan);
4226                     if (flags & SCF_DO_STCLASS_AND) {
4227                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4228                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4229                             for (value = 0; value < loop_max; value++) {
4230                                 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4231                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4232                                 }
4233                             }
4234                         }
4235                     }
4236                     else {
4237                         if (data->start_class->flags & ANYOF_LOCALE) {
4238                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4239                         }
4240                         else {
4241
4242                         /* Even if under locale, set the bits for non-locale
4243                          * in case it isn't a true locale-node.  This will
4244                          * create false positives if it truly is locale */
4245                         for (value = 0; value < loop_max; value++) {
4246                             if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4247                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4248                             }
4249                         }
4250                         }
4251                     }
4252                     break;
4253                 case NPOSIXA:
4254                     loop_max = 128;
4255                     /* FALL THROUGH */
4256                 case NPOSIXL:
4257                 case NPOSIXU:
4258                 case NPOSIXD:
4259                     classnum = FLAGS(scan);
4260                     if (flags & SCF_DO_STCLASS_AND) {
4261                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4262                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4263                             for (value = 0; value < loop_max; value++) {
4264                                 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4265                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4266                                 }
4267                             }
4268                         }
4269                     }
4270                     else {
4271                         if (data->start_class->flags & ANYOF_LOCALE) {
4272                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4273                         }
4274                         else {
4275
4276                         /* Even if under locale, set the bits for non-locale in
4277                          * case it isn't a true locale-node.  This will create
4278                          * false positives if it truly is locale */
4279                         for (value = 0; value < loop_max; value++) {
4280                             if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4281                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4282                             }
4283                         }
4284                         if (PL_regkind[OP(scan)] == NPOSIXD) {
4285                             data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4286                         }
4287                         }
4288                     }
4289                     break;
4290                 }
4291                 if (flags & SCF_DO_STCLASS_OR)
4292                     cl_and(data->start_class, and_withp);
4293                 flags &= ~SCF_DO_STCLASS;
4294             }
4295         }
4296         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4297             data->flags |= (OP(scan) == MEOL
4298                             ? SF_BEFORE_MEOL
4299                             : SF_BEFORE_SEOL);
4300             SCAN_COMMIT(pRExC_state, data, minlenp);
4301
4302         }
4303         else if (  PL_regkind[OP(scan)] == BRANCHJ
4304                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4305                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4306                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4307             if ( OP(scan) == UNLESSM &&
4308                  scan->flags == 0 &&
4309                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4310                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4311             ) {
4312                 regnode *opt;
4313                 regnode *upto= regnext(scan);
4314                 DEBUG_PARSE_r({
4315                     SV * const mysv_val=sv_newmortal();
4316                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4317
4318                     /*DEBUG_PARSE_MSG("opfail");*/
4319                     regprop(RExC_rx, mysv_val, upto);
4320                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4321                                   SvPV_nolen_const(mysv_val),
4322                                   (IV)REG_NODE_NUM(upto),
4323                                   (IV)(upto - scan)
4324                     );
4325                 });
4326                 OP(scan) = OPFAIL;
4327                 NEXT_OFF(scan) = upto - scan;
4328                 for (opt= scan + 1; opt < upto ; opt++)
4329                     OP(opt) = OPTIMIZED;
4330                 scan= upto;
4331                 continue;
4332             }
4333             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4334                 || OP(scan) == UNLESSM )
4335             {
4336                 /* Negative Lookahead/lookbehind
4337                    In this case we can't do fixed string optimisation.
4338                 */
4339
4340                 I32 deltanext, minnext, fake = 0;
4341                 regnode *nscan;
4342                 struct regnode_charclass_class intrnl;
4343                 int f = 0;
4344
4345                 data_fake.flags = 0;
4346                 if (data) {
4347                     data_fake.whilem_c = data->whilem_c;
4348                     data_fake.last_closep = data->last_closep;
4349                 }
4350                 else
4351                     data_fake.last_closep = &fake;
4352                 data_fake.pos_delta = delta;
4353                 if ( flags & SCF_DO_STCLASS && !scan->flags
4354                      && OP(scan) == IFMATCH ) { /* Lookahead */
4355                     cl_init(pRExC_state, &intrnl);
4356                     data_fake.start_class = &intrnl;
4357                     f |= SCF_DO_STCLASS_AND;
4358                 }
4359                 if (flags & SCF_WHILEM_VISITED_POS)
4360                     f |= SCF_WHILEM_VISITED_POS;
4361                 next = regnext(scan);
4362                 nscan = NEXTOPER(NEXTOPER(scan));
4363                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4364                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4365                 if (scan->flags) {
4366                     if (deltanext) {
4367                         FAIL("Variable length lookbehind not implemented");
4368                     }
4369                     else if (minnext > (I32)U8_MAX) {
4370                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4371                     }
4372                     scan->flags = (U8)minnext;
4373                 }
4374                 if (data) {
4375                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4376                         pars++;
4377                     if (data_fake.flags & SF_HAS_EVAL)
4378                         data->flags |= SF_HAS_EVAL;
4379                     data->whilem_c = data_fake.whilem_c;
4380                 }
4381                 if (f & SCF_DO_STCLASS_AND) {
4382                     if (flags & SCF_DO_STCLASS_OR) {
4383                         /* OR before, AND after: ideally we would recurse with
4384                          * data_fake to get the AND applied by study of the
4385                          * remainder of the pattern, and then derecurse;
4386                          * *** HACK *** for now just treat as "no information".
4387                          * See [perl #56690].
4388                          */
4389                         cl_init(pRExC_state, data->start_class);
4390                     }  else {
4391                         /* AND before and after: combine and continue */
4392                         const int was = TEST_SSC_EOS(data->start_class);
4393
4394                         cl_and(data->start_class, &intrnl);
4395                         if (was)
4396                             SET_SSC_EOS(data->start_class);
4397                     }
4398                 }
4399             }
4400 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4401             else {
4402                 /* Positive Lookahead/lookbehind
4403                    In this case we can do fixed string optimisation,
4404                    but we must be careful about it. Note in the case of
4405                    lookbehind the positions will be offset by the minimum
4406                    length of the pattern, something we won't know about
4407                    until after the recurse.
4408                 */
4409                 I32 deltanext, fake = 0;
4410                 regnode *nscan;
4411                 struct regnode_charclass_class intrnl;
4412                 int f = 0;
4413                 /* We use SAVEFREEPV so that when the full compile
4414                     is finished perl will clean up the allocated
4415                     minlens when it's all done. This way we don't
4416                     have to worry about freeing them when we know
4417                     they wont be used, which would be a pain.
4418                  */
4419                 I32 *minnextp;
4420                 Newx( minnextp, 1, I32 );
4421                 SAVEFREEPV(minnextp);
4422
4423                 if (data) {
4424                     StructCopy(data, &data_fake, scan_data_t);
4425                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4426                         f |= SCF_DO_SUBSTR;
4427                         if (scan->flags)
4428                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4429                         data_fake.last_found=newSVsv(data->last_found);
4430                     }
4431                 }
4432                 else
4433                     data_fake.last_closep = &fake;
4434                 data_fake.flags = 0;
4435                 data_fake.pos_delta = delta;
4436                 if (is_inf)
4437                     data_fake.flags |= SF_IS_INF;
4438                 if ( flags & SCF_DO_STCLASS && !scan->flags
4439                      && OP(scan) == IFMATCH ) { /* Lookahead */
4440                     cl_init(pRExC_state, &intrnl);
4441                     data_fake.start_class = &intrnl;
4442                     f |= SCF_DO_STCLASS_AND;
4443                 }
4444                 if (flags & SCF_WHILEM_VISITED_POS)
4445                     f |= SCF_WHILEM_VISITED_POS;
4446                 next = regnext(scan);
4447                 nscan = NEXTOPER(NEXTOPER(scan));
4448
4449                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4450                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4451                 if (scan->flags) {
4452                     if (deltanext) {
4453                         FAIL("Variable length lookbehind not implemented");
4454                     }
4455                     else if (*minnextp > (I32)U8_MAX) {
4456                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4457                     }
4458                     scan->flags = (U8)*minnextp;
4459                 }
4460
4461                 *minnextp += min;
4462
4463                 if (f & SCF_DO_STCLASS_AND) {
4464                     const int was = TEST_SSC_EOS(data.start_class);
4465
4466                     cl_and(data->start_class, &intrnl);
4467                     if (was)
4468                         SET_SSC_EOS(data->start_class);
4469                 }
4470                 if (data) {
4471                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4472                         pars++;
4473                     if (data_fake.flags & SF_HAS_EVAL)
4474                         data->flags |= SF_HAS_EVAL;
4475                     data->whilem_c = data_fake.whilem_c;
4476                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4477                         if (RExC_rx->minlen<*minnextp)
4478                             RExC_rx->minlen=*minnextp;
4479                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4480                         SvREFCNT_dec_NN(data_fake.last_found);
4481
4482                         if ( data_fake.minlen_fixed != minlenp )
4483                         {
4484                             data->offset_fixed= data_fake.offset_fixed;
4485                             data->minlen_fixed= data_fake.minlen_fixed;
4486                             data->lookbehind_fixed+= scan->flags;
4487                         }
4488                         if ( data_fake.minlen_float != minlenp )
4489                         {
4490                             data->minlen_float= data_fake.minlen_float;
4491                             data->offset_float_min=data_fake.offset_float_min;
4492                             data->offset_float_max=data_fake.offset_float_max;
4493                             data->lookbehind_float+= scan->flags;
4494                         }
4495                     }
4496                 }
4497             }
4498 #endif
4499         }
4500         else if (OP(scan) == OPEN) {
4501             if (stopparen != (I32)ARG(scan))
4502                 pars++;
4503         }
4504         else if (OP(scan) == CLOSE) {
4505             if (stopparen == (I32)ARG(scan)) {
4506                 break;
4507             }
4508             if ((I32)ARG(scan) == is_par) {
4509                 next = regnext(scan);
4510
4511                 if ( next && (OP(next) != WHILEM) && next < last)
4512                     is_par = 0;         /* Disable optimization */
4513             }
4514             if (data)
4515                 *(data->last_closep) = ARG(scan);
4516         }
4517         else if (OP(scan) == EVAL) {
4518                 if (data)
4519                     data->flags |= SF_HAS_EVAL;
4520         }
4521         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4522             if (flags & SCF_DO_SUBSTR) {
4523                 SCAN_COMMIT(pRExC_state,data,minlenp);
4524                 flags &= ~SCF_DO_SUBSTR;
4525             }
4526             if (data && OP(scan)==ACCEPT) {
4527                 data->flags |= SCF_SEEN_ACCEPT;
4528                 if (stopmin > min)
4529                     stopmin = min;
4530             }
4531         }
4532         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4533         {
4534                 if (flags & SCF_DO_SUBSTR) {
4535                     SCAN_COMMIT(pRExC_state,data,minlenp);
4536                     data->longest = &(data->longest_float);
4537                 }
4538                 is_inf = is_inf_internal = 1;
4539                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4540                     cl_anything(pRExC_state, data->start_class);
4541                 flags &= ~SCF_DO_STCLASS;
4542         }
4543         else if (OP(scan) == GPOS) {
4544             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4545                 !(delta || is_inf || (data && data->pos_delta)))
4546             {
4547                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4548                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4549                 if (RExC_rx->gofs < (U32)min)
4550                     RExC_rx->gofs = min;
4551             } else {
4552                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4553                 RExC_rx->gofs = 0;
4554             }
4555         }
4556 #ifdef TRIE_STUDY_OPT
4557 #ifdef FULL_TRIE_STUDY
4558         else if (PL_regkind[OP(scan)] == TRIE) {
4559             /* NOTE - There is similar code to this block above for handling
4560                BRANCH nodes on the initial study.  If you change stuff here
4561                check there too. */
4562             regnode *trie_node= scan;
4563             regnode *tail= regnext(scan);
4564             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4565             I32 max1 = 0, min1 = I32_MAX;
4566             struct regnode_charclass_class accum;
4567
4568             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4569                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4570             if (flags & SCF_DO_STCLASS)
4571                 cl_init_zero(pRExC_state, &accum);
4572
4573             if (!trie->jump) {
4574                 min1= trie->minlen;
4575                 max1= trie->maxlen;
4576             } else {
4577                 const regnode *nextbranch= NULL;
4578                 U32 word;
4579
4580                 for ( word=1 ; word <= trie->wordcount ; word++)
4581                 {
4582                     I32 deltanext=0, minnext=0, f = 0, fake;
4583                     struct regnode_charclass_class this_class;
4584
4585                     data_fake.flags = 0;
4586                     if (data) {
4587                         data_fake.whilem_c = data->whilem_c;
4588                         data_fake.last_closep = data->last_closep;
4589                     }
4590                     else
4591                         data_fake.last_closep = &fake;
4592                     data_fake.pos_delta = delta;
4593                     if (flags & SCF_DO_STCLASS) {
4594                         cl_init(pRExC_state, &this_class);
4595                         data_fake.start_class = &this_class;
4596                         f = SCF_DO_STCLASS_AND;
4597                     }
4598                     if (flags & SCF_WHILEM_VISITED_POS)
4599                         f |= SCF_WHILEM_VISITED_POS;
4600
4601                     if (trie->jump[word]) {
4602                         if (!nextbranch)
4603                             nextbranch = trie_node + trie->jump[0];
4604                         scan= trie_node + trie->jump[word];
4605                         /* We go from the jump point to the branch that follows
4606                            it. Note this means we need the vestigal unused branches
4607                            even though they arent otherwise used.
4608                          */
4609                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4610                             &deltanext, (regnode *)nextbranch, &data_fake,
4611                             stopparen, recursed, NULL, f,depth+1);
4612                     }
4613                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4614                         nextbranch= regnext((regnode*)nextbranch);
4615
4616                     if (min1 > (I32)(minnext + trie->minlen))
4617                         min1 = minnext + trie->minlen;
4618                     if (deltanext == I32_MAX) {
4619                         is_inf = is_inf_internal = 1;
4620                         max1 = I32_MAX;
4621                     } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4622                         max1 = minnext + deltanext + trie->maxlen;
4623
4624                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4625                         pars++;
4626                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4627                         if ( stopmin > min + min1)
4628                             stopmin = min + min1;
4629                         flags &= ~SCF_DO_SUBSTR;
4630                         if (data)
4631                             data->flags |= SCF_SEEN_ACCEPT;
4632                     }
4633                     if (data) {
4634                         if (data_fake.flags & SF_HAS_EVAL)
4635                             data->flags |= SF_HAS_EVAL;
4636                         data->whilem_c = data_fake.whilem_c;
4637                     }
4638                     if (flags & SCF_DO_STCLASS)
4639                         cl_or(pRExC_state, &accum, &this_class);
4640                 }
4641             }
4642             if (flags & SCF_DO_SUBSTR) {
4643                 data->pos_min += min1;
4644                 data->pos_delta += max1 - min1;
4645                 if (max1 != min1 || is_inf)
4646                     data->longest = &(data->longest_float);
4647             }
4648             min += min1;
4649             delta += max1 - min1;
4650             if (flags & SCF_DO_STCLASS_OR) {
4651                 cl_or(pRExC_state, data->start_class, &accum);
4652                 if (min1) {
4653                     cl_and(data->start_class, and_withp);
4654                     flags &= ~SCF_DO_STCLASS;
4655                 }
4656             }
4657             else if (flags & SCF_DO_STCLASS_AND) {
4658                 if (min1) {
4659                     cl_and(data->start_class, &accum);
4660                     flags &= ~SCF_DO_STCLASS;
4661                 }
4662                 else {
4663                     /* Switch to OR mode: cache the old value of
4664                      * data->start_class */
4665                     INIT_AND_WITHP;
4666                     StructCopy(data->start_class, and_withp,
4667                                struct regnode_charclass_class);
4668                     flags &= ~SCF_DO_STCLASS_AND;
4669                     StructCopy(&accum, data->start_class,
4670                                struct regnode_charclass_class);
4671                     flags |= SCF_DO_STCLASS_OR;
4672                     SET_SSC_EOS(data->start_class);
4673                 }
4674             }
4675             scan= tail;
4676             continue;
4677         }
4678 #else
4679         else if (PL_regkind[OP(scan)] == TRIE) {
4680             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4681             U8*bang=NULL;
4682
4683             min += trie->minlen;
4684             delta += (trie->maxlen - trie->minlen);
4685             flags &= ~SCF_DO_STCLASS; /* xxx */
4686             if (flags & SCF_DO_SUBSTR) {
4687                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4688                 data->pos_min += trie->minlen;
4689                 data->pos_delta += (trie->maxlen - trie->minlen);
4690                 if (trie->maxlen != trie->minlen)
4691                     data->longest = &(data->longest_float);
4692             }
4693             if (trie->jump) /* no more substrings -- for now /grr*/
4694                 flags &= ~SCF_DO_SUBSTR;
4695         }
4696 #endif /* old or new */
4697 #endif /* TRIE_STUDY_OPT */
4698
4699         /* Else: zero-length, ignore. */
4700         scan = regnext(scan);
4701     }
4702     if (frame) {
4703         last = frame->last;
4704         scan = frame->next;
4705         stopparen = frame->stop;
4706         frame = frame->prev;
4707         goto fake_study_recurse;
4708     }
4709
4710   finish:
4711     assert(!frame);
4712     DEBUG_STUDYDATA("pre-fin:",data,depth);
4713
4714     *scanp = scan;
4715     *deltap = is_inf_internal ? I32_MAX : delta;
4716     if (flags & SCF_DO_SUBSTR && is_inf)
4717         data->pos_delta = I32_MAX - data->pos_min;
4718     if (is_par > (I32)U8_MAX)
4719         is_par = 0;
4720     if (is_par && pars==1 && data) {
4721         data->flags |= SF_IN_PAR;
4722         data->flags &= ~SF_HAS_PAR;
4723     }
4724     else if (pars && data) {
4725         data->flags |= SF_HAS_PAR;
4726         data->flags &= ~SF_IN_PAR;
4727     }
4728     if (flags & SCF_DO_STCLASS_OR)
4729         cl_and(data->start_class, and_withp);
4730     if (flags & SCF_TRIE_RESTUDY)
4731         data->flags |=  SCF_TRIE_RESTUDY;
4732
4733     DEBUG_STUDYDATA("post-fin:",data,depth);
4734
4735     return min < stopmin ? min : stopmin;
4736 }
4737
4738 STATIC U32
4739 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4740 {
4741     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4742
4743     PERL_ARGS_ASSERT_ADD_DATA;
4744
4745     Renewc(RExC_rxi->data,
4746            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4747            char, struct reg_data);
4748     if(count)
4749         Renew(RExC_rxi->data->what, count + n, U8);
4750     else
4751         Newx(RExC_rxi->data->what, n, U8);
4752     RExC_rxi->data->count = count + n;
4753     Copy(s, RExC_rxi->data->what + count, n, U8);
4754     return count;
4755 }
4756
4757 /*XXX: todo make this not included in a non debugging perl */
4758 #ifndef PERL_IN_XSUB_RE
4759 void
4760 Perl_reginitcolors(pTHX)
4761 {
4762     dVAR;
4763     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4764     if (s) {
4765         char *t = savepv(s);
4766         int i = 0;
4767         PL_colors[0] = t;
4768         while (++i < 6) {
4769             t = strchr(t, '\t');
4770             if (t) {
4771                 *t = '\0';
4772                 PL_colors[i] = ++t;
4773             }
4774             else
4775                 PL_colors[i] = t = (char *)"";
4776         }
4777     } else {
4778         int i = 0;
4779         while (i < 6)
4780             PL_colors[i++] = (char *)"";
4781     }
4782     PL_colorset = 1;
4783 }
4784 #endif
4785
4786
4787 #ifdef TRIE_STUDY_OPT
4788 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4789     STMT_START {                                            \
4790         if (                                                \
4791               (data.flags & SCF_TRIE_RESTUDY)               \
4792               && ! restudied++                              \
4793         ) {                                                 \
4794             dOsomething;                                    \
4795             goto reStudy;                                   \
4796         }                                                   \
4797     } STMT_END
4798 #else
4799 #define CHECK_RESTUDY_GOTO_butfirst
4800 #endif
4801
4802 /*
4803  * pregcomp - compile a regular expression into internal code
4804  *
4805  * Decides which engine's compiler to call based on the hint currently in
4806  * scope
4807  */
4808
4809 #ifndef PERL_IN_XSUB_RE
4810
4811 /* return the currently in-scope regex engine (or the default if none)  */
4812
4813 regexp_engine const *
4814 Perl_current_re_engine(pTHX)
4815 {
4816     dVAR;
4817
4818     if (IN_PERL_COMPILETIME) {
4819         HV * const table = GvHV(PL_hintgv);
4820         SV **ptr;
4821
4822         if (!table)
4823             return &PL_core_reg_engine;
4824         ptr = hv_fetchs(table, "regcomp", FALSE);
4825         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4826             return &PL_core_reg_engine;
4827         return INT2PTR(regexp_engine*,SvIV(*ptr));
4828     }
4829     else {
4830         SV *ptr;
4831         if (!PL_curcop->cop_hints_hash)
4832             return &PL_core_reg_engine;
4833         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4834         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4835             return &PL_core_reg_engine;
4836         return INT2PTR(regexp_engine*,SvIV(ptr));
4837     }
4838 }
4839
4840
4841 REGEXP *
4842 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4843 {
4844     dVAR;
4845     regexp_engine const *eng = current_re_engine();
4846     GET_RE_DEBUG_FLAGS_DECL;
4847
4848     PERL_ARGS_ASSERT_PREGCOMP;
4849
4850     /* Dispatch a request to compile a regexp to correct regexp engine. */
4851     DEBUG_COMPILE_r({
4852         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4853                         PTR2UV(eng));
4854     });
4855     return CALLREGCOMP_ENG(eng, pattern, flags);
4856 }
4857 #endif
4858
4859 /* public(ish) entry point for the perl core's own regex compiling code.
4860  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4861  * pattern rather than a list of OPs, and uses the internal engine rather
4862  * than the current one */
4863
4864 REGEXP *
4865 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4866 {
4867     SV *pat = pattern; /* defeat constness! */
4868     PERL_ARGS_ASSERT_RE_COMPILE;
4869     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4870 #ifdef PERL_IN_XSUB_RE
4871                                 &my_reg_engine,
4872 #else
4873                                 &PL_core_reg_engine,
4874 #endif
4875                                 NULL, NULL, rx_flags, 0);
4876 }
4877
4878 /* see if there are any run-time code blocks in the pattern.
4879  * False positives are allowed */
4880
4881 static bool
4882 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4883                     U32 pm_flags, char *pat, STRLEN plen)
4884 {
4885     int n = 0;
4886     STRLEN s;
4887
4888     /* avoid infinitely recursing when we recompile the pattern parcelled up
4889      * as qr'...'. A single constant qr// string can't have have any
4890      * run-time component in it, and thus, no runtime code. (A non-qr
4891      * string, however, can, e.g. $x =~ '(?{})') */
4892     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4893         return 0;
4894
4895     for (s = 0; s < plen; s++) {
4896         if (n < pRExC_state->num_code_blocks
4897             && s == pRExC_state->code_blocks[n].start)
4898         {
4899             s = pRExC_state->code_blocks[n].end;
4900             n++;
4901             continue;
4902         }
4903         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4904          * positives here */
4905         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
4906             (pat[s+2] == '{'
4907                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
4908         )
4909             return 1;
4910     }
4911     return 0;
4912 }
4913
4914 /* Handle run-time code blocks. We will already have compiled any direct
4915  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4916  * copy of it, but with any literal code blocks blanked out and
4917  * appropriate chars escaped; then feed it into
4918  *
4919  *    eval "qr'modified_pattern'"
4920  *
4921  * For example,
4922  *
4923  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4924  *
4925  * becomes
4926  *
4927  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4928  *
4929  * After eval_sv()-ing that, grab any new code blocks from the returned qr
4930  * and merge them with any code blocks of the original regexp.
4931  *
4932  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4933  * instead, just save the qr and return FALSE; this tells our caller that
4934  * the original pattern needs upgrading to utf8.
4935  */
4936
4937 static bool
4938 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4939     char *pat, STRLEN plen)
4940 {
4941     SV *qr;
4942
4943     GET_RE_DEBUG_FLAGS_DECL;
4944
4945     if (pRExC_state->runtime_code_qr) {
4946         /* this is the second time we've been called; this should
4947          * only happen if the main pattern got upgraded to utf8
4948          * during compilation; re-use the qr we compiled first time
4949          * round (which should be utf8 too)
4950          */
4951         qr = pRExC_state->runtime_code_qr;
4952         pRExC_state->runtime_code_qr = NULL;
4953         assert(RExC_utf8 && SvUTF8(qr));
4954     }
4955     else {
4956         int n = 0;
4957         STRLEN s;
4958         char *p, *newpat;
4959         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4960         SV *sv, *qr_ref;
4961         dSP;
4962
4963         /* determine how many extra chars we need for ' and \ escaping */
4964         for (s = 0; s < plen; s++) {
4965             if (pat[s] == '\'' || pat[s] == '\\')
4966                 newlen++;
4967         }
4968
4969         Newx(newpat, newlen, char);
4970         p = newpat;
4971         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4972
4973         for (s = 0; s < plen; s++) {
4974             if (n < pRExC_state->num_code_blocks
4975                 && s == pRExC_state->code_blocks[n].start)
4976             {
4977                 /* blank out literal code block */
4978                 assert(pat[s] == '(');
4979                 while (s <= pRExC_state->code_blocks[n].end) {
4980                     *p++ = '_';
4981                     s++;
4982                 }
4983                 s--;
4984                 n++;
4985                 continue;
4986             }
4987             if (pat[s] == '\'' || pat[s] == '\\')
4988                 *p++ = '\\';
4989             *p++ = pat[s];
4990         }
4991         *p++ = '\'';
4992         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4993             *p++ = 'x';
4994         *p++ = '\0';
4995         DEBUG_COMPILE_r({
4996             PerlIO_printf(Perl_debug_log,
4997                 "%sre-parsing pattern for runtime code:%s %s\n",
4998                 PL_colors[4],PL_colors[5],newpat);
4999         });
5000
5001         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5002         Safefree(newpat);
5003
5004         ENTER;
5005         SAVETMPS;
5006         save_re_context();
5007         PUSHSTACKi(PERLSI_REQUIRE);
5008         /* this causes the toker to collapse \\ into \ when parsing
5009          * qr''; normally only q'' does this. It also alters hints
5010          * handling */
5011         PL_reg_state.re_reparsing = TRUE;
5012         eval_sv(sv, G_SCALAR);
5013         SvREFCNT_dec_NN(sv);
5014         SPAGAIN;
5015         qr_ref = POPs;
5016         PUTBACK;
5017         {
5018             SV * const errsv = ERRSV;
5019             if (SvTRUE_NN(errsv))
5020             {
5021                 Safefree(pRExC_state->code_blocks);
5022                 /* use croak_sv ? */
5023                 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5024             }
5025         }
5026         assert(SvROK(qr_ref));
5027         qr = SvRV(qr_ref);
5028         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5029         /* the leaving below frees the tmp qr_ref.
5030          * Give qr a life of its own */
5031         SvREFCNT_inc(qr);
5032         POPSTACK;
5033         FREETMPS;
5034         LEAVE;
5035
5036     }
5037
5038     if (!RExC_utf8 && SvUTF8(qr)) {
5039         /* first time through; the pattern got upgraded; save the
5040          * qr for the next time through */
5041         assert(!pRExC_state->runtime_code_qr);
5042         pRExC_state->runtime_code_qr = qr;
5043         return 0;
5044     }
5045
5046
5047     /* extract any code blocks within the returned qr//  */
5048
5049
5050     /* merge the main (r1) and run-time (r2) code blocks into one */
5051     {
5052         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5053         struct reg_code_block *new_block, *dst;
5054         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5055         int i1 = 0, i2 = 0;
5056
5057         if (!r2->num_code_blocks) /* we guessed wrong */
5058         {
5059             SvREFCNT_dec_NN(qr);
5060             return 1;
5061         }
5062
5063         Newx(new_block,
5064             r1->num_code_blocks + r2->num_code_blocks,
5065             struct reg_code_block);
5066         dst = new_block;
5067
5068         while (    i1 < r1->num_code_blocks
5069                 || i2 < r2->num_code_blocks)
5070         {
5071             struct reg_code_block *src;
5072             bool is_qr = 0;
5073
5074             if (i1 == r1->num_code_blocks) {
5075                 src = &r2->code_blocks[i2++];
5076                 is_qr = 1;
5077             }
5078             else if (i2 == r2->num_code_blocks)
5079                 src = &r1->code_blocks[i1++];
5080             else if (  r1->code_blocks[i1].start
5081                      < r2->code_blocks[i2].start)
5082             {
5083                 src = &r1->code_blocks[i1++];
5084                 assert(src->end < r2->code_blocks[i2].start);
5085             }
5086             else {
5087                 assert(  r1->code_blocks[i1].start
5088                        > r2->code_blocks[i2].start);
5089                 src = &r2->code_blocks[i2++];
5090                 is_qr = 1;
5091                 assert(src->end < r1->code_blocks[i1].start);
5092             }
5093
5094             assert(pat[src->start] == '(');
5095             assert(pat[src->end]   == ')');
5096             dst->start      = src->start;
5097             dst->end        = src->end;
5098             dst->block      = src->block;
5099             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5100                                     : src->src_regex;
5101             dst++;
5102         }
5103         r1->num_code_blocks += r2->num_code_blocks;
5104         Safefree(r1->code_blocks);
5105         r1->code_blocks = new_block;
5106     }
5107
5108     SvREFCNT_dec_NN(qr);
5109     return 1;
5110 }
5111
5112
5113 STATIC bool
5114 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5115 {
5116     /* This is the common code for setting up the floating and fixed length
5117      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5118      * as to whether succeeded or not */
5119
5120     I32 t,ml;
5121
5122     if (! (longest_length
5123            || (eol /* Can't have SEOL and MULTI */
5124                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5125           )
5126             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5127         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5128     {
5129         return FALSE;
5130     }
5131
5132     /* copy the information about the longest from the reg_scan_data
5133         over to the program. */
5134     if (SvUTF8(sv_longest)) {
5135         *rx_utf8 = sv_longest;
5136         *rx_substr = NULL;
5137     } else {
5138         *rx_substr = sv_longest;
5139         *rx_utf8 = NULL;
5140     }
5141     /* end_shift is how many chars that must be matched that
5142         follow this item. We calculate it ahead of time as once the
5143         lookbehind offset is added in we lose the ability to correctly
5144         calculate it.*/
5145     ml = minlen ? *(minlen) : (I32)longest_length;
5146     *rx_end_shift = ml - offset
5147         - longest_length + (SvTAIL(sv_longest) != 0)
5148         + lookbehind;
5149
5150     t = (eol/* Can't have SEOL and MULTI */
5151          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5152     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5153
5154     return TRUE;
5155 }
5156
5157 /*
5158  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5159  * regular expression into internal code.
5160  * The pattern may be passed either as:
5161  *    a list of SVs (patternp plus pat_count)
5162  *    a list of OPs (expr)
5163  * If both are passed, the SV list is used, but the OP list indicates
5164  * which SVs are actually pre-compiled code blocks
5165  *
5166  * The SVs in the list have magic and qr overloading applied to them (and
5167  * the list may be modified in-place with replacement SVs in the latter
5168  * case).
5169  *
5170  * If the pattern hasn't changed from old_re, then old_re will be
5171  * returned.
5172  *
5173  * eng is the current engine. If that engine has an op_comp method, then
5174  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5175  * do the initial concatenation of arguments and pass on to the external
5176  * engine.
5177  *
5178  * If is_bare_re is not null, set it to a boolean indicating whether the
5179  * arg list reduced (after overloading) to a single bare regex which has
5180  * been returned (i.e. /$qr/).
5181  *
5182  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5183  *
5184  * pm_flags contains the PMf_* flags, typically based on those from the
5185  * pm_flags field of the related PMOP. Currently we're only interested in
5186  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5187  *
5188  * We can't allocate space until we know how big the compiled form will be,
5189  * but we can't compile it (and thus know how big it is) until we've got a
5190  * place to put the code.  So we cheat:  we compile it twice, once with code
5191  * generation turned off and size counting turned on, and once "for real".
5192  * This also means that we don't allocate space until we are sure that the
5193  * thing really will compile successfully, and we never have to move the
5194  * code and thus invalidate pointers into it.  (Note that it has to be in
5195  * one piece because free() must be able to free it all.) [NB: not true in perl]
5196  *
5197  * Beware that the optimization-preparation code in here knows about some
5198  * of the structure of the compiled regexp.  [I'll say.]
5199  */
5200
5201 REGEXP *
5202 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5203                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5204                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5205 {
5206     dVAR;
5207     REGEXP *rx;
5208     struct regexp *r;
5209     regexp_internal *ri;
5210     STRLEN plen;
5211     char  * VOL exp;
5212     char* xend;
5213     regnode *scan;
5214     I32 flags;
5215     I32 minlen = 0;
5216     U32 rx_flags;
5217     SV * VOL pat;
5218     SV * VOL code_blocksv = NULL;
5219
5220     /* these are all flags - maybe they should be turned
5221      * into a single int with different bit masks */
5222     I32 sawlookahead = 0;
5223     I32 sawplus = 0;
5224     I32 sawopen = 0;
5225     bool used_setjump = FALSE;
5226     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5227     bool code_is_utf8 = 0;
5228     bool VOL recompile = 0;
5229     bool runtime_code = 0;
5230     U8 jump_ret = 0;
5231     dJMPENV;
5232     scan_data_t data;
5233     RExC_state_t RExC_state;
5234     RExC_state_t * const pRExC_state = &RExC_state;
5235 #ifdef TRIE_STUDY_OPT
5236     int restudied;
5237     RExC_state_t copyRExC_state;
5238 #endif
5239     GET_RE_DEBUG_FLAGS_DECL;
5240
5241     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5242
5243     DEBUG_r(if (!PL_colorset) reginitcolors());
5244
5245 #ifndef PERL_IN_XSUB_RE
5246     /* Initialize these here instead of as-needed, as is quick and avoids
5247      * having to test them each time otherwise */
5248     if (! PL_AboveLatin1) {
5249         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5250         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5251         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5252
5253         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5254                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5255         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5256                                 = _new_invlist_C_array(PosixAlnum_invlist);
5257
5258         PL_L1Posix_ptrs[_CC_ALPHA]
5259                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5260         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5261
5262         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5263         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5264
5265         /* Cased is the same as Alpha in the ASCII range */
5266         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5267         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5268
5269         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5270         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5271
5272         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5273         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5274
5275         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5276         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5277
5278         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5279         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5280
5281         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5282         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5283
5284         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5285         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5286
5287         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5288         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5289         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5290         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5291
5292         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5293         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5294
5295         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5296
5297         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5298         PL_L1Posix_ptrs[_CC_WORDCHAR]
5299                                 = _new_invlist_C_array(L1PosixWord_invlist);
5300
5301         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5302         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5303
5304         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5305     }
5306 #endif
5307
5308     pRExC_state->code_blocks = NULL;
5309     pRExC_state->num_code_blocks = 0;
5310
5311     if (is_bare_re)
5312         *is_bare_re = FALSE;
5313
5314     if (expr && (expr->op_type == OP_LIST ||
5315                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5316
5317         /* is the source UTF8, and how many code blocks are there? */
5318         OP *o;
5319         int ncode = 0;
5320
5321         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5322             if (o->op_type == OP_CONST) {
5323                 /* skip if we have SVs as well as OPs. In this case,
5324                  * a) we decide utf8 based on SVs not OPs;
5325                  * b) the current pad may not match that which the ops
5326                  *    were compiled in, so, so on threaded builds,
5327                  *    cSVOPo_sv would look in the wrong pad */
5328                 if (!pat_count && SvUTF8(cSVOPo_sv))
5329                     code_is_utf8 = 1;
5330             }
5331             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5332                 /* count of DO blocks */
5333                 ncode++;
5334         }
5335         if (ncode) {
5336             pRExC_state->num_code_blocks = ncode;
5337             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5338         }
5339     }
5340
5341     if (pat_count) {
5342         /* handle a list of SVs */
5343
5344         SV **svp;
5345
5346         /* apply magic and RE overloading to each arg */
5347         for (svp = patternp; svp < patternp + pat_count; svp++) {
5348             SV *rx = *svp;
5349             SvGETMAGIC(rx);
5350             if (SvROK(rx) && SvAMAGIC(rx)) {
5351                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5352                 if (sv) {
5353                     if (SvROK(sv))
5354                         sv = SvRV(sv);
5355                     if (SvTYPE(sv) != SVt_REGEXP)
5356                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5357                     *svp = sv;
5358                 }
5359             }
5360         }
5361
5362         if (pat_count > 1) {
5363             /* concat multiple args and find any code block indexes */
5364
5365             OP *o = NULL;
5366             int n = 0;
5367             bool utf8 = 0;
5368             STRLEN orig_patlen = 0;
5369
5370             if (pRExC_state->num_code_blocks) {
5371                 o = cLISTOPx(expr)->op_first;
5372                 assert(   o->op_type == OP_PUSHMARK
5373                        || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5374                        || o->op_type == OP_PADRANGE);
5375                 o = o->op_sibling;
5376             }
5377
5378             pat = newSVpvn("", 0);
5379             SAVEFREESV(pat);
5380
5381             /* determine if the pattern is going to be utf8 (needed
5382              * in advance to align code block indices correctly).
5383              * XXX This could fail to be detected for an arg with
5384              * overloading but not concat overloading; but the main effect
5385              * in this obscure case is to need a 'use re eval' for a
5386              * literal code block */
5387             for (svp = patternp; svp < patternp + pat_count; svp++) {
5388                 if (SvUTF8(*svp))
5389                     utf8 = 1;
5390             }
5391             if (utf8)
5392                 SvUTF8_on(pat);
5393
5394             for (svp = patternp; svp < patternp + pat_count; svp++) {
5395                 SV *sv, *msv = *svp;
5396                 SV *rx;
5397                 bool code = 0;
5398                 /* we make the assumption here that each op in the list of
5399                  * op_siblings maps to one SV pushed onto the stack,
5400                  * except for code blocks, with have both an OP_NULL and
5401                  * and OP_CONST.
5402                  * This allows us to match up the list of SVs against the
5403                  * list of OPs to find the next code block.
5404                  *
5405                  * Note that       PUSHMARK PADSV PADSV ..
5406                  * is optimised to
5407                  *                 PADRANGE NULL  NULL  ..
5408                  * so the alignment still works. */
5409                 if (o) {
5410                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5411                         assert(n < pRExC_state->num_code_blocks);
5412                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5413                         pRExC_state->code_blocks[n].block = o;
5414                         pRExC_state->code_blocks[n].src_regex = NULL;
5415                         n++;
5416                         code = 1;
5417                         o = o->op_sibling; /* skip CONST */
5418                         assert(o);
5419                     }
5420                     o = o->op_sibling;;
5421                 }
5422
5423                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5424                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5425                 {
5426                     sv_setsv(pat, sv);
5427                     /* overloading involved: all bets are off over literal
5428                      * code. Pretend we haven't seen it */
5429                     pRExC_state->num_code_blocks -= n;
5430                     n = 0;
5431                     rx = NULL;
5432
5433                 }
5434                 else  {
5435                     while (SvAMAGIC(msv)
5436                             && (sv = AMG_CALLunary(msv, string_amg))
5437                             && sv != msv
5438                             &&  !(   SvROK(msv)
5439                                   && SvROK(sv)
5440                                   && SvRV(msv) == SvRV(sv))
5441                     ) {
5442                         msv = sv;
5443                         SvGETMAGIC(msv);
5444                     }
5445                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5446                         msv = SvRV(msv);
5447                     orig_patlen = SvCUR(pat);
5448                     sv_catsv_nomg(pat, msv);
5449                     rx = msv;
5450                     if (code)
5451                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5452                 }
5453
5454                 /* extract any code blocks within any embedded qr//'s */
5455                 if (rx && SvTYPE(rx) == SVt_REGEXP
5456                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5457                 {
5458
5459                     RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5460                     if (ri->num_code_blocks) {
5461                         int i;
5462                         /* the presence of an embedded qr// with code means
5463                          * we should always recompile: the text of the
5464                          * qr// may not have changed, but it may be a
5465                          * different closure than last time */
5466                         recompile = 1;
5467                         Renew(pRExC_state->code_blocks,
5468                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5469                             struct reg_code_block);
5470                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5471                         for (i=0; i < ri->num_code_blocks; i++) {
5472                             struct reg_code_block *src, *dst;
5473                             STRLEN offset =  orig_patlen
5474                                 + ReANY((REGEXP *)rx)->pre_prefix;
5475                             assert(n < pRExC_state->num_code_blocks);
5476                             src = &ri->code_blocks[i];
5477                             dst = &pRExC_state->code_blocks[n];
5478                             dst->start      = src->start + offset;
5479                             dst->end        = src->end   + offset;
5480                             dst->block      = src->block;
5481                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5482                                                     src->src_regex
5483                                                         ? src->src_regex
5484                                                         : (REGEXP*)rx);
5485                             n++;
5486                         }
5487                     }
5488                 }
5489             }
5490             SvSETMAGIC(pat);
5491         }
5492         else {
5493             SV *sv;
5494             pat = *patternp;
5495             while (SvAMAGIC(pat)
5496                     && (sv = AMG_CALLunary(pat, string_amg))
5497                     && sv != pat)
5498             {
5499                 pat = sv;
5500                 SvGETMAGIC(pat);
5501             }
5502         }
5503
5504         /* handle bare regex: foo =~ $re */
5505         {
5506             SV *re = pat;
5507             if (SvROK(re))
5508                 re = SvRV(re);
5509             if (SvTYPE(re) == SVt_REGEXP) {
5510                 if (is_bare_re)
5511                     *is_bare_re = TRUE;
5512                 SvREFCNT_inc(re);
5513                 Safefree(pRExC_state->code_blocks);
5514                 return (REGEXP*)re;
5515             }
5516         }
5517     }
5518     else {
5519         /* not a list of SVs, so must be a list of OPs */
5520         assert(expr);
5521         if (expr->op_type == OP_LIST) {
5522             int i = -1;
5523             bool is_code = 0;
5524             OP *o;
5525
5526             pat = newSVpvn("", 0);
5527             SAVEFREESV(pat);
5528             if (code_is_utf8)
5529                 SvUTF8_on(pat);
5530
5531             /* given a list of CONSTs and DO blocks in expr, append all
5532              * the CONSTs to pat, and record the start and end of each
5533              * code block in code_blocks[] (each DO{} op is followed by an
5534              * OP_CONST containing the corresponding literal '(?{...})
5535              * text)
5536              */
5537             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5538                 if (o->op_type == OP_CONST) {
5539                     sv_catsv(pat, cSVOPo_sv);
5540                     if (is_code) {
5541                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5542                         is_code = 0;
5543                     }
5544                 }
5545                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5546                     assert(i+1 < pRExC_state->num_code_blocks);
5547                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5548                     pRExC_state->code_blocks[i].block = o;
5549                     pRExC_state->code_blocks[i].src_regex = NULL;
5550                     is_code = 1;
5551                 }
5552             }
5553         }
5554         else {
5555             assert(expr->op_type == OP_CONST);
5556             pat = cSVOPx_sv(expr);
5557         }
5558     }
5559
5560     exp = SvPV_nomg(pat, plen);
5561
5562     if (!eng->op_comp) {
5563         if ((SvUTF8(pat) && IN_BYTES)
5564                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5565         {
5566             /* make a temporary copy; either to convert to bytes,
5567              * or to avoid repeating get-magic / overloaded stringify */
5568             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5569                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5570         }
5571         Safefree(pRExC_state->code_blocks);
5572         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5573     }
5574
5575     /* ignore the utf8ness if the pattern is 0 length */
5576     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5577     RExC_uni_semantics = 0;
5578     RExC_contains_locale = 0;
5579     pRExC_state->runtime_code_qr = NULL;
5580
5581     /****************** LONG JUMP TARGET HERE***********************/
5582     /* Longjmp back to here if have to switch in midstream to utf8 */
5583     if (! RExC_orig_utf8) {
5584         JMPENV_PUSH(jump_ret);
5585         used_setjump = TRUE;
5586     }
5587
5588     if (jump_ret == 0) {    /* First time through */
5589         xend = exp + plen;
5590
5591         DEBUG_COMPILE_r({
5592             SV *dsv= sv_newmortal();
5593             RE_PV_QUOTED_DECL(s, RExC_utf8,
5594                 dsv, exp, plen, 60);
5595             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5596                            PL_colors[4],PL_colors[5],s);
5597         });
5598     }
5599     else {  /* longjumped back */
5600         U8 *src, *dst;
5601         int n=0;
5602         STRLEN s = 0, d = 0;
5603         bool do_end = 0;
5604
5605         /* If the cause for the longjmp was other than changing to utf8, pop
5606          * our own setjmp, and longjmp to the correct handler */
5607         if (jump_ret != UTF8_LONGJMP) {
5608             JMPENV_POP;
5609             JMPENV_JUMP(jump_ret);
5610         }
5611
5612         GET_RE_DEBUG_FLAGS;
5613
5614         /* It's possible to write a regexp in ascii that represents Unicode
5615         codepoints outside of the byte range, such as via \x{100}. If we
5616         detect such a sequence we have to convert the entire pattern to utf8
5617         and then recompile, as our sizing calculation will have been based
5618         on 1 byte == 1 character, but we will need to use utf8 to encode
5619         at least some part of the pattern, and therefore must convert the whole
5620         thing.
5621         -- dmq */
5622         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5623             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5624
5625         /* upgrade pattern to UTF8, and if there are code blocks,
5626          * recalculate the indices.
5627          * This is essentially an unrolled Perl_bytes_to_utf8() */
5628
5629         src = (U8*)SvPV_nomg(pat, plen);
5630         Newx(dst, plen * 2 + 1, U8);
5631
5632         while (s < plen) {
5633             const UV uv = NATIVE_TO_ASCII(src[s]);
5634             if (UNI_IS_INVARIANT(uv))
5635                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5636             else {
5637                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5638                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5639             }
5640             if (n < pRExC_state->num_code_blocks) {
5641                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5642                     pRExC_state->code_blocks[n].start = d;
5643                     assert(dst[d] == '(');
5644                     do_end = 1;
5645                 }
5646                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5647                     pRExC_state->code_blocks[n].end = d;
5648                     assert(dst[d] == ')');
5649                     do_end = 0;
5650                     n++;
5651                 }
5652             }
5653             s++;
5654             d++;
5655         }
5656         dst[d] = '\0';
5657         plen = d;
5658         exp = (char*) dst;
5659         xend = exp + plen;
5660         SAVEFREEPV(exp);
5661         RExC_orig_utf8 = RExC_utf8 = 1;
5662     }
5663
5664     /* return old regex if pattern hasn't changed */
5665
5666     if (   old_re
5667         && !recompile
5668         && !!RX_UTF8(old_re) == !!RExC_utf8
5669         && RX_PRECOMP(old_re)
5670         && RX_PRELEN(old_re) == plen
5671         && memEQ(RX_PRECOMP(old_re), exp, plen))
5672     {
5673         /* with runtime code, always recompile */
5674         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5675                                             exp, plen);
5676         if (!runtime_code) {
5677             if (used_setjump) {
5678                 JMPENV_POP;
5679             }
5680             Safefree(pRExC_state->code_blocks);
5681             return old_re;
5682         }
5683     }
5684     else if ((pm_flags & PMf_USE_RE_EVAL)
5685                 /* this second condition covers the non-regex literal case,
5686                  * i.e.  $foo =~ '(?{})'. */
5687                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5688                     && (PL_hints & HINT_RE_EVAL))
5689     )
5690         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5691                             exp, plen);
5692
5693 #ifdef TRIE_STUDY_OPT
5694     restudied = 0;
5695 #endif
5696
5697     rx_flags = orig_rx_flags;
5698
5699     if (initial_charset == REGEX_LOCALE_CHARSET) {
5700         RExC_contains_locale = 1;
5701     }
5702     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5703
5704         /* Set to use unicode semantics if the pattern is in utf8 and has the
5705          * 'depends' charset specified, as it means unicode when utf8  */
5706         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5707     }
5708
5709     RExC_precomp = exp;
5710     RExC_flags = rx_flags;
5711     RExC_pm_flags = pm_flags;
5712
5713     if (runtime_code) {
5714         if (TAINTING_get && TAINT_get)
5715             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5716
5717         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5718             /* whoops, we have a non-utf8 pattern, whilst run-time code
5719              * got compiled as utf8. Try again with a utf8 pattern */
5720              JMPENV_JUMP(UTF8_LONGJMP);
5721         }
5722     }
5723     assert(!pRExC_state->runtime_code_qr);
5724
5725     RExC_sawback = 0;
5726
5727     RExC_seen = 0;
5728     RExC_in_lookbehind = 0;
5729     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5730     RExC_extralen = 0;
5731     RExC_override_recoding = 0;
5732     RExC_in_multi_char_class = 0;
5733
5734     /* First pass: determine size, legality. */
5735     RExC_parse = exp;
5736     RExC_start = exp;
5737     RExC_end = xend;
5738     RExC_naughty = 0;
5739     RExC_npar = 1;
5740     RExC_nestroot = 0;
5741     RExC_size = 0L;
5742     RExC_emit = &PL_regdummy;
5743     RExC_whilem_seen = 0;
5744     RExC_open_parens = NULL;
5745     RExC_close_parens = NULL;
5746     RExC_opend = NULL;
5747     RExC_paren_names = NULL;
5748 #ifdef DEBUGGING
5749     RExC_paren_name_list = NULL;
5750 #endif
5751     RExC_recurse = NULL;
5752     RExC_recurse_count = 0;
5753     pRExC_state->code_index = 0;
5754
5755 #if 0 /* REGC() is (currently) a NOP at the first pass.
5756        * Clever compilers notice this and complain. --jhi */
5757     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5758 #endif
5759     DEBUG_PARSE_r(
5760         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5761         RExC_lastnum=0;
5762         RExC_lastparse=NULL;
5763     );
5764     /* reg may croak on us, not giving us a chance to free
5765        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5766        need it to survive as long as the regexp (qr/(?{})/).
5767        We must check that code_blocksv is not already set, because we may
5768        have longjmped back. */
5769     if (pRExC_state->code_blocks && !code_blocksv) {
5770         code_blocksv = newSV_type(SVt_PV);
5771         SAVEFREESV(code_blocksv);
5772         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5773         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5774     }
5775     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5776         if (flags & RESTART_UTF8)
5777             JMPENV_JUMP(UTF8_LONGJMP);
5778         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
5779     }
5780     if (code_blocksv)
5781         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5782
5783     /* Here, finished first pass.  Get rid of any added setjmp */
5784     if (used_setjump) {
5785         JMPENV_POP;
5786     }
5787
5788     DEBUG_PARSE_r({
5789         PerlIO_printf(Perl_debug_log,
5790             "Required size %"IVdf" nodes\n"
5791             "Starting second pass (creation)\n",
5792             (IV)RExC_size);
5793         RExC_lastnum=0;
5794         RExC_lastparse=NULL;
5795     });
5796
5797     /* The first pass could have found things that force Unicode semantics */
5798     if ((RExC_utf8 || RExC_uni_semantics)
5799          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5800     {
5801         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5802     }
5803
5804     /* Small enough for pointer-storage convention?
5805        If extralen==0, this means that we will not need long jumps. */
5806     if (RExC_size >= 0x10000L && RExC_extralen)
5807         RExC_size += RExC_extralen;
5808     else
5809         RExC_extralen = 0;
5810     if (RExC_whilem_seen > 15)
5811         RExC_whilem_seen = 15;
5812
5813     /* Allocate space and zero-initialize. Note, the two step process
5814        of zeroing when in debug mode, thus anything assigned has to
5815        happen after that */
5816     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5817     r = ReANY(rx);
5818     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5819          char, regexp_internal);
5820     if ( r == NULL || ri == NULL )
5821         FAIL("Regexp out of space");
5822 #ifdef DEBUGGING
5823     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5824     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5825 #else
5826     /* bulk initialize base fields with 0. */
5827     Zero(ri, sizeof(regexp_internal), char);
5828 #endif
5829
5830     /* non-zero initialization begins here */
5831     RXi_SET( r, ri );
5832     r->engine= eng;
5833     r->extflags = rx_flags;
5834     if (pm_flags & PMf_IS_QR) {
5835         ri->code_blocks = pRExC_state->code_blocks;
5836         ri->num_code_blocks = pRExC_state->num_code_blocks;
5837     }
5838     else
5839     {
5840         int n;
5841         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5842             if (pRExC_state->code_blocks[n].src_regex)
5843                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5844         SAVEFREEPV(pRExC_state->code_blocks);
5845     }
5846
5847     {
5848         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5849         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5850
5851         /* The caret is output if there are any defaults: if not all the STD
5852          * flags are set, or if no character set specifier is needed */
5853         bool has_default =
5854                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5855                     || ! has_charset);
5856         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5857         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5858                             >> RXf_PMf_STD_PMMOD_SHIFT);
5859         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5860         char *p;
5861         /* Allocate for the worst case, which is all the std flags are turned
5862          * on.  If more precision is desired, we could do a population count of
5863          * the flags set.  This could be done with a small lookup table, or by
5864          * shifting, masking and adding, or even, when available, assembly
5865          * language for a machine-language population count.
5866          * We never output a minus, as all those are defaults, so are
5867          * covered by the caret */
5868         const STRLEN wraplen = plen + has_p + has_runon
5869             + has_default       /* If needs a caret */
5870
5871                 /* If needs a character set specifier */
5872             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5873             + (sizeof(STD_PAT_MODS) - 1)
5874             + (sizeof("(?:)") - 1);
5875
5876         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5877         r->xpv_len_u.xpvlenu_pv = p;
5878         if (RExC_utf8)
5879             SvFLAGS(rx) |= SVf_UTF8;
5880         *p++='('; *p++='?';
5881
5882         /* If a default, cover it using the caret */
5883         if (has_default) {
5884             *p++= DEFAULT_PAT_MOD;
5885         }
5886         if (has_charset) {
5887             STRLEN len;
5888             const char* const name = get_regex_charset_name(r->extflags, &len);
5889             Copy(name, p, len, char);
5890             p += len;
5891         }
5892         if (has_p)
5893             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5894         {
5895             char ch;
5896             while((ch = *fptr++)) {
5897                 if(reganch & 1)
5898                     *p++ = ch;
5899                 reganch >>= 1;
5900             }
5901         }
5902
5903         *p++ = ':';
5904         Copy(RExC_precomp, p, plen, char);
5905         assert ((RX_WRAPPED(rx) - p) < 16);
5906         r->pre_prefix = p - RX_WRAPPED(rx);
5907         p += plen;
5908         if (has_runon)
5909             *p++ = '\n';
5910         *p++ = ')';
5911         *p = 0;
5912         SvCUR_set(rx, p - RX_WRAPPED(rx));
5913     }
5914
5915     r->intflags = 0;
5916     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5917
5918     if (RExC_seen & REG_SEEN_RECURSE) {
5919         Newxz(RExC_open_parens, RExC_npar,regnode *);
5920         SAVEFREEPV(RExC_open_parens);
5921         Newxz(RExC_close_parens,RExC_npar,regnode *);
5922         SAVEFREEPV(RExC_close_parens);
5923     }
5924
5925     /* Useful during FAIL. */
5926 #ifdef RE_TRACK_PATTERN_OFFSETS
5927     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5928     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5929                           "%s %"UVuf" bytes for offset annotations.\n",
5930                           ri->u.offsets ? "Got" : "Couldn't get",
5931                           (UV)((2*RExC_size+1) * sizeof(U32))));
5932 #endif
5933     SetProgLen(ri,RExC_size);
5934     RExC_rx_sv = rx;
5935     RExC_rx = r;
5936     RExC_rxi = ri;
5937
5938     /* Second pass: emit code. */
5939     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5940     RExC_pm_flags = pm_flags;
5941     RExC_parse = exp;
5942     RExC_end = xend;
5943     RExC_naughty = 0;
5944     RExC_npar = 1;
5945     RExC_emit_start = ri->program;
5946     RExC_emit = ri->program;
5947     RExC_emit_bound = ri->program + RExC_size + 1;
5948     pRExC_state->code_index = 0;
5949
5950     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5951     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5952         ReREFCNT_dec(rx);
5953         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
5954     }
5955     /* XXXX To minimize changes to RE engine we always allocate
5956        3-units-long substrs field. */
5957     Newx(r->substrs, 1, struct reg_substr_data);
5958     if (RExC_recurse_count) {
5959         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5960         SAVEFREEPV(RExC_recurse);
5961     }
5962
5963 reStudy:
5964     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5965     Zero(r->substrs, 1, struct reg_substr_data);
5966
5967 #ifdef TRIE_STUDY_OPT
5968     if (!restudied) {
5969         StructCopy(&zero_scan_data, &data, scan_data_t);
5970         copyRExC_state = RExC_state;
5971     } else {
5972         U32 seen=RExC_seen;
5973         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5974
5975         RExC_state = copyRExC_state;
5976         if (seen & REG_TOP_LEVEL_BRANCHES)
5977             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5978         else
5979             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5980         StructCopy(&zero_scan_data, &data, scan_data_t);
5981     }
5982 #else
5983     StructCopy(&zero_scan_data, &data, scan_data_t);
5984 #endif
5985
5986     /* Dig out information for optimizations. */
5987     r->extflags = RExC_flags; /* was pm_op */
5988     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5989
5990     if (UTF)
5991         SvUTF8_on(rx);  /* Unicode in it? */
5992     ri->regstclass = NULL;
5993     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5994         r->intflags |= PREGf_NAUGHTY;
5995     scan = ri->program + 1;             /* First BRANCH. */
5996
5997     /* testing for BRANCH here tells us whether there is "must appear"
5998        data in the pattern. If there is then we can use it for optimisations */
5999     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6000         I32 fake;
6001         STRLEN longest_float_length, longest_fixed_length;
6002         struct regnode_charclass_class ch_class; /* pointed to by data */
6003         int stclass_flag;
6004         I32 last_close = 0; /* pointed to by data */
6005         regnode *first= scan;
6006         regnode *first_next= regnext(first);
6007         /*
6008          * Skip introductions and multiplicators >= 1
6009          * so that we can extract the 'meat' of the pattern that must
6010          * match in the large if() sequence following.
6011          * NOTE that EXACT is NOT covered here, as it is normally
6012          * picked up by the optimiser separately.
6013          *
6014          * This is unfortunate as the optimiser isnt handling lookahead
6015          * properly currently.
6016          *
6017          */
6018         while ((OP(first) == OPEN && (sawopen = 1)) ||
6019                /* An OR of *one* alternative - should not happen now. */
6020             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6021             /* for now we can't handle lookbehind IFMATCH*/
6022             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6023             (OP(first) == PLUS) ||
6024             (OP(first) == MINMOD) ||
6025                /* An {n,m} with n>0 */
6026             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6027             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6028         {
6029                 /*
6030                  * the only op that could be a regnode is PLUS, all the rest
6031                  * will be regnode_1 or regnode_2.
6032                  *
6033                  */
6034                 if (OP(first) == PLUS)
6035                     sawplus = 1;
6036                 else
6037                     first += regarglen[OP(first)];
6038
6039                 first = NEXTOPER(first);
6040                 first_next= regnext(first);
6041         }
6042
6043         /* Starting-point info. */
6044       again:
6045         DEBUG_PEEP("first:",first,0);
6046         /* Ignore EXACT as we deal with it later. */
6047         if (PL_regkind[OP(first)] == EXACT) {
6048             if (OP(first) == EXACT)
6049                 NOOP;   /* Empty, get anchored substr later. */
6050             else
6051                 ri->regstclass = first;
6052         }
6053 #ifdef TRIE_STCLASS
6054         else if (PL_regkind[OP(first)] == TRIE &&
6055                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6056         {
6057             regnode *trie_op;
6058             /* this can happen only on restudy */
6059             if ( OP(first) == TRIE ) {
6060                 struct regnode_1 *trieop = (struct regnode_1 *)
6061                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6062                 StructCopy(first,trieop,struct regnode_1);
6063                 trie_op=(regnode *)trieop;
6064             } else {
6065                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6066                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6067                 StructCopy(first,trieop,struct regnode_charclass);
6068                 trie_op=(regnode *)trieop;
6069             }
6070             OP(trie_op)+=2;
6071             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6072             ri->regstclass = trie_op;
6073         }
6074 #endif
6075         else if (REGNODE_SIMPLE(OP(first)))
6076             ri->regstclass = first;
6077         else if (PL_regkind[OP(first)] == BOUND ||
6078                  PL_regkind[OP(first)] == NBOUND)
6079             ri->regstclass = first;
6080         else if (PL_regkind[OP(first)] == BOL) {
6081             r->extflags |= (OP(first) == MBOL
6082                            ? RXf_ANCH_MBOL
6083                            : (OP(first) == SBOL
6084                               ? RXf_ANCH_SBOL
6085                               : RXf_ANCH_BOL));
6086             first = NEXTOPER(first);
6087             goto again;
6088         }
6089         else if (OP(first) == GPOS) {
6090             r->extflags |= RXf_ANCH_GPOS;
6091             first = NEXTOPER(first);
6092             goto again;
6093         }
6094         else if ((!sawopen || !RExC_sawback) &&
6095             (OP(first) == STAR &&
6096             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6097             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6098         {
6099             /* turn .* into ^.* with an implied $*=1 */
6100             const int type =
6101                 (OP(NEXTOPER(first)) == REG_ANY)
6102                     ? RXf_ANCH_MBOL
6103                     : RXf_ANCH_SBOL;
6104             r->extflags |= type;
6105             r->intflags |= PREGf_IMPLICIT;
6106             first = NEXTOPER(first);
6107             goto again;
6108         }
6109         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6110             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6111             /* x+ must match at the 1st pos of run of x's */
6112             r->intflags |= PREGf_SKIP;
6113
6114         /* Scan is after the zeroth branch, first is atomic matcher. */
6115 #ifdef TRIE_STUDY_OPT
6116         DEBUG_PARSE_r(
6117             if (!restudied)
6118                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6119                               (IV)(first - scan + 1))
6120         );
6121 #else
6122         DEBUG_PARSE_r(
6123             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6124                 (IV)(first - scan + 1))
6125         );
6126 #endif
6127
6128
6129         /*
6130         * If there's something expensive in the r.e., find the
6131         * longest literal string that must appear and make it the
6132         * regmust.  Resolve ties in favor of later strings, since
6133         * the regstart check works with the beginning of the r.e.
6134         * and avoiding duplication strengthens checking.  Not a
6135         * strong reason, but sufficient in the absence of others.
6136         * [Now we resolve ties in favor of the earlier string if
6137         * it happens that c_offset_min has been invalidated, since the
6138         * earlier string may buy us something the later one won't.]
6139         */
6140
6141         data.longest_fixed = newSVpvs("");
6142         data.longest_float = newSVpvs("");
6143         data.last_found = newSVpvs("");
6144         data.longest = &(data.longest_fixed);
6145         ENTER_with_name("study_chunk");
6146         SAVEFREESV(data.longest_fixed);
6147         SAVEFREESV(data.longest_float);
6148         SAVEFREESV(data.last_found);
6149         first = scan;
6150         if (!ri->regstclass) {
6151             cl_init(pRExC_state, &ch_class);
6152             data.start_class = &ch_class;
6153             stclass_flag = SCF_DO_STCLASS_AND;
6154         } else                          /* XXXX Check for BOUND? */
6155             stclass_flag = 0;
6156         data.last_closep = &last_close;
6157
6158         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6159             &data, -1, NULL, NULL,
6160             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6161
6162
6163         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6164
6165
6166         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6167              && data.last_start_min == 0 && data.last_end > 0
6168              && !RExC_seen_zerolen
6169              && !(RExC_seen & REG_SEEN_VERBARG)
6170              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6171             r->extflags |= RXf_CHECK_ALL;
6172         scan_commit(pRExC_state, &data,&minlen,0);
6173
6174         longest_float_length = CHR_SVLEN(data.longest_float);
6175
6176         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6177                    && data.offset_fixed == data.offset_float_min
6178                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6179             && S_setup_longest (aTHX_ pRExC_state,
6180                                     data.longest_float,
6181                                     &(r->float_utf8),
6182                                     &(r->float_substr),
6183                                     &(r->float_end_shift),
6184                                     data.lookbehind_float,
6185                                     data.offset_float_min,
6186                                     data.minlen_float,
6187                                     longest_float_length,
6188                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6189                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6190         {
6191             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6192             r->float_max_offset = data.offset_float_max;
6193             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6194                 r->float_max_offset -= data.lookbehind_float;
6195             SvREFCNT_inc_simple_void_NN(data.longest_float);
6196         }
6197         else {
6198             r->float_substr = r->float_utf8 = NULL;
6199             longest_float_length = 0;
6200         }
6201
6202         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6203
6204         if (S_setup_longest (aTHX_ pRExC_state,
6205                                 data.longest_fixed,
6206                                 &(r->anchored_utf8),
6207                                 &(r->anchored_substr),
6208                                 &(r->anchored_end_shift),
6209                                 data.lookbehind_fixed,
6210                                 data.offset_fixed,
6211                                 data.minlen_fixed,
6212                                 longest_fixed_length,
6213                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6214                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6215         {
6216             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6217             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6218         }
6219         else {
6220             r->anchored_substr = r->anchored_utf8 = NULL;
6221             longest_fixed_length = 0;
6222         }
6223         LEAVE_with_name("study_chunk");
6224
6225         if (ri->regstclass
6226             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6227             ri->regstclass = NULL;
6228
6229         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6230             && stclass_flag
6231             && ! TEST_SSC_EOS(data.start_class)
6232             && !cl_is_anything(data.start_class))
6233         {
6234             const U32 n = add_data(pRExC_state, 1, "f");
6235             OP(data.start_class) = ANYOF_SYNTHETIC;
6236
6237             Newx(RExC_rxi->data->data[n], 1,
6238                 struct regnode_charclass_class);
6239             StructCopy(data.start_class,
6240                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6241                        struct regnode_charclass_class);
6242             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6243             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6244             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6245                       regprop(r, sv, (regnode*)data.start_class);
6246                       PerlIO_printf(Perl_debug_log,
6247                                     "synthetic stclass \"%s\".\n",
6248                                     SvPVX_const(sv));});
6249         }
6250
6251         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6252         if (longest_fixed_length > longest_float_length) {
6253             r->check_end_shift = r->anchored_end_shift;
6254             r->check_substr = r->anchored_substr;
6255             r->check_utf8 = r->anchored_utf8;
6256             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6257             if (r->extflags & RXf_ANCH_SINGLE)
6258                 r->extflags |= RXf_NOSCAN;
6259         }
6260         else {
6261             r->check_end_shift = r->float_end_shift;
6262             r->check_substr = r->float_substr;
6263             r->check_utf8 = r->float_utf8;
6264             r->check_offset_min = r->float_min_offset;
6265             r->check_offset_max = r->float_max_offset;
6266         }
6267         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6268            This should be changed ASAP!  */
6269         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6270             r->extflags |= RXf_USE_INTUIT;
6271             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6272                 r->extflags |= RXf_INTUIT_TAIL;
6273         }
6274         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6275         if ( (STRLEN)minlen < longest_float_length )
6276             minlen= longest_float_length;
6277         if ( (STRLEN)minlen < longest_fixed_length )
6278             minlen= longest_fixed_length;
6279         */
6280     }
6281     else {
6282         /* Several toplevels. Best we can is to set minlen. */
6283         I32 fake;
6284         struct regnode_charclass_class ch_class;
6285         I32 last_close = 0;
6286
6287         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6288
6289         scan = ri->program + 1;
6290         cl_init(pRExC_state, &ch_class);
6291         data.start_class = &ch_class;
6292         data.last_closep = &last_close;
6293
6294
6295         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6296             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6297
6298         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6299
6300         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6301                 = r->float_substr = r->float_utf8 = NULL;
6302
6303         if (! TEST_SSC_EOS(data.start_class)
6304             && !cl_is_anything(data.start_class))
6305         {
6306             const U32 n = add_data(pRExC_state, 1, "f");
6307             OP(data.start_class) = ANYOF_SYNTHETIC;
6308
6309             Newx(RExC_rxi->data->data[n], 1,
6310                 struct regnode_charclass_class);
6311             StructCopy(data.start_class,
6312                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6313                        struct regnode_charclass_class);
6314             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6315             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6316             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6317                       regprop(r, sv, (regnode*)data.start_class);
6318                       PerlIO_printf(Perl_debug_log,
6319                                     "synthetic stclass \"%s\".\n",
6320                                     SvPVX_const(sv));});
6321         }
6322     }
6323
6324     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6325        the "real" pattern. */
6326     DEBUG_OPTIMISE_r({
6327         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6328                       (IV)minlen, (IV)r->minlen);
6329     });
6330     r->minlenret = minlen;
6331     if (r->minlen < minlen)
6332         r->minlen = minlen;
6333
6334     if (RExC_seen & REG_SEEN_GPOS)
6335         r->extflags |= RXf_GPOS_SEEN;
6336     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6337         r->extflags |= RXf_LOOKBEHIND_SEEN;
6338     if (pRExC_state->num_code_blocks)
6339         r->extflags |= RXf_EVAL_SEEN;
6340     if (RExC_seen & REG_SEEN_CANY)
6341         r->extflags |= RXf_CANY_SEEN;
6342     if (RExC_seen & REG_SEEN_VERBARG)
6343     {
6344         r->intflags |= PREGf_VERBARG_SEEN;
6345         r->extflags |= RXf_MODIFIES_VARS;
6346     }
6347     if (RExC_seen & REG_SEEN_CUTGROUP)
6348         r->intflags |= PREGf_CUTGROUP_SEEN;
6349     if (pm_flags & PMf_USE_RE_EVAL)
6350         r->intflags |= PREGf_USE_RE_EVAL;
6351     if (RExC_paren_names)
6352         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6353     else
6354         RXp_PAREN_NAMES(r) = NULL;
6355
6356 #ifdef STUPID_PATTERN_CHECKS
6357     if (RX_PRELEN(rx) == 0)
6358         r->extflags |= RXf_NULL;
6359     if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6360         r->extflags |= RXf_WHITE;
6361     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6362         r->extflags |= RXf_START_ONLY;
6363 #else
6364     {
6365         regnode *first = ri->program + 1;
6366         U8 fop = OP(first);
6367
6368         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6369             r->extflags |= RXf_NULL;
6370         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6371             r->extflags |= RXf_START_ONLY;
6372         else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6373                              && OP(regnext(first)) == END)
6374             r->extflags |= RXf_WHITE;
6375     }
6376 #endif
6377 #ifdef DEBUGGING
6378     if (RExC_paren_names) {
6379         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6380         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6381     } else
6382 #endif
6383         ri->name_list_idx = 0;
6384
6385     if (RExC_recurse_count) {
6386         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6387             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6388             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6389         }
6390     }
6391     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6392     /* assume we don't need to swap parens around before we match */
6393
6394     DEBUG_DUMP_r({
6395         PerlIO_printf(Perl_debug_log,"Final program:\n");
6396         regdump(r);
6397     });
6398 #ifdef RE_TRACK_PATTERN_OFFSETS
6399     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6400         const U32 len = ri->u.offsets[0];
6401         U32 i;
6402         GET_RE_DEBUG_FLAGS_DECL;
6403         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6404         for (i = 1; i <= len; i++) {
6405             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6406                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6407                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6408             }
6409         PerlIO_printf(Perl_debug_log, "\n");
6410     });
6411 #endif
6412
6413 #ifdef USE_ITHREADS
6414     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6415      * by setting the regexp SV to readonly-only instead. If the
6416      * pattern's been recompiled, the USEDness should remain. */
6417     if (old_re && SvREADONLY(old_re))
6418         SvREADONLY_on(rx);
6419 #endif
6420     return rx;
6421 }
6422
6423
6424 SV*
6425 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6426                     const U32 flags)
6427 {
6428     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6429
6430     PERL_UNUSED_ARG(value);
6431
6432     if (flags & RXapif_FETCH) {
6433         return reg_named_buff_fetch(rx, key, flags);
6434     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6435         Perl_croak_no_modify();
6436         return NULL;
6437     } else if (flags & RXapif_EXISTS) {
6438         return reg_named_buff_exists(rx, key, flags)
6439             ? &PL_sv_yes
6440             : &PL_sv_no;
6441     } else if (flags & RXapif_REGNAMES) {
6442         return reg_named_buff_all(rx, flags);
6443     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6444         return reg_named_buff_scalar(rx, flags);
6445     } else {
6446         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6447         return NULL;
6448     }
6449 }
6450
6451 SV*
6452 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6453                          const U32 flags)
6454 {
6455     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6456     PERL_UNUSED_ARG(lastkey);
6457
6458     if (flags & RXapif_FIRSTKEY)
6459         return reg_named_buff_firstkey(rx, flags);
6460     else if (flags & RXapif_NEXTKEY)
6461         return reg_named_buff_nextkey(rx, flags);
6462     else {
6463         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6464         return NULL;
6465     }
6466 }
6467
6468 SV*
6469 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6470                           const U32 flags)
6471 {
6472     AV *retarray = NULL;
6473     SV *ret;
6474     struct regexp *const rx = ReANY(r);
6475
6476     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6477
6478     if (flags & RXapif_ALL)
6479         retarray=newAV();
6480
6481     if (rx && RXp_PAREN_NAMES(rx)) {
6482         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6483         if (he_str) {
6484             IV i;
6485             SV* sv_dat=HeVAL(he_str);
6486             I32 *nums=(I32*)SvPVX(sv_dat);
6487             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6488                 if ((I32)(rx->nparens) >= nums[i]
6489                     && rx->offs[nums[i]].start != -1
6490                     && rx->offs[nums[i]].end != -1)
6491                 {
6492                     ret = newSVpvs("");
6493                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6494                     if (!retarray)
6495                         return ret;
6496                 } else {
6497                     if (retarray)
6498                         ret = newSVsv(&PL_sv_undef);
6499                 }
6500                 if (retarray)
6501                     av_push(retarray, ret);
6502             }
6503             if (retarray)
6504                 return newRV_noinc(MUTABLE_SV(retarray));
6505         }
6506     }
6507     return NULL;
6508 }
6509
6510 bool
6511 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6512                            const U32 flags)
6513 {
6514     struct regexp *const rx = ReANY(r);
6515
6516     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6517
6518     if (rx && RXp_PAREN_NAMES(rx)) {
6519         if (flags & RXapif_ALL) {
6520             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6521         } else {
6522             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6523             if (sv) {
6524                 SvREFCNT_dec_NN(sv);
6525                 return TRUE;
6526             } else {
6527                 return FALSE;
6528             }
6529         }
6530     } else {
6531         return FALSE;
6532     }
6533 }
6534
6535 SV*
6536 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6537 {
6538     struct regexp *const rx = ReANY(r);
6539
6540     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6541
6542     if ( rx && RXp_PAREN_NAMES(rx) ) {
6543         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6544
6545         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6546     } else {
6547         return FALSE;
6548     }
6549 }
6550
6551 SV*
6552 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6553 {
6554     struct regexp *const rx = ReANY(r);
6555     GET_RE_DEBUG_FLAGS_DECL;
6556
6557     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6558
6559     if (rx && RXp_PAREN_NAMES(rx)) {
6560         HV *hv = RXp_PAREN_NAMES(rx);
6561         HE *temphe;
6562         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6563             IV i;
6564             IV parno = 0;
6565             SV* sv_dat = HeVAL(temphe);
6566             I32 *nums = (I32*)SvPVX(sv_dat);
6567             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6568                 if ((I32)(rx->lastparen) >= nums[i] &&
6569                     rx->offs[nums[i]].start != -1 &&
6570                     rx->offs[nums[i]].end != -1)
6571                 {
6572                     parno = nums[i];
6573                     break;
6574                 }
6575             }
6576             if (parno || flags & RXapif_ALL) {
6577                 return newSVhek(HeKEY_hek(temphe));
6578             }
6579         }
6580     }
6581     return NULL;
6582 }
6583
6584 SV*
6585 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6586 {
6587     SV *ret;
6588     AV *av;
6589     I32 length;
6590     struct regexp *const rx = ReANY(r);
6591
6592     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6593
6594     if (rx && RXp_PAREN_NAMES(rx)) {
6595         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6596             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6597         } else if (flags & RXapif_ONE) {
6598             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6599             av = MUTABLE_AV(SvRV(ret));
6600             length = av_len(av);
6601             SvREFCNT_dec_NN(ret);
6602             return newSViv(length + 1);
6603         } else {
6604             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6605             return NULL;
6606         }
6607     }
6608     return &PL_sv_undef;
6609 }
6610
6611 SV*
6612 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6613 {
6614     struct regexp *const rx = ReANY(r);
6615     AV *av = newAV();
6616
6617     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6618
6619     if (rx && RXp_PAREN_NAMES(rx)) {
6620         HV *hv= RXp_PAREN_NAMES(rx);
6621         HE *temphe;
6622         (void)hv_iterinit(hv);
6623         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6624             IV i;
6625             IV parno = 0;
6626             SV* sv_dat = HeVAL(temphe);
6627             I32 *nums = (I32*)SvPVX(sv_dat);
6628             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6629                 if ((I32)(rx->lastparen) >= nums[i] &&
6630                     rx->offs[nums[i]].start != -1 &&
6631                     rx->offs[nums[i]].end != -1)
6632                 {
6633                     parno = nums[i];
6634                     break;
6635                 }
6636             }
6637             if (parno || flags & RXapif_ALL) {
6638                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6639             }
6640         }
6641     }
6642
6643     return newRV_noinc(MUTABLE_SV(av));
6644 }
6645
6646 void
6647 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6648                              SV * const sv)
6649 {
6650     struct regexp *const rx = ReANY(r);
6651     char *s = NULL;
6652     I32 i = 0;
6653     I32 s1, t1;
6654     I32 n = paren;
6655
6656     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6657
6658     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6659            || n == RX_BUFF_IDX_CARET_FULLMATCH
6660            || n == RX_BUFF_IDX_CARET_POSTMATCH
6661          )
6662          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6663     )
6664         goto ret_undef;
6665
6666     if (!rx->subbeg)
6667         goto ret_undef;
6668
6669     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6670         /* no need to distinguish between them any more */
6671         n = RX_BUFF_IDX_FULLMATCH;
6672
6673     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6674         && rx->offs[0].start != -1)
6675     {
6676         /* $`, ${^PREMATCH} */
6677         i = rx->offs[0].start;
6678         s = rx->subbeg;
6679     }
6680     else
6681     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6682         && rx->offs[0].end != -1)
6683     {
6684         /* $', ${^POSTMATCH} */
6685         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6686         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6687     }
6688     else
6689     if ( 0 <= n && n <= (I32)rx->nparens &&
6690         (s1 = rx->offs[n].start) != -1 &&
6691         (t1 = rx->offs[n].end) != -1)
6692     {
6693         /* $&, ${^MATCH},  $1 ... */
6694         i = t1 - s1;
6695         s = rx->subbeg + s1 - rx->suboffset;
6696     } else {
6697         goto ret_undef;
6698     }
6699
6700     assert(s >= rx->subbeg);
6701     assert(rx->sublen >= (s - rx->subbeg) + i );
6702     if (i >= 0) {
6703 #if NO_TAINT_SUPPORT
6704         sv_setpvn(sv, s, i);
6705 #else
6706         const int oldtainted = TAINT_get;
6707         TAINT_NOT;
6708         sv_setpvn(sv, s, i);
6709         TAINT_set(oldtainted);
6710 #endif
6711         if ( (rx->extflags & RXf_CANY_SEEN)
6712             ? (RXp_MATCH_UTF8(rx)
6713                         && (!i || is_utf8_string((U8*)s, i)))
6714             : (RXp_MATCH_UTF8(rx)) )
6715         {
6716             SvUTF8_on(sv);
6717         }
6718         else
6719             SvUTF8_off(sv);
6720         if (TAINTING_get) {
6721             if (RXp_MATCH_TAINTED(rx)) {
6722                 if (SvTYPE(sv) >= SVt_PVMG) {
6723                     MAGIC* const mg = SvMAGIC(sv);
6724                     MAGIC* mgt;
6725                     TAINT;
6726                     SvMAGIC_set(sv, mg->mg_moremagic);
6727                     SvTAINT(sv);
6728                     if ((mgt = SvMAGIC(sv))) {
6729                         mg->mg_moremagic = mgt;
6730                         SvMAGIC_set(sv, mg);
6731                     }
6732                 } else {
6733                     TAINT;
6734                     SvTAINT(sv);
6735                 }
6736             } else
6737                 SvTAINTED_off(sv);
6738         }
6739     } else {
6740       ret_undef:
6741         sv_setsv(sv,&PL_sv_undef);
6742         return;
6743     }
6744 }
6745
6746 void
6747 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6748                                                          SV const * const value)
6749 {
6750     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6751
6752     PERL_UNUSED_ARG(rx);
6753     PERL_UNUSED_ARG(paren);
6754     PERL_UNUSED_ARG(value);
6755
6756     if (!PL_localizing)
6757         Perl_croak_no_modify();
6758 }
6759
6760 I32
6761 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6762                               const I32 paren)
6763 {
6764     struct regexp *const rx = ReANY(r);
6765     I32 i;
6766     I32 s1, t1;
6767
6768     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6769
6770     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6771     switch (paren) {
6772       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6773          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6774             goto warn_undef;
6775         /*FALLTHROUGH*/
6776
6777       case RX_BUFF_IDX_PREMATCH:       /* $` */
6778         if (rx->offs[0].start != -1) {
6779                         i = rx->offs[0].start;
6780                         if (i > 0) {
6781                                 s1 = 0;
6782                                 t1 = i;
6783                                 goto getlen;
6784                         }
6785             }
6786         return 0;
6787
6788       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6789          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6790             goto warn_undef;
6791       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6792             if (rx->offs[0].end != -1) {
6793                         i = rx->sublen - rx->offs[0].end;
6794                         if (i > 0) {
6795                                 s1 = rx->offs[0].end;
6796                                 t1 = rx->sublen;
6797                                 goto getlen;
6798                         }
6799             }
6800         return 0;
6801
6802       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6803          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6804             goto warn_undef;
6805         /*FALLTHROUGH*/
6806
6807       /* $& / ${^MATCH}, $1, $2, ... */
6808       default:
6809             if (paren <= (I32)rx->nparens &&
6810             (s1 = rx->offs[paren].start) != -1 &&
6811             (t1 = rx->offs[paren].end) != -1)
6812             {
6813             i = t1 - s1;
6814             goto getlen;
6815         } else {
6816           warn_undef:
6817             if (ckWARN(WARN_UNINITIALIZED))
6818                 report_uninit((const SV *)sv);
6819             return 0;
6820         }
6821     }
6822   getlen:
6823     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6824         const char * const s = rx->subbeg - rx->suboffset + s1;
6825         const U8 *ep;
6826         STRLEN el;
6827
6828         i = t1 - s1;
6829         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6830                         i = el;
6831     }
6832     return i;
6833 }
6834
6835 SV*
6836 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6837 {
6838     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6839         PERL_UNUSED_ARG(rx);
6840         if (0)
6841             return NULL;
6842         else
6843             return newSVpvs("Regexp");
6844 }
6845
6846 /* Scans the name of a named buffer from the pattern.
6847  * If flags is REG_RSN_RETURN_NULL returns null.
6848  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6849  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6850  * to the parsed name as looked up in the RExC_paren_names hash.
6851  * If there is an error throws a vFAIL().. type exception.
6852  */
6853
6854 #define REG_RSN_RETURN_NULL    0
6855 #define REG_RSN_RETURN_NAME    1
6856 #define REG_RSN_RETURN_DATA    2
6857
6858 STATIC SV*
6859 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6860 {
6861     char *name_start = RExC_parse;
6862
6863     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6864
6865     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6866          /* skip IDFIRST by using do...while */
6867         if (UTF)
6868             do {
6869                 RExC_parse += UTF8SKIP(RExC_parse);
6870             } while (isWORDCHAR_utf8((U8*)RExC_parse));
6871         else
6872             do {
6873                 RExC_parse++;
6874             } while (isWORDCHAR(*RExC_parse));
6875     } else {
6876         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6877         vFAIL("Group name must start with a non-digit word character");
6878     }
6879     if ( flags ) {
6880         SV* sv_name
6881             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6882                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6883         if ( flags == REG_RSN_RETURN_NAME)
6884             return sv_name;
6885         else if (flags==REG_RSN_RETURN_DATA) {
6886             HE *he_str = NULL;
6887             SV *sv_dat = NULL;
6888             if ( ! sv_name )      /* should not happen*/
6889                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6890             if (RExC_paren_names)
6891                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6892             if ( he_str )
6893                 sv_dat = HeVAL(he_str);
6894             if ( ! sv_dat )
6895                 vFAIL("Reference to nonexistent named group");
6896             return sv_dat;
6897         }
6898         else {
6899             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6900                        (unsigned long) flags);
6901         }
6902         assert(0); /* NOT REACHED */
6903     }
6904     return NULL;
6905 }
6906
6907 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6908     int rem=(int)(RExC_end - RExC_parse);                       \
6909     int cut;                                                    \
6910     int num;                                                    \
6911     int iscut=0;                                                \
6912     if (rem>10) {                                               \
6913         rem=10;                                                 \
6914         iscut=1;                                                \
6915     }                                                           \
6916     cut=10-rem;                                                 \
6917     if (RExC_lastparse!=RExC_parse)                             \
6918         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6919             rem, RExC_parse,                                    \
6920             cut + 4,                                            \
6921             iscut ? "..." : "<"                                 \
6922         );                                                      \
6923     else                                                        \
6924         PerlIO_printf(Perl_debug_log,"%16s","");                \
6925                                                                 \
6926     if (SIZE_ONLY)                                              \
6927        num = RExC_size + 1;                                     \
6928     else                                                        \
6929        num=REG_NODE_NUM(RExC_emit);                             \
6930     if (RExC_lastnum!=num)                                      \
6931        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6932     else                                                        \
6933        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6934     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6935         (int)((depth*2)), "",                                   \
6936         (funcname)                                              \
6937     );                                                          \
6938     RExC_lastnum=num;                                           \
6939     RExC_lastparse=RExC_parse;                                  \
6940 })
6941
6942
6943
6944 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6945     DEBUG_PARSE_MSG((funcname));                            \
6946     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6947 })
6948 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6949     DEBUG_PARSE_MSG((funcname));                            \
6950     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6951 })
6952
6953 /* This section of code defines the inversion list object and its methods.  The
6954  * interfaces are highly subject to change, so as much as possible is static to
6955  * this file.  An inversion list is here implemented as a malloc'd C UV array
6956  * with some added info that is placed as UVs at the beginning in a header
6957  * portion.  An inversion list for Unicode is an array of code points, sorted
6958  * by ordinal number.  The zeroth element is the first code point in the list.
6959  * The 1th element is the first element beyond that not in the list.  In other
6960  * words, the first range is
6961  *  invlist[0]..(invlist[1]-1)
6962  * The other ranges follow.  Thus every element whose index is divisible by two
6963  * marks the beginning of a range that is in the list, and every element not
6964  * divisible by two marks the beginning of a range not in the list.  A single
6965  * element inversion list that contains the single code point N generally
6966  * consists of two elements
6967  *  invlist[0] == N
6968  *  invlist[1] == N+1
6969  * (The exception is when N is the highest representable value on the
6970  * machine, in which case the list containing just it would be a single
6971  * element, itself.  By extension, if the last range in the list extends to
6972  * infinity, then the first element of that range will be in the inversion list
6973  * at a position that is divisible by two, and is the final element in the
6974  * list.)
6975  * Taking the complement (inverting) an inversion list is quite simple, if the
6976  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6977  * This implementation reserves an element at the beginning of each inversion
6978  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
6979  * actual beginning of the list is either that element if 0, or the next one if
6980  * 1.
6981  *
6982  * More about inversion lists can be found in "Unicode Demystified"
6983  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6984  * More will be coming when functionality is added later.
6985  *
6986  * The inversion list data structure is currently implemented as an SV pointing
6987  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6988  * array of UV whose memory management is automatically handled by the existing
6989  * facilities for SV's.
6990  *
6991  * Some of the methods should always be private to the implementation, and some
6992  * should eventually be made public */
6993
6994 /* The header definitions are in F<inline_invlist.c> */
6995 #define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
6996 #define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
6997
6998 #define INVLIST_INITIAL_LEN 10
6999
7000 PERL_STATIC_INLINE UV*
7001 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7002 {
7003     /* Returns a pointer to the first element in the inversion list's array.
7004      * This is called upon initialization of an inversion list.  Where the
7005      * array begins depends on whether the list has the code point U+0000
7006      * in it or not.  The other parameter tells it whether the code that
7007      * follows this call is about to put a 0 in the inversion list or not.
7008      * The first element is either the element with 0, if 0, or the next one,
7009      * if 1 */
7010
7011     UV* zero = get_invlist_zero_addr(invlist);
7012
7013     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7014
7015     /* Must be empty */
7016     assert(! *_get_invlist_len_addr(invlist));
7017
7018     /* 1^1 = 0; 1^0 = 1 */
7019     *zero = 1 ^ will_have_0;
7020     return zero + *zero;
7021 }
7022
7023 PERL_STATIC_INLINE UV*
7024 S_invlist_array(pTHX_ SV* const invlist)
7025 {
7026     /* Returns the pointer to the inversion list's array.  Every time the
7027      * length changes, this needs to be called in case malloc or realloc moved
7028      * it */
7029
7030     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7031
7032     /* Must not be empty.  If these fail, you probably didn't check for <len>
7033      * being non-zero before trying to get the array */
7034     assert(*_get_invlist_len_addr(invlist));
7035     assert(*get_invlist_zero_addr(invlist) == 0
7036            || *get_invlist_zero_addr(invlist) == 1);
7037
7038     /* The array begins either at the element reserved for zero if the
7039      * list contains 0 (that element will be set to 0), or otherwise the next
7040      * element (in which case the reserved element will be set to 1). */
7041     return (UV *) (get_invlist_zero_addr(invlist)
7042                    + *get_invlist_zero_addr(invlist));
7043 }
7044
7045 PERL_STATIC_INLINE void
7046 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7047 {
7048     /* Sets the current number of elements stored in the inversion list */
7049
7050     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7051
7052     *_get_invlist_len_addr(invlist) = len;
7053
7054     assert(len <= SvLEN(invlist));
7055
7056     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7057     /* If the list contains U+0000, that element is part of the header,
7058      * and should not be counted as part of the array.  It will contain
7059      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7060      * subtract:
7061      *  SvCUR_set(invlist,
7062      *            TO_INTERNAL_SIZE(len
7063      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7064      * But, this is only valid if len is not 0.  The consequences of not doing
7065      * this is that the memory allocation code may think that 1 more UV is
7066      * being used than actually is, and so might do an unnecessary grow.  That
7067      * seems worth not bothering to make this the precise amount.
7068      *
7069      * Note that when inverting, SvCUR shouldn't change */
7070 }
7071
7072 PERL_STATIC_INLINE IV*
7073 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7074 {
7075     /* Return the address of the UV that is reserved to hold the cached index
7076      * */
7077
7078     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7079
7080     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7081 }
7082
7083 PERL_STATIC_INLINE IV
7084 S_invlist_previous_index(pTHX_ SV* const invlist)
7085 {
7086     /* Returns cached index of previous search */
7087
7088     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7089
7090     return *get_invlist_previous_index_addr(invlist);
7091 }
7092
7093 PERL_STATIC_INLINE void
7094 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7095 {
7096     /* Caches <index> for later retrieval */
7097
7098     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7099
7100     assert(index == 0 || index < (int) _invlist_len(invlist));
7101
7102     *get_invlist_previous_index_addr(invlist) = index;
7103 }
7104
7105 PERL_STATIC_INLINE UV
7106 S_invlist_max(pTHX_ SV* const invlist)
7107 {
7108     /* Returns the maximum number of elements storable in the inversion list's
7109      * array, without having to realloc() */
7110
7111     PERL_ARGS_ASSERT_INVLIST_MAX;
7112
7113     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7114            ? _invlist_len(invlist)
7115            : FROM_INTERNAL_SIZE(SvLEN(invlist));
7116 }
7117
7118 PERL_STATIC_INLINE UV*
7119 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7120 {
7121     /* Return the address of the UV that is reserved to hold 0 if the inversion
7122      * list contains 0.  This has to be the last element of the heading, as the
7123      * list proper starts with either it if 0, or the next element if not.
7124      * (But we force it to contain either 0 or 1) */
7125
7126     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7127
7128     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7129 }
7130
7131 #ifndef PERL_IN_XSUB_RE
7132 SV*
7133 Perl__new_invlist(pTHX_ IV initial_size)
7134 {
7135
7136     /* Return a pointer to a newly constructed inversion list, with enough
7137      * space to store 'initial_size' elements.  If that number is negative, a
7138      * system default is used instead */
7139
7140     SV* new_list;
7141
7142     if (initial_size < 0) {
7143         initial_size = INVLIST_INITIAL_LEN;
7144     }
7145
7146     /* Allocate the initial space */
7147     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7148     invlist_set_len(new_list, 0);
7149
7150     /* Force iterinit() to be used to get iteration to work */
7151     *get_invlist_iter_addr(new_list) = UV_MAX;
7152
7153     /* This should force a segfault if a method doesn't initialize this
7154      * properly */
7155     *get_invlist_zero_addr(new_list) = UV_MAX;
7156
7157     *get_invlist_previous_index_addr(new_list) = 0;
7158     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7159 #if HEADER_LENGTH != 5
7160 #   error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7161 #endif
7162
7163     return new_list;
7164 }
7165 #endif
7166
7167 STATIC SV*
7168 S__new_invlist_C_array(pTHX_ UV* list)
7169 {
7170     /* Return a pointer to a newly constructed inversion list, initialized to
7171      * point to <list>, which has to be in the exact correct inversion list
7172      * form, including internal fields.  Thus this is a dangerous routine that
7173      * should not be used in the wrong hands */
7174
7175     SV* invlist = newSV_type(SVt_PV);
7176
7177     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7178
7179     SvPV_set(invlist, (char *) list);
7180     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7181                                shouldn't touch it */
7182     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7183
7184     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7185         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7186     }
7187
7188     /* Initialize the iteration pointer.
7189      * XXX This could be done at compile time in charclass_invlists.h, but I
7190      * (khw) am not confident that the suffixes for specifying the C constant
7191      * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7192      * to use 64 bits; might need a Configure probe */
7193     invlist_iterfinish(invlist);
7194
7195     return invlist;
7196 }
7197
7198 STATIC void
7199 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7200 {
7201     /* Grow the maximum size of an inversion list */
7202
7203     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7204
7205     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7206 }
7207
7208 PERL_STATIC_INLINE void
7209 S_invlist_trim(pTHX_ SV* const invlist)
7210 {
7211     PERL_ARGS_ASSERT_INVLIST_TRIM;
7212
7213     /* Change the length of the inversion list to how many entries it currently
7214      * has */
7215
7216     SvPV_shrink_to_cur((SV *) invlist);
7217 }
7218
7219 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7220
7221 STATIC void
7222 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7223 {
7224    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7225     * the end of the inversion list.  The range must be above any existing
7226     * ones. */
7227
7228     UV* array;
7229     UV max = invlist_max(invlist);
7230     UV len = _invlist_len(invlist);
7231
7232     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7233
7234     if (len == 0) { /* Empty lists must be initialized */
7235         array = _invlist_array_init(invlist, start == 0);
7236     }
7237     else {
7238         /* Here, the existing list is non-empty. The current max entry in the
7239          * list is generally the first value not in the set, except when the
7240          * set extends to the end of permissible values, in which case it is
7241          * the first entry in that final set, and so this call is an attempt to
7242          * append out-of-order */
7243
7244         UV final_element = len - 1;
7245         array = invlist_array(invlist);
7246         if (array[final_element] > start
7247             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7248         {
7249             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7250                        array[final_element], start,
7251                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7252         }
7253
7254         /* Here, it is a legal append.  If the new range begins with the first
7255          * value not in the set, it is extending the set, so the new first
7256          * value not in the set is one greater than the newly extended range.
7257          * */
7258         if (array[final_element] == start) {
7259             if (end != UV_MAX) {
7260                 array[final_element] = end + 1;
7261             }
7262             else {
7263                 /* But if the end is the maximum representable on the machine,
7264                  * just let the range that this would extend to have no end */
7265                 invlist_set_len(invlist, len - 1);
7266             }
7267             return;
7268         }
7269     }
7270
7271     /* Here the new range doesn't extend any existing set.  Add it */
7272
7273     len += 2;   /* Includes an element each for the start and end of range */
7274
7275     /* If overflows the existing space, extend, which may cause the array to be
7276      * moved */
7277     if (max < len) {
7278         invlist_extend(invlist, len);
7279         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7280                                            failure in invlist_array() */
7281         array = invlist_array(invlist);
7282     }
7283     else {
7284         invlist_set_len(invlist, len);
7285     }
7286
7287     /* The next item on the list starts the range, the one after that is
7288      * one past the new range.  */
7289     array[len - 2] = start;
7290     if (end != UV_MAX) {
7291         array[len - 1] = end + 1;
7292     }
7293     else {
7294         /* But if the end is the maximum representable on the machine, just let
7295          * the range have no end */
7296         invlist_set_len(invlist, len - 1);
7297     }
7298 }
7299
7300 #ifndef PERL_IN_XSUB_RE
7301
7302 IV
7303 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7304 {
7305     /* Searches the inversion list for the entry that contains the input code
7306      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7307      * return value is the index into the list's array of the range that
7308      * contains <cp> */
7309
7310     IV low = 0;
7311     IV mid;
7312     IV high = _invlist_len(invlist);
7313     const IV highest_element = high - 1;
7314     const UV* array;
7315
7316     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7317
7318     /* If list is empty, return failure. */
7319     if (high == 0) {
7320         return -1;
7321     }
7322
7323     /* (We can't get the array unless we know the list is non-empty) */
7324     array = invlist_array(invlist);
7325
7326     mid = invlist_previous_index(invlist);
7327     assert(mid >=0 && mid <= highest_element);
7328
7329     /* <mid> contains the cache of the result of the previous call to this
7330      * function (0 the first time).  See if this call is for the same result,
7331      * or if it is for mid-1.  This is under the theory that calls to this
7332      * function will often be for related code points that are near each other.
7333      * And benchmarks show that caching gives better results.  We also test
7334      * here if the code point is within the bounds of the list.  These tests
7335      * replace others that would have had to be made anyway to make sure that
7336      * the array bounds were not exceeded, and these give us extra information
7337      * at the same time */
7338     if (cp >= array[mid]) {
7339         if (cp >= array[highest_element]) {
7340             return highest_element;
7341         }
7342
7343         /* Here, array[mid] <= cp < array[highest_element].  This means that
7344          * the final element is not the answer, so can exclude it; it also
7345          * means that <mid> is not the final element, so can refer to 'mid + 1'
7346          * safely */
7347         if (cp < array[mid + 1]) {
7348             return mid;
7349         }
7350         high--;
7351         low = mid + 1;
7352     }
7353     else { /* cp < aray[mid] */
7354         if (cp < array[0]) { /* Fail if outside the array */
7355             return -1;
7356         }
7357         high = mid;
7358         if (cp >= array[mid - 1]) {
7359             goto found_entry;
7360         }
7361     }
7362
7363     /* Binary search.  What we are looking for is <i> such that
7364      *  array[i] <= cp < array[i+1]
7365      * The loop below converges on the i+1.  Note that there may not be an
7366      * (i+1)th element in the array, and things work nonetheless */
7367     while (low < high) {
7368         mid = (low + high) / 2;
7369         assert(mid <= highest_element);
7370         if (array[mid] <= cp) { /* cp >= array[mid] */
7371             low = mid + 1;
7372
7373             /* We could do this extra test to exit the loop early.
7374             if (cp < array[low]) {
7375                 return mid;
7376             }
7377             */
7378         }
7379         else { /* cp < array[mid] */
7380             high = mid;
7381         }
7382     }
7383
7384   found_entry:
7385     high--;
7386     invlist_set_previous_index(invlist, high);
7387     return high;
7388 }
7389
7390 void
7391 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7392 {
7393     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7394      * but is used when the swash has an inversion list.  This makes this much
7395      * faster, as it uses a binary search instead of a linear one.  This is
7396      * intimately tied to that function, and perhaps should be in utf8.c,
7397      * except it is intimately tied to inversion lists as well.  It assumes
7398      * that <swatch> is all 0's on input */
7399
7400     UV current = start;
7401     const IV len = _invlist_len(invlist);
7402     IV i;
7403     const UV * array;
7404
7405     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7406
7407     if (len == 0) { /* Empty inversion list */
7408         return;
7409     }
7410
7411     array = invlist_array(invlist);
7412
7413     /* Find which element it is */
7414     i = _invlist_search(invlist, start);
7415
7416     /* We populate from <start> to <end> */
7417     while (current < end) {
7418         UV upper;
7419
7420         /* The inversion list gives the results for every possible code point
7421          * after the first one in the list.  Only those ranges whose index is
7422          * even are ones that the inversion list matches.  For the odd ones,
7423          * and if the initial code point is not in the list, we have to skip
7424          * forward to the next element */
7425         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7426             i++;
7427             if (i >= len) { /* Finished if beyond the end of the array */
7428                 return;
7429             }
7430             current = array[i];
7431             if (current >= end) {   /* Finished if beyond the end of what we
7432                                        are populating */
7433                 if (LIKELY(end < UV_MAX)) {
7434                     return;
7435                 }
7436
7437                 /* We get here when the upper bound is the maximum
7438                  * representable on the machine, and we are looking for just
7439                  * that code point.  Have to special case it */
7440                 i = len;
7441                 goto join_end_of_list;
7442             }
7443         }
7444         assert(current >= start);
7445
7446         /* The current range ends one below the next one, except don't go past
7447          * <end> */
7448         i++;
7449         upper = (i < len && array[i] < end) ? array[i] : end;
7450
7451         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7452          * for each code point in it */
7453         for (; current < upper; current++) {
7454             const STRLEN offset = (STRLEN)(current - start);
7455             swatch[offset >> 3] |= 1 << (offset & 7);
7456         }
7457
7458     join_end_of_list:
7459
7460         /* Quit if at the end of the list */
7461         if (i >= len) {
7462
7463             /* But first, have to deal with the highest possible code point on
7464              * the platform.  The previous code assumes that <end> is one
7465              * beyond where we want to populate, but that is impossible at the
7466              * platform's infinity, so have to handle it specially */
7467             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7468             {
7469                 const STRLEN offset = (STRLEN)(end - start);
7470                 swatch[offset >> 3] |= 1 << (offset & 7);
7471             }
7472             return;
7473         }
7474
7475         /* Advance to the next range, which will be for code points not in the
7476          * inversion list */
7477         current = array[i];
7478     }
7479
7480     return;
7481 }
7482
7483 void
7484 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7485 {
7486     /* Take the union of two inversion lists and point <output> to it.  *output
7487      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7488      * the reference count to that list will be decremented.  The first list,
7489      * <a>, may be NULL, in which case a copy of the second list is returned.
7490      * If <complement_b> is TRUE, the union is taken of the complement
7491      * (inversion) of <b> instead of b itself.
7492      *
7493      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7494      * Richard Gillam, published by Addison-Wesley, and explained at some
7495      * length there.  The preface says to incorporate its examples into your
7496      * code at your own risk.
7497      *
7498      * The algorithm is like a merge sort.
7499      *
7500      * XXX A potential performance improvement is to keep track as we go along
7501      * if only one of the inputs contributes to the result, meaning the other
7502      * is a subset of that one.  In that case, we can skip the final copy and
7503      * return the larger of the input lists, but then outside code might need
7504      * to keep track of whether to free the input list or not */
7505
7506     UV* array_a;    /* a's array */
7507     UV* array_b;
7508     UV len_a;       /* length of a's array */
7509     UV len_b;
7510
7511     SV* u;                      /* the resulting union */
7512     UV* array_u;
7513     UV len_u;
7514
7515     UV i_a = 0;             /* current index into a's array */
7516     UV i_b = 0;
7517     UV i_u = 0;
7518
7519     /* running count, as explained in the algorithm source book; items are
7520      * stopped accumulating and are output when the count changes to/from 0.
7521      * The count is incremented when we start a range that's in the set, and
7522      * decremented when we start a range that's not in the set.  So its range
7523      * is 0 to 2.  Only when the count is zero is something not in the set.
7524      */
7525     UV count = 0;
7526
7527     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7528     assert(a != b);
7529
7530     /* If either one is empty, the union is the other one */
7531     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7532         if (*output == a) {
7533             if (a != NULL) {
7534                 SvREFCNT_dec_NN(a);
7535             }
7536         }
7537         if (*output != b) {
7538             *output = invlist_clone(b);
7539             if (complement_b) {
7540                 _invlist_invert(*output);
7541             }
7542         } /* else *output already = b; */
7543         return;
7544     }
7545     else if ((len_b = _invlist_len(b)) == 0) {
7546         if (*output == b) {
7547             SvREFCNT_dec_NN(b);
7548         }
7549
7550         /* The complement of an empty list is a list that has everything in it,
7551          * so the union with <a> includes everything too */
7552         if (complement_b) {
7553             if (a == *output) {
7554                 SvREFCNT_dec_NN(a);
7555             }
7556             *output = _new_invlist(1);
7557             _append_range_to_invlist(*output, 0, UV_MAX);
7558         }
7559         else if (*output != a) {
7560             *output = invlist_clone(a);
7561         }
7562         /* else *output already = a; */
7563         return;
7564     }
7565
7566     /* Here both lists exist and are non-empty */
7567     array_a = invlist_array(a);
7568     array_b = invlist_array(b);
7569
7570     /* If are to take the union of 'a' with the complement of b, set it
7571      * up so are looking at b's complement. */
7572     if (complement_b) {
7573
7574         /* To complement, we invert: if the first element is 0, remove it.  To
7575          * do this, we just pretend the array starts one later, and clear the
7576          * flag as we don't have to do anything else later */
7577         if (array_b[0] == 0) {
7578             array_b++;
7579             len_b--;
7580             complement_b = FALSE;
7581         }
7582         else {
7583
7584             /* But if the first element is not zero, we unshift a 0 before the
7585              * array.  The data structure reserves a space for that 0 (which
7586              * should be a '1' right now), so physical shifting is unneeded,
7587              * but temporarily change that element to 0.  Before exiting the
7588              * routine, we must restore the element to '1' */
7589             array_b--;
7590             len_b++;
7591             array_b[0] = 0;
7592         }
7593     }
7594
7595     /* Size the union for the worst case: that the sets are completely
7596      * disjoint */
7597     u = _new_invlist(len_a + len_b);
7598
7599     /* Will contain U+0000 if either component does */
7600     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7601                                       || (len_b > 0 && array_b[0] == 0));
7602
7603     /* Go through each list item by item, stopping when exhausted one of
7604      * them */
7605     while (i_a < len_a && i_b < len_b) {
7606         UV cp;      /* The element to potentially add to the union's array */
7607         bool cp_in_set;   /* is it in the the input list's set or not */
7608
7609         /* We need to take one or the other of the two inputs for the union.
7610          * Since we are merging two sorted lists, we take the smaller of the
7611          * next items.  In case of a tie, we take the one that is in its set
7612          * first.  If we took one not in the set first, it would decrement the
7613          * count, possibly to 0 which would cause it to be output as ending the
7614          * range, and the next time through we would take the same number, and
7615          * output it again as beginning the next range.  By doing it the
7616          * opposite way, there is no possibility that the count will be
7617          * momentarily decremented to 0, and thus the two adjoining ranges will
7618          * be seamlessly merged.  (In a tie and both are in the set or both not
7619          * in the set, it doesn't matter which we take first.) */
7620         if (array_a[i_a] < array_b[i_b]
7621             || (array_a[i_a] == array_b[i_b]
7622                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7623         {
7624             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7625             cp= array_a[i_a++];
7626         }
7627         else {
7628             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7629             cp = array_b[i_b++];
7630         }
7631
7632         /* Here, have chosen which of the two inputs to look at.  Only output
7633          * if the running count changes to/from 0, which marks the
7634          * beginning/end of a range in that's in the set */
7635         if (cp_in_set) {
7636             if (count == 0) {
7637                 array_u[i_u++] = cp;
7638             }
7639             count++;
7640         }
7641         else {
7642             count--;
7643             if (count == 0) {
7644                 array_u[i_u++] = cp;
7645             }
7646         }
7647     }
7648
7649     /* Here, we are finished going through at least one of the lists, which
7650      * means there is something remaining in at most one.  We check if the list
7651      * that hasn't been exhausted is positioned such that we are in the middle
7652      * of a range in its set or not.  (i_a and i_b point to the element beyond
7653      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7654      * is potentially more to output.
7655      * There are four cases:
7656      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7657      *     in the union is entirely from the non-exhausted set.
7658      *  2) Both were in their sets, count is 2.  Nothing further should
7659      *     be output, as everything that remains will be in the exhausted
7660      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7661      *     that
7662      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7663      *     Nothing further should be output because the union includes
7664      *     everything from the exhausted set.  Not decrementing ensures that.
7665      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7666      *     decrementing to 0 insures that we look at the remainder of the
7667      *     non-exhausted set */
7668     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7669         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7670     {
7671         count--;
7672     }
7673
7674     /* The final length is what we've output so far, plus what else is about to
7675      * be output.  (If 'count' is non-zero, then the input list we exhausted
7676      * has everything remaining up to the machine's limit in its set, and hence
7677      * in the union, so there will be no further output. */
7678     len_u = i_u;
7679     if (count == 0) {
7680         /* At most one of the subexpressions will be non-zero */
7681         len_u += (len_a - i_a) + (len_b - i_b);
7682     }
7683
7684     /* Set result to final length, which can change the pointer to array_u, so
7685      * re-find it */
7686     if (len_u != _invlist_len(u)) {
7687         invlist_set_len(u, len_u);
7688         invlist_trim(u);
7689         array_u = invlist_array(u);
7690     }
7691
7692     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7693      * the other) ended with everything above it not in its set.  That means
7694      * that the remaining part of the union is precisely the same as the
7695      * non-exhausted list, so can just copy it unchanged.  (If both list were
7696      * exhausted at the same time, then the operations below will be both 0.)
7697      */
7698     if (count == 0) {
7699         IV copy_count; /* At most one will have a non-zero copy count */
7700         if ((copy_count = len_a - i_a) > 0) {
7701             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7702         }
7703         else if ((copy_count = len_b - i_b) > 0) {
7704             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7705         }
7706     }
7707
7708     /* If we've changed b, restore it */
7709     if (complement_b) {
7710         array_b[0] = 1;
7711     }
7712
7713     /*  We may be removing a reference to one of the inputs */
7714     if (a == *output || b == *output) {
7715         assert(! invlist_is_iterating(*output));
7716         SvREFCNT_dec_NN(*output);
7717     }
7718
7719     *output = u;
7720     return;
7721 }
7722
7723 void
7724 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7725 {
7726     /* Take the intersection of two inversion lists and point <i> to it.  *i
7727      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7728      * the reference count to that list will be decremented.
7729      * If <complement_b> is TRUE, the result will be the intersection of <a>
7730      * and the complement (or inversion) of <b> instead of <b> directly.
7731      *
7732      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7733      * Richard Gillam, published by Addison-Wesley, and explained at some
7734      * length there.  The preface says to incorporate its examples into your
7735      * code at your own risk.  In fact, it had bugs
7736      *
7737      * The algorithm is like a merge sort, and is essentially the same as the
7738      * union above
7739      */
7740
7741     UV* array_a;                /* a's array */
7742     UV* array_b;
7743     UV len_a;   /* length of a's array */
7744     UV len_b;
7745
7746     SV* r;                   /* the resulting intersection */
7747     UV* array_r;
7748     UV len_r;
7749
7750     UV i_a = 0;             /* current index into a's array */
7751     UV i_b = 0;
7752     UV i_r = 0;
7753
7754     /* running count, as explained in the algorithm source book; items are
7755      * stopped accumulating and are output when the count changes to/from 2.
7756      * The count is incremented when we start a range that's in the set, and
7757      * decremented when we start a range that's not in the set.  So its range
7758      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7759      */
7760     UV count = 0;
7761
7762     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7763     assert(a != b);
7764
7765     /* Special case if either one is empty */
7766     len_a = _invlist_len(a);
7767     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7768
7769         if (len_a != 0 && complement_b) {
7770
7771             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7772              * be empty.  Here, also we are using 'b's complement, which hence
7773              * must be every possible code point.  Thus the intersection is
7774              * simply 'a'. */
7775             if (*i != a) {
7776                 *i = invlist_clone(a);
7777
7778                 if (*i == b) {
7779                     SvREFCNT_dec_NN(b);
7780                 }
7781             }
7782             /* else *i is already 'a' */
7783             return;
7784         }
7785
7786         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7787          * intersection must be empty */
7788         if (*i == a) {
7789             SvREFCNT_dec_NN(a);
7790         }
7791         else if (*i == b) {
7792             SvREFCNT_dec_NN(b);
7793         }
7794         *i = _new_invlist(0);
7795         return;
7796     }
7797
7798     /* Here both lists exist and are non-empty */
7799     array_a = invlist_array(a);
7800     array_b = invlist_array(b);
7801
7802     /* If are to take the intersection of 'a' with the complement of b, set it
7803      * up so are looking at b's complement. */
7804     if (complement_b) {
7805
7806         /* To complement, we invert: if the first element is 0, remove it.  To
7807          * do this, we just pretend the array starts one later, and clear the
7808          * flag as we don't have to do anything else later */
7809         if (array_b[0] == 0) {
7810             array_b++;
7811             len_b--;
7812             complement_b = FALSE;
7813         }
7814         else {
7815
7816             /* But if the first element is not zero, we unshift a 0 before the
7817              * array.  The data structure reserves a space for that 0 (which
7818              * should be a '1' right now), so physical shifting is unneeded,
7819              * but temporarily change that element to 0.  Before exiting the
7820              * routine, we must restore the element to '1' */
7821             array_b--;
7822             len_b++;
7823             array_b[0] = 0;
7824         }
7825     }
7826
7827     /* Size the intersection for the worst case: that the intersection ends up
7828      * fragmenting everything to be completely disjoint */
7829     r= _new_invlist(len_a + len_b);
7830
7831     /* Will contain U+0000 iff both components do */
7832     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7833                                      && len_b > 0 && array_b[0] == 0);
7834
7835     /* Go through each list item by item, stopping when exhausted one of
7836      * them */
7837     while (i_a < len_a && i_b < len_b) {
7838         UV cp;      /* The element to potentially add to the intersection's
7839                        array */
7840         bool cp_in_set; /* Is it in the input list's set or not */
7841
7842         /* We need to take one or the other of the two inputs for the
7843          * intersection.  Since we are merging two sorted lists, we take the
7844          * smaller of the next items.  In case of a tie, we take the one that
7845          * is not in its set first (a difference from the union algorithm).  If
7846          * we took one in the set first, it would increment the count, possibly
7847          * to 2 which would cause it to be output as starting a range in the
7848          * intersection, and the next time through we would take that same
7849          * number, and output it again as ending the set.  By doing it the
7850          * opposite of this, there is no possibility that the count will be
7851          * momentarily incremented to 2.  (In a tie and both are in the set or
7852          * both not in the set, it doesn't matter which we take first.) */
7853         if (array_a[i_a] < array_b[i_b]
7854             || (array_a[i_a] == array_b[i_b]
7855                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7856         {
7857             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7858             cp= array_a[i_a++];
7859         }
7860         else {
7861             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7862             cp= array_b[i_b++];
7863         }
7864
7865         /* Here, have chosen which of the two inputs to look at.  Only output
7866          * if the running count changes to/from 2, which marks the
7867          * beginning/end of a range that's in the intersection */
7868         if (cp_in_set) {
7869             count++;
7870             if (count == 2) {
7871                 array_r[i_r++] = cp;
7872             }
7873         }
7874         else {
7875             if (count == 2) {
7876                 array_r[i_r++] = cp;
7877             }
7878             count--;
7879         }
7880     }
7881
7882     /* Here, we are finished going through at least one of the lists, which
7883      * means there is something remaining in at most one.  We check if the list
7884      * that has been exhausted is positioned such that we are in the middle
7885      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7886      * the ones we care about.)  There are four cases:
7887      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7888      *     nothing left in the intersection.
7889      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7890      *     above 2.  What should be output is exactly that which is in the
7891      *     non-exhausted set, as everything it has is also in the intersection
7892      *     set, and everything it doesn't have can't be in the intersection
7893      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7894      *     gets incremented to 2.  Like the previous case, the intersection is
7895      *     everything that remains in the non-exhausted set.
7896      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7897      *     remains 1.  And the intersection has nothing more. */
7898     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7899         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7900     {
7901         count++;
7902     }
7903
7904     /* The final length is what we've output so far plus what else is in the
7905      * intersection.  At most one of the subexpressions below will be non-zero */
7906     len_r = i_r;
7907     if (count >= 2) {
7908         len_r += (len_a - i_a) + (len_b - i_b);
7909     }
7910
7911     /* Set result to final length, which can change the pointer to array_r, so
7912      * re-find it */
7913     if (len_r != _invlist_len(r)) {
7914         invlist_set_len(r, len_r);
7915         invlist_trim(r);
7916         array_r = invlist_array(r);
7917     }
7918
7919     /* Finish outputting any remaining */
7920     if (count >= 2) { /* At most one will have a non-zero copy count */
7921         IV copy_count;
7922         if ((copy_count = len_a - i_a) > 0) {
7923             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7924         }
7925         else if ((copy_count = len_b - i_b) > 0) {
7926             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7927         }
7928     }
7929
7930     /* If we've changed b, restore it */
7931     if (complement_b) {
7932         array_b[0] = 1;
7933     }
7934
7935     /*  We may be removing a reference to one of the inputs */
7936     if (a == *i || b == *i) {
7937         assert(! invlist_is_iterating(*i));
7938         SvREFCNT_dec_NN(*i);
7939     }
7940
7941     *i = r;
7942     return;
7943 }
7944
7945 SV*
7946 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7947 {
7948     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7949      * set.  A pointer to the inversion list is returned.  This may actually be
7950      * a new list, in which case the passed in one has been destroyed.  The
7951      * passed in inversion list can be NULL, in which case a new one is created
7952      * with just the one range in it */
7953
7954     SV* range_invlist;
7955     UV len;
7956
7957     if (invlist == NULL) {
7958         invlist = _new_invlist(2);
7959         len = 0;
7960     }
7961     else {
7962         len = _invlist_len(invlist);
7963     }
7964
7965     /* If comes after the final entry actually in the list, can just append it
7966      * to the end, */
7967     if (len == 0
7968         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
7969             && start >= invlist_array(invlist)[len - 1]))
7970     {
7971         _append_range_to_invlist(invlist, start, end);
7972         return invlist;
7973     }
7974
7975     /* Here, can't just append things, create and return a new inversion list
7976      * which is the union of this range and the existing inversion list */
7977     range_invlist = _new_invlist(2);
7978     _append_range_to_invlist(range_invlist, start, end);
7979
7980     _invlist_union(invlist, range_invlist, &invlist);
7981
7982     /* The temporary can be freed */
7983     SvREFCNT_dec_NN(range_invlist);
7984
7985     return invlist;
7986 }
7987
7988 #endif
7989
7990 PERL_STATIC_INLINE SV*
7991 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7992     return _add_range_to_invlist(invlist, cp, cp);
7993 }
7994
7995 #ifndef PERL_IN_XSUB_RE
7996 void
7997 Perl__invlist_invert(pTHX_ SV* const invlist)
7998 {
7999     /* Complement the input inversion list.  This adds a 0 if the list didn't
8000      * have a zero; removes it otherwise.  As described above, the data
8001      * structure is set up so that this is very efficient */
8002
8003     UV* len_pos = _get_invlist_len_addr(invlist);
8004
8005     PERL_ARGS_ASSERT__INVLIST_INVERT;
8006
8007     assert(! invlist_is_iterating(invlist));
8008
8009     /* The inverse of matching nothing is matching everything */
8010     if (*len_pos == 0) {
8011         _append_range_to_invlist(invlist, 0, UV_MAX);
8012         return;
8013     }
8014
8015     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
8016      * zero element was a 0, so it is being removed, so the length decrements
8017      * by 1; and vice-versa.  SvCUR is unaffected */
8018     if (*get_invlist_zero_addr(invlist) ^= 1) {
8019         (*len_pos)--;
8020     }
8021     else {
8022         (*len_pos)++;
8023     }
8024 }
8025
8026 void
8027 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8028 {
8029     /* Complement the input inversion list (which must be a Unicode property,
8030      * all of which don't match above the Unicode maximum code point.)  And
8031      * Perl has chosen to not have the inversion match above that either.  This
8032      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8033      */
8034
8035     UV len;
8036     UV* array;
8037
8038     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8039
8040     _invlist_invert(invlist);
8041
8042     len = _invlist_len(invlist);
8043
8044     if (len != 0) { /* If empty do nothing */
8045         array = invlist_array(invlist);
8046         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8047             /* Add 0x110000.  First, grow if necessary */
8048             len++;
8049             if (invlist_max(invlist) < len) {
8050                 invlist_extend(invlist, len);
8051                 array = invlist_array(invlist);
8052             }
8053             invlist_set_len(invlist, len);
8054             array[len - 1] = PERL_UNICODE_MAX + 1;
8055         }
8056         else {  /* Remove the 0x110000 */
8057             invlist_set_len(invlist, len - 1);
8058         }
8059     }
8060
8061     return;
8062 }
8063 #endif
8064
8065 PERL_STATIC_INLINE SV*
8066 S_invlist_clone(pTHX_ SV* const invlist)
8067 {
8068
8069     /* Return a new inversion list that is a copy of the input one, which is
8070      * unchanged */
8071
8072     /* Need to allocate extra space to accommodate Perl's addition of a
8073      * trailing NUL to SvPV's, since it thinks they are always strings */
8074     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8075     STRLEN length = SvCUR(invlist);
8076
8077     PERL_ARGS_ASSERT_INVLIST_CLONE;
8078
8079     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8080     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8081
8082     return new_invlist;
8083 }
8084
8085 PERL_STATIC_INLINE UV*
8086 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8087 {
8088     /* Return the address of the UV that contains the current iteration
8089      * position */
8090
8091     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8092
8093     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8094 }
8095
8096 PERL_STATIC_INLINE UV*
8097 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8098 {
8099     /* Return the address of the UV that contains the version id. */
8100
8101     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8102
8103     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8104 }
8105
8106 PERL_STATIC_INLINE void
8107 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8108 {
8109     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8110
8111     *get_invlist_iter_addr(invlist) = 0;
8112 }
8113
8114 PERL_STATIC_INLINE void
8115 S_invlist_iterfinish(pTHX_ SV* invlist)
8116 {
8117     /* Terminate iterator for invlist.  This is to catch development errors.
8118      * Any iteration that is interrupted before completed should call this
8119      * function.  Functions that add code points anywhere else but to the end
8120      * of an inversion list assert that they are not in the middle of an
8121      * iteration.  If they were, the addition would make the iteration
8122      * problematical: if the iteration hadn't reached the place where things
8123      * were being added, it would be ok */
8124
8125     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8126
8127     *get_invlist_iter_addr(invlist) = UV_MAX;
8128 }
8129
8130 STATIC bool
8131 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8132 {
8133     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8134      * This call sets in <*start> and <*end>, the next range in <invlist>.
8135      * Returns <TRUE> if successful and the next call will return the next
8136      * range; <FALSE> if was already at the end of the list.  If the latter,
8137      * <*start> and <*end> are unchanged, and the next call to this function
8138      * will start over at the beginning of the list */
8139
8140     UV* pos = get_invlist_iter_addr(invlist);
8141     UV len = _invlist_len(invlist);
8142     UV *array;
8143
8144     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8145
8146     if (*pos >= len) {
8147         *pos = UV_MAX;  /* Force iterinit() to be required next time */
8148         return FALSE;
8149     }
8150
8151     array = invlist_array(invlist);
8152
8153     *start = array[(*pos)++];
8154
8155     if (*pos >= len) {
8156         *end = UV_MAX;
8157     }
8158     else {
8159         *end = array[(*pos)++] - 1;
8160     }
8161
8162     return TRUE;
8163 }
8164
8165 PERL_STATIC_INLINE bool
8166 S_invlist_is_iterating(pTHX_ SV* const invlist)
8167 {
8168     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8169
8170     return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8171 }
8172
8173 PERL_STATIC_INLINE UV
8174 S_invlist_highest(pTHX_ SV* const invlist)
8175 {
8176     /* Returns the highest code point that matches an inversion list.  This API
8177      * has an ambiguity, as it returns 0 under either the highest is actually
8178      * 0, or if the list is empty.  If this distinction matters to you, check
8179      * for emptiness before calling this function */
8180
8181     UV len = _invlist_len(invlist);
8182     UV *array;
8183
8184     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8185
8186     if (len == 0) {
8187         return 0;
8188     }
8189
8190     array = invlist_array(invlist);
8191
8192     /* The last element in the array in the inversion list always starts a
8193      * range that goes to infinity.  That range may be for code points that are
8194      * matched in the inversion list, or it may be for ones that aren't
8195      * matched.  In the latter case, the highest code point in the set is one
8196      * less than the beginning of this range; otherwise it is the final element
8197      * of this range: infinity */
8198     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8199            ? UV_MAX
8200            : array[len - 1] - 1;
8201 }
8202
8203 #ifndef PERL_IN_XSUB_RE
8204 SV *
8205 Perl__invlist_contents(pTHX_ SV* const invlist)
8206 {
8207     /* Get the contents of an inversion list into a string SV so that they can
8208      * be printed out.  It uses the format traditionally done for debug tracing
8209      */
8210
8211     UV start, end;
8212     SV* output = newSVpvs("\n");
8213
8214     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8215
8216     assert(! invlist_is_iterating(invlist));
8217
8218     invlist_iterinit(invlist);
8219     while (invlist_iternext(invlist, &start, &end)) {
8220         if (end == UV_MAX) {
8221             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8222         }
8223         else if (end != start) {
8224             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8225                     start,       end);
8226         }
8227         else {
8228             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8229         }
8230     }
8231
8232     return output;
8233 }
8234 #endif
8235
8236 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8237 void
8238 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8239 {
8240     /* Dumps out the ranges in an inversion list.  The string 'header'
8241      * if present is output on a line before the first range */
8242
8243     UV start, end;
8244
8245     PERL_ARGS_ASSERT__INVLIST_DUMP;
8246
8247     if (header && strlen(header)) {
8248         PerlIO_printf(Perl_debug_log, "%s\n", header);
8249     }
8250     if (invlist_is_iterating(invlist)) {
8251         PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8252         return;
8253     }
8254
8255     invlist_iterinit(invlist);
8256     while (invlist_iternext(invlist, &start, &end)) {
8257         if (end == UV_MAX) {
8258             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8259         }
8260         else if (end != start) {
8261             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8262                                                  start,         end);
8263         }
8264         else {
8265             PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8266         }
8267     }
8268 }
8269 #endif
8270
8271 #if 0
8272 bool
8273 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8274 {
8275     /* Return a boolean as to if the two passed in inversion lists are
8276      * identical.  The final argument, if TRUE, says to take the complement of
8277      * the second inversion list before doing the comparison */
8278
8279     UV* array_a = invlist_array(a);
8280     UV* array_b = invlist_array(b);
8281     UV len_a = _invlist_len(a);
8282     UV len_b = _invlist_len(b);
8283
8284     UV i = 0;               /* current index into the arrays */
8285     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8286
8287     PERL_ARGS_ASSERT__INVLISTEQ;
8288
8289     /* If are to compare 'a' with the complement of b, set it
8290      * up so are looking at b's complement. */
8291     if (complement_b) {
8292
8293         /* The complement of nothing is everything, so <a> would have to have
8294          * just one element, starting at zero (ending at infinity) */
8295         if (len_b == 0) {
8296             return (len_a == 1 && array_a[0] == 0);
8297         }
8298         else if (array_b[0] == 0) {
8299
8300             /* Otherwise, to complement, we invert.  Here, the first element is
8301              * 0, just remove it.  To do this, we just pretend the array starts
8302              * one later, and clear the flag as we don't have to do anything
8303              * else later */
8304
8305             array_b++;
8306             len_b--;
8307             complement_b = FALSE;
8308         }
8309         else {
8310
8311             /* But if the first element is not zero, we unshift a 0 before the
8312              * array.  The data structure reserves a space for that 0 (which
8313              * should be a '1' right now), so physical shifting is unneeded,
8314              * but temporarily change that element to 0.  Before exiting the
8315              * routine, we must restore the element to '1' */
8316             array_b--;
8317             len_b++;
8318             array_b[0] = 0;
8319         }
8320     }
8321
8322     /* Make sure that the lengths are the same, as well as the final element
8323      * before looping through the remainder.  (Thus we test the length, final,
8324      * and first elements right off the bat) */
8325     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8326         retval = FALSE;
8327     }
8328     else for (i = 0; i < len_a - 1; i++) {
8329         if (array_a[i] != array_b[i]) {
8330             retval = FALSE;
8331             break;
8332         }
8333     }
8334
8335     if (complement_b) {
8336         array_b[0] = 1;
8337     }
8338     return retval;
8339 }
8340 #endif
8341
8342 #undef HEADER_LENGTH
8343 #undef INVLIST_INITIAL_LENGTH
8344 #undef TO_INTERNAL_SIZE
8345 #undef FROM_INTERNAL_SIZE
8346 #undef INVLIST_LEN_OFFSET
8347 #undef INVLIST_ZERO_OFFSET
8348 #undef INVLIST_ITER_OFFSET
8349 #undef INVLIST_VERSION_ID
8350 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8351
8352 /* End of inversion list object */
8353
8354 STATIC void
8355 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8356 {
8357     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8358      * constructs, and updates RExC_flags with them.  On input, RExC_parse
8359      * should point to the first flag; it is updated on output to point to the
8360      * final ')' or ':'.  There needs to be at least one flag, or this will
8361      * abort */
8362
8363     /* for (?g), (?gc), and (?o) warnings; warning
8364        about (?c) will warn about (?g) -- japhy    */
8365
8366 #define WASTED_O  0x01
8367 #define WASTED_G  0x02
8368 #define WASTED_C  0x04
8369 #define WASTED_GC (0x02|0x04)
8370     I32 wastedflags = 0x00;
8371     U32 posflags = 0, negflags = 0;
8372     U32 *flagsp = &posflags;
8373     char has_charset_modifier = '\0';
8374     regex_charset cs;
8375     bool has_use_defaults = FALSE;
8376     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8377
8378     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8379
8380     /* '^' as an initial flag sets certain defaults */
8381     if (UCHARAT(RExC_parse) == '^') {
8382         RExC_parse++;
8383         has_use_defaults = TRUE;
8384         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8385         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8386                                         ? REGEX_UNICODE_CHARSET
8387                                         : REGEX_DEPENDS_CHARSET);
8388     }
8389
8390     cs = get_regex_charset(RExC_flags);
8391     if (cs == REGEX_DEPENDS_CHARSET
8392         && (RExC_utf8 || RExC_uni_semantics))
8393     {
8394         cs = REGEX_UNICODE_CHARSET;
8395     }
8396
8397     while (*RExC_parse) {
8398         /* && strchr("iogcmsx", *RExC_parse) */
8399         /* (?g), (?gc) and (?o) are useless here
8400            and must be globally applied -- japhy */
8401         switch (*RExC_parse) {
8402
8403             /* Code for the imsx flags */
8404             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8405
8406             case LOCALE_PAT_MOD:
8407                 if (has_charset_modifier) {
8408                     goto excess_modifier;
8409                 }
8410                 else if (flagsp == &negflags) {
8411                     goto neg_modifier;
8412                 }
8413                 cs = REGEX_LOCALE_CHARSET;
8414                 has_charset_modifier = LOCALE_PAT_MOD;
8415                 RExC_contains_locale = 1;
8416                 break;
8417             case UNICODE_PAT_MOD:
8418                 if (has_charset_modifier) {
8419                     goto excess_modifier;
8420                 }
8421                 else if (flagsp == &negflags) {
8422                     goto neg_modifier;
8423                 }
8424                 cs = REGEX_UNICODE_CHARSET;
8425                 has_charset_modifier = UNICODE_PAT_MOD;
8426                 break;
8427             case ASCII_RESTRICT_PAT_MOD:
8428                 if (flagsp == &negflags) {
8429                     goto neg_modifier;
8430                 }
8431                 if (has_charset_modifier) {
8432                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8433                         goto excess_modifier;
8434                     }
8435                     /* Doubled modifier implies more restricted */
8436                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8437                 }
8438                 else {
8439                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
8440                 }
8441                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8442                 break;
8443             case DEPENDS_PAT_MOD:
8444                 if (has_use_defaults) {
8445                     goto fail_modifiers;
8446                 }
8447                 else if (flagsp == &negflags) {
8448                     goto neg_modifier;
8449                 }
8450                 else if (has_charset_modifier) {
8451                     goto excess_modifier;
8452                 }
8453
8454                 /* The dual charset means unicode semantics if the
8455                  * pattern (or target, not known until runtime) are
8456                  * utf8, or something in the pattern indicates unicode
8457                  * semantics */
8458                 cs = (RExC_utf8 || RExC_uni_semantics)
8459                      ? REGEX_UNICODE_CHARSET
8460                      : REGEX_DEPENDS_CHARSET;
8461                 has_charset_modifier = DEPENDS_PAT_MOD;
8462                 break;
8463             excess_modifier:
8464                 RExC_parse++;
8465                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8466                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8467                 }
8468                 else if (has_charset_modifier == *(RExC_parse - 1)) {
8469                     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8470                 }
8471                 else {
8472                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8473                 }
8474                 /*NOTREACHED*/
8475             neg_modifier:
8476                 RExC_parse++;
8477                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8478                 /*NOTREACHED*/
8479             case ONCE_PAT_MOD: /* 'o' */
8480             case GLOBAL_PAT_MOD: /* 'g' */
8481                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8482                     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8483                     if (! (wastedflags & wflagbit) ) {
8484                         wastedflags |= wflagbit;
8485                         vWARN5(
8486                             RExC_parse + 1,
8487                             "Useless (%s%c) - %suse /%c modifier",
8488                             flagsp == &negflags ? "?-" : "?",
8489                             *RExC_parse,
8490                             flagsp == &negflags ? "don't " : "",
8491                             *RExC_parse
8492                         );
8493                     }
8494                 }
8495                 break;
8496
8497             case CONTINUE_PAT_MOD: /* 'c' */
8498                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8499                     if (! (wastedflags & WASTED_C) ) {
8500                         wastedflags |= WASTED_GC;
8501                         vWARN3(
8502                             RExC_parse + 1,
8503                             "Useless (%sc) - %suse /gc modifier",
8504                             flagsp == &negflags ? "?-" : "?",
8505                             flagsp == &negflags ? "don't " : ""
8506                         );
8507                     }
8508                 }
8509                 break;
8510             case KEEPCOPY_PAT_MOD: /* 'p' */
8511                 if (flagsp == &negflags) {
8512                     if (SIZE_ONLY)
8513                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8514                 } else {
8515                     *flagsp |= RXf_PMf_KEEPCOPY;
8516                 }
8517                 break;
8518             case '-':
8519                 /* A flag is a default iff it is following a minus, so
8520                  * if there is a minus, it means will be trying to
8521                  * re-specify a default which is an error */
8522                 if (has_use_defaults || flagsp == &negflags) {
8523                     goto fail_modifiers;
8524                 }
8525                 flagsp = &negflags;
8526                 wastedflags = 0;  /* reset so (?g-c) warns twice */
8527                 break;
8528             case ':':
8529             case ')':
8530                 RExC_flags |= posflags;
8531                 RExC_flags &= ~negflags;
8532                 set_regex_charset(&RExC_flags, cs);
8533                 return;
8534                 /*NOTREACHED*/
8535             default:
8536             fail_modifiers:
8537                 RExC_parse++;
8538                 vFAIL3("Sequence (%.*s...) not recognized",
8539                        RExC_parse-seqstart, seqstart);
8540                 /*NOTREACHED*/
8541         }
8542
8543         ++RExC_parse;
8544     }
8545 }
8546
8547 /*
8548  - reg - regular expression, i.e. main body or parenthesized thing
8549  *
8550  * Caller must absorb opening parenthesis.
8551  *
8552  * Combining parenthesis handling with the base level of regular expression
8553  * is a trifle forced, but the need to tie the tails of the branches to what
8554  * follows makes it hard to avoid.
8555  */
8556 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8557 #ifdef DEBUGGING
8558 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8559 #else
8560 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8561 #endif
8562
8563 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8564    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8565    needs to be restarted.
8566    Otherwise would only return NULL if regbranch() returns NULL, which
8567    cannot happen.  */
8568 STATIC regnode *
8569 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8570     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8571 {
8572     dVAR;
8573     regnode *ret;               /* Will be the head of the group. */
8574     regnode *br;
8575     regnode *lastbr;
8576     regnode *ender = NULL;
8577     I32 parno = 0;
8578     I32 flags;
8579     U32 oregflags = RExC_flags;
8580     bool have_branch = 0;
8581     bool is_open = 0;
8582     I32 freeze_paren = 0;
8583     I32 after_freeze = 0;
8584
8585     char * parse_start = RExC_parse; /* MJD */
8586     char * const oregcomp_parse = RExC_parse;
8587
8588     GET_RE_DEBUG_FLAGS_DECL;
8589
8590     PERL_ARGS_ASSERT_REG;
8591     DEBUG_PARSE("reg ");
8592
8593     *flagp = 0;                         /* Tentatively. */
8594
8595
8596     /* Make an OPEN node, if parenthesized. */
8597     if (paren) {
8598         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8599             char *start_verb = RExC_parse;
8600             STRLEN verb_len = 0;
8601             char *start_arg = NULL;
8602             unsigned char op = 0;
8603             int argok = 1;
8604             int internal_argval = 0; /* internal_argval is only useful if !argok */
8605             while ( *RExC_parse && *RExC_parse != ')' ) {
8606                 if ( *RExC_parse == ':' ) {
8607                     start_arg = RExC_parse + 1;
8608                     break;
8609                 }
8610                 RExC_parse++;
8611             }
8612             ++start_verb;
8613             verb_len = RExC_parse - start_verb;
8614             if ( start_arg ) {
8615                 RExC_parse++;
8616                 while ( *RExC_parse && *RExC_parse != ')' )
8617                     RExC_parse++;
8618                 if ( *RExC_parse != ')' )
8619                     vFAIL("Unterminated verb pattern argument");
8620                 if ( RExC_parse == start_arg )
8621                     start_arg = NULL;
8622             } else {
8623                 if ( *RExC_parse != ')' )
8624                     vFAIL("Unterminated verb pattern");
8625             }
8626
8627             switch ( *start_verb ) {
8628             case 'A':  /* (*ACCEPT) */
8629                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8630                     op = ACCEPT;
8631                     internal_argval = RExC_nestroot;
8632                 }
8633                 break;
8634             case 'C':  /* (*COMMIT) */
8635                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8636                     op = COMMIT;
8637                 break;
8638             case 'F':  /* (*FAIL) */
8639                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8640                     op = OPFAIL;
8641                     argok = 0;
8642                 }
8643                 break;
8644             case ':':  /* (*:NAME) */
8645             case 'M':  /* (*MARK:NAME) */
8646                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8647                     op = MARKPOINT;
8648                     argok = -1;
8649                 }
8650                 break;
8651             case 'P':  /* (*PRUNE) */
8652                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8653                     op = PRUNE;
8654                 break;
8655             case 'S':   /* (*SKIP) */
8656                 if ( memEQs(start_verb,verb_len,"SKIP") )
8657                     op = SKIP;
8658                 break;
8659             case 'T':  /* (*THEN) */
8660                 /* [19:06] <TimToady> :: is then */
8661                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8662                     op = CUTGROUP;
8663                     RExC_seen |= REG_SEEN_CUTGROUP;
8664                 }
8665                 break;
8666             }
8667             if ( ! op ) {
8668                 RExC_parse++;
8669                 vFAIL3("Unknown verb pattern '%.*s'",
8670                     verb_len, start_verb);
8671             }
8672             if ( argok ) {
8673                 if ( start_arg && internal_argval ) {
8674                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8675                         verb_len, start_verb);
8676                 } else if ( argok < 0 && !start_arg ) {
8677                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8678                         verb_len, start_verb);
8679                 } else {
8680                     ret = reganode(pRExC_state, op, internal_argval);
8681                     if ( ! internal_argval && ! SIZE_ONLY ) {
8682                         if (start_arg) {
8683                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8684                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8685                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8686                             ret->flags = 0;
8687                         } else {
8688                             ret->flags = 1;
8689                         }
8690                     }
8691                 }
8692                 if (!internal_argval)
8693                     RExC_seen |= REG_SEEN_VERBARG;
8694             } else if ( start_arg ) {
8695                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8696                         verb_len, start_verb);
8697             } else {
8698                 ret = reg_node(pRExC_state, op);
8699             }
8700             nextchar(pRExC_state);
8701             return ret;
8702         } else
8703         if (*RExC_parse == '?') { /* (?...) */
8704             bool is_logical = 0;
8705             const char * const seqstart = RExC_parse;
8706
8707             RExC_parse++;
8708             paren = *RExC_parse++;
8709             ret = NULL;                 /* For look-ahead/behind. */
8710             switch (paren) {
8711
8712             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8713                 paren = *RExC_parse++;
8714                 if ( paren == '<')         /* (?P<...>) named capture */
8715                     goto named_capture;
8716                 else if (paren == '>') {   /* (?P>name) named recursion */
8717                     goto named_recursion;
8718                 }
8719                 else if (paren == '=') {   /* (?P=...)  named backref */
8720                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8721                        you change this make sure you change that */
8722                     char* name_start = RExC_parse;
8723                     U32 num = 0;
8724                     SV *sv_dat = reg_scan_name(pRExC_state,
8725                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8726                     if (RExC_parse == name_start || *RExC_parse != ')')
8727                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8728
8729                     if (!SIZE_ONLY) {
8730                         num = add_data( pRExC_state, 1, "S" );
8731                         RExC_rxi->data->data[num]=(void*)sv_dat;
8732                         SvREFCNT_inc_simple_void(sv_dat);
8733                     }
8734                     RExC_sawback = 1;
8735                     ret = reganode(pRExC_state,
8736                                    ((! FOLD)
8737                                      ? NREF
8738                                      : (ASCII_FOLD_RESTRICTED)
8739                                        ? NREFFA
8740                                        : (AT_LEAST_UNI_SEMANTICS)
8741                                          ? NREFFU
8742                                          : (LOC)
8743                                            ? NREFFL
8744                                            : NREFF),
8745                                     num);
8746                     *flagp |= HASWIDTH;
8747
8748                     Set_Node_Offset(ret, parse_start+1);
8749                     Set_Node_Cur_Length(ret); /* MJD */
8750
8751                     nextchar(pRExC_state);
8752                     return ret;
8753                 }
8754                 RExC_parse++;
8755                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8756                 /*NOTREACHED*/
8757             case '<':           /* (?<...) */
8758                 if (*RExC_parse == '!')
8759                     paren = ',';
8760                 else if (*RExC_parse != '=')
8761               named_capture:
8762                 {               /* (?<...>) */
8763                     char *name_start;
8764                     SV *svname;
8765                     paren= '>';
8766             case '\'':          /* (?'...') */
8767                     name_start= RExC_parse;
8768                     svname = reg_scan_name(pRExC_state,
8769                         SIZE_ONLY ?  /* reverse test from the others */
8770                         REG_RSN_RETURN_NAME :
8771                         REG_RSN_RETURN_NULL);
8772                     if (RExC_parse == name_start) {
8773                         RExC_parse++;
8774                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8775                         /*NOTREACHED*/
8776                     }
8777                     if (*RExC_parse != paren)
8778                         vFAIL2("Sequence (?%c... not terminated",
8779                             paren=='>' ? '<' : paren);
8780                     if (SIZE_ONLY) {
8781                         HE *he_str;
8782                         SV *sv_dat = NULL;
8783                         if (!svname) /* shouldn't happen */
8784                             Perl_croak(aTHX_
8785                                 "panic: reg_scan_name returned NULL");
8786                         if (!RExC_paren_names) {
8787                             RExC_paren_names= newHV();
8788                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8789 #ifdef DEBUGGING
8790                             RExC_paren_name_list= newAV();
8791                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8792 #endif
8793                         }
8794                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8795                         if ( he_str )
8796                             sv_dat = HeVAL(he_str);
8797                         if ( ! sv_dat ) {
8798                             /* croak baby croak */
8799                             Perl_croak(aTHX_
8800                                 "panic: paren_name hash element allocation failed");
8801                         } else if ( SvPOK(sv_dat) ) {
8802                             /* (?|...) can mean we have dupes so scan to check
8803                                its already been stored. Maybe a flag indicating
8804                                we are inside such a construct would be useful,
8805                                but the arrays are likely to be quite small, so
8806                                for now we punt -- dmq */
8807                             IV count = SvIV(sv_dat);
8808                             I32 *pv = (I32*)SvPVX(sv_dat);
8809                             IV i;
8810                             for ( i = 0 ; i < count ; i++ ) {
8811                                 if ( pv[i] == RExC_npar ) {
8812                                     count = 0;
8813                                     break;
8814                                 }
8815                             }
8816                             if ( count ) {
8817                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8818                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8819                                 pv[count] = RExC_npar;
8820                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8821                             }
8822                         } else {
8823                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8824                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8825                             SvIOK_on(sv_dat);
8826                             SvIV_set(sv_dat, 1);
8827                         }
8828 #ifdef DEBUGGING
8829                         /* Yes this does cause a memory leak in debugging Perls */
8830                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8831                             SvREFCNT_dec_NN(svname);
8832 #endif
8833
8834                         /*sv_dump(sv_dat);*/
8835                     }
8836                     nextchar(pRExC_state);
8837                     paren = 1;
8838                     goto capturing_parens;
8839                 }
8840                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8841                 RExC_in_lookbehind++;
8842                 RExC_parse++;
8843             case '=':           /* (?=...) */
8844                 RExC_seen_zerolen++;
8845                 break;
8846             case '!':           /* (?!...) */
8847                 RExC_seen_zerolen++;
8848                 if (*RExC_parse == ')') {
8849                     ret=reg_node(pRExC_state, OPFAIL);
8850                     nextchar(pRExC_state);
8851                     return ret;
8852                 }
8853                 break;
8854             case '|':           /* (?|...) */
8855                 /* branch reset, behave like a (?:...) except that
8856                    buffers in alternations share the same numbers */
8857                 paren = ':';
8858                 after_freeze = freeze_paren = RExC_npar;
8859                 break;
8860             case ':':           /* (?:...) */
8861             case '>':           /* (?>...) */
8862                 break;
8863             case '$':           /* (?$...) */
8864             case '@':           /* (?@...) */
8865                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8866                 break;
8867             case '0' :           /* (?0) */
8868             case 'R' :           /* (?R) */
8869                 if (*RExC_parse != ')')
8870                     FAIL("Sequence (?R) not terminated");
8871                 ret = reg_node(pRExC_state, GOSTART);
8872                 *flagp |= POSTPONED;
8873                 nextchar(pRExC_state);
8874                 return ret;
8875                 /*notreached*/
8876             { /* named and numeric backreferences */
8877                 I32 num;
8878             case '&':            /* (?&NAME) */
8879                 parse_start = RExC_parse - 1;
8880               named_recursion:
8881                 {
8882                     SV *sv_dat = reg_scan_name(pRExC_state,
8883                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8884                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8885                 }
8886                 goto gen_recurse_regop;
8887                 assert(0); /* NOT REACHED */
8888             case '+':
8889                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8890                     RExC_parse++;
8891                     vFAIL("Illegal pattern");
8892                 }
8893                 goto parse_recursion;
8894                 /* NOT REACHED*/
8895             case '-': /* (?-1) */
8896                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8897                     RExC_parse--; /* rewind to let it be handled later */
8898                     goto parse_flags;
8899                 }
8900                 /*FALLTHROUGH */
8901             case '1': case '2': case '3': case '4': /* (?1) */
8902             case '5': case '6': case '7': case '8': case '9':
8903                 RExC_parse--;
8904               parse_recursion:
8905                 num = atoi(RExC_parse);
8906                 parse_start = RExC_parse - 1; /* MJD */
8907                 if (*RExC_parse == '-')
8908                     RExC_parse++;
8909                 while (isDIGIT(*RExC_parse))
8910                         RExC_parse++;
8911                 if (*RExC_parse!=')')
8912                     vFAIL("Expecting close bracket");
8913
8914               gen_recurse_regop:
8915                 if ( paren == '-' ) {
8916                     /*
8917                     Diagram of capture buffer numbering.
8918                     Top line is the normal capture buffer numbers
8919                     Bottom line is the negative indexing as from
8920                     the X (the (?-2))
8921
8922                     +   1 2    3 4 5 X          6 7
8923                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8924                     -   5 4    3 2 1 X          x x
8925
8926                     */
8927                     num = RExC_npar + num;
8928                     if (num < 1)  {
8929                         RExC_parse++;
8930                         vFAIL("Reference to nonexistent group");
8931                     }
8932                 } else if ( paren == '+' ) {
8933                     num = RExC_npar + num - 1;
8934                 }
8935
8936                 ret = reganode(pRExC_state, GOSUB, num);
8937                 if (!SIZE_ONLY) {
8938                     if (num > (I32)RExC_rx->nparens) {
8939                         RExC_parse++;
8940                         vFAIL("Reference to nonexistent group");
8941                     }
8942                     ARG2L_SET( ret, RExC_recurse_count++);
8943                     RExC_emit++;
8944                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8945                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8946                 } else {
8947                     RExC_size++;
8948                 }
8949                 RExC_seen |= REG_SEEN_RECURSE;
8950                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8951                 Set_Node_Offset(ret, parse_start); /* MJD */
8952
8953                 *flagp |= POSTPONED;
8954                 nextchar(pRExC_state);
8955                 return ret;
8956             } /* named and numeric backreferences */
8957             assert(0); /* NOT REACHED */
8958
8959             case '?':           /* (??...) */
8960                 is_logical = 1;
8961                 if (*RExC_parse != '{') {
8962                     RExC_parse++;
8963                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8964                     /*NOTREACHED*/
8965                 }
8966                 *flagp |= POSTPONED;
8967                 paren = *RExC_parse++;
8968                 /* FALL THROUGH */
8969             case '{':           /* (?{...}) */
8970             {
8971                 U32 n = 0;
8972                 struct reg_code_block *cb;
8973
8974                 RExC_seen_zerolen++;
8975
8976                 if (   !pRExC_state->num_code_blocks
8977                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8978                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8979                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8980                             - RExC_start)
8981                 ) {
8982                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8983                         FAIL("panic: Sequence (?{...}): no code block found\n");
8984                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8985                 }
8986                 /* this is a pre-compiled code block (?{...}) */
8987                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8988                 RExC_parse = RExC_start + cb->end;
8989                 if (!SIZE_ONLY) {
8990                     OP *o = cb->block;
8991                     if (cb->src_regex) {
8992                         n = add_data(pRExC_state, 2, "rl");
8993                         RExC_rxi->data->data[n] =
8994                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8995                         RExC_rxi->data->data[n+1] = (void*)o;
8996                     }
8997                     else {
8998                         n = add_data(pRExC_state, 1,
8999                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9000                         RExC_rxi->data->data[n] = (void*)o;
9001                     }
9002                 }
9003                 pRExC_state->code_index++;
9004                 nextchar(pRExC_state);
9005
9006                 if (is_logical) {
9007                     regnode *eval;
9008                     ret = reg_node(pRExC_state, LOGICAL);
9009                     eval = reganode(pRExC_state, EVAL, n);
9010                     if (!SIZE_ONLY) {
9011                         ret->flags = 2;
9012                         /* for later propagation into (??{}) return value */
9013                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9014                     }
9015                     REGTAIL(pRExC_state, ret, eval);
9016                     /* deal with the length of this later - MJD */
9017                     return ret;
9018                 }
9019                 ret = reganode(pRExC_state, EVAL, n);
9020                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9021                 Set_Node_Offset(ret, parse_start);
9022                 return ret;
9023             }
9024             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9025             {
9026                 int is_define= 0;
9027                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9028                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9029                         || RExC_parse[1] == '<'
9030                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9031                         I32 flag;
9032                         regnode *tail;
9033
9034                         ret = reg_node(pRExC_state, LOGICAL);
9035                         if (!SIZE_ONLY)
9036                             ret->flags = 1;
9037
9038                         tail = reg(pRExC_state, 1, &flag, depth+1);
9039                         if (flag & RESTART_UTF8) {
9040                             *flagp = RESTART_UTF8;
9041                             return NULL;
9042                         }
9043                         REGTAIL(pRExC_state, ret, tail);
9044                         goto insert_if;
9045                     }
9046                 }
9047                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9048                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9049                 {
9050                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9051                     char *name_start= RExC_parse++;
9052                     U32 num = 0;
9053                     SV *sv_dat=reg_scan_name(pRExC_state,
9054                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9055                     if (RExC_parse == name_start || *RExC_parse != ch)
9056                         vFAIL2("Sequence (?(%c... not terminated",
9057                             (ch == '>' ? '<' : ch));
9058                     RExC_parse++;
9059                     if (!SIZE_ONLY) {
9060                         num = add_data( pRExC_state, 1, "S" );
9061                         RExC_rxi->data->data[num]=(void*)sv_dat;
9062                         SvREFCNT_inc_simple_void(sv_dat);
9063                     }
9064                     ret = reganode(pRExC_state,NGROUPP,num);
9065                     goto insert_if_check_paren;
9066                 }
9067                 else if (RExC_parse[0] == 'D' &&
9068                          RExC_parse[1] == 'E' &&
9069                          RExC_parse[2] == 'F' &&
9070                          RExC_parse[3] == 'I' &&
9071                          RExC_parse[4] == 'N' &&
9072                          RExC_parse[5] == 'E')
9073                 {
9074                     ret = reganode(pRExC_state,DEFINEP,0);
9075                     RExC_parse +=6 ;
9076                     is_define = 1;
9077                     goto insert_if_check_paren;
9078                 }
9079                 else if (RExC_parse[0] == 'R') {
9080                     RExC_parse++;
9081                     parno = 0;
9082                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9083                         parno = atoi(RExC_parse++);
9084                         while (isDIGIT(*RExC_parse))
9085                             RExC_parse++;
9086                     } else if (RExC_parse[0] == '&') {
9087                         SV *sv_dat;
9088                         RExC_parse++;
9089                         sv_dat = reg_scan_name(pRExC_state,
9090                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9091                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9092                     }
9093                     ret = reganode(pRExC_state,INSUBP,parno);
9094                     goto insert_if_check_paren;
9095                 }
9096                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9097                     /* (?(1)...) */
9098                     char c;
9099                     parno = atoi(RExC_parse++);
9100
9101                     while (isDIGIT(*RExC_parse))
9102                         RExC_parse++;
9103                     ret = reganode(pRExC_state, GROUPP, parno);
9104
9105                  insert_if_check_paren:
9106                     if ((c = *nextchar(pRExC_state)) != ')')
9107                         vFAIL("Switch condition not recognized");
9108                   insert_if:
9109                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9110                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9111                     if (br == NULL) {
9112                         if (flags & RESTART_UTF8) {
9113                             *flagp = RESTART_UTF8;
9114                             return NULL;
9115                         }
9116                         FAIL2("panic: regbranch returned NULL, flags=%#X",
9117                               flags);
9118                     } else
9119                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9120                     c = *nextchar(pRExC_state);
9121                     if (flags&HASWIDTH)
9122                         *flagp |= HASWIDTH;
9123                     if (c == '|') {
9124                         if (is_define)
9125                             vFAIL("(?(DEFINE)....) does not allow branches");
9126                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9127                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9128                             if (flags & RESTART_UTF8) {
9129                                 *flagp = RESTART_UTF8;
9130                                 return NULL;
9131                             }
9132                             FAIL2("panic: regbranch returned NULL, flags=%#X",
9133                                   flags);
9134                         }
9135                         REGTAIL(pRExC_state, ret, lastbr);
9136                         if (flags&HASWIDTH)
9137                             *flagp |= HASWIDTH;
9138                         c = *nextchar(pRExC_state);
9139                     }
9140                     else
9141                         lastbr = NULL;
9142                     if (c != ')')
9143                         vFAIL("Switch (?(condition)... contains too many branches");
9144                     ender = reg_node(pRExC_state, TAIL);
9145                     REGTAIL(pRExC_state, br, ender);
9146                     if (lastbr) {
9147                         REGTAIL(pRExC_state, lastbr, ender);
9148                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9149                     }
9150                     else
9151                         REGTAIL(pRExC_state, ret, ender);
9152                     RExC_size++; /* XXX WHY do we need this?!!
9153                                     For large programs it seems to be required
9154                                     but I can't figure out why. -- dmq*/
9155                     return ret;
9156                 }
9157                 else {
9158                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9159                 }
9160             }
9161             case '[':           /* (?[ ... ]) */
9162                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9163                                          oregcomp_parse);
9164             case 0:
9165                 RExC_parse--; /* for vFAIL to print correctly */
9166                 vFAIL("Sequence (? incomplete");
9167                 break;
9168             default: /* e.g., (?i) */
9169                 --RExC_parse;
9170               parse_flags:
9171                 parse_lparen_question_flags(pRExC_state);
9172                 if (UCHARAT(RExC_parse) != ':') {
9173                     nextchar(pRExC_state);
9174                     *flagp = TRYAGAIN;
9175                     return NULL;
9176                 }
9177                 paren = ':';
9178                 nextchar(pRExC_state);
9179                 ret = NULL;
9180                 goto parse_rest;
9181             } /* end switch */
9182         }
9183         else {                  /* (...) */
9184           capturing_parens:
9185             parno = RExC_npar;
9186             RExC_npar++;
9187
9188             ret = reganode(pRExC_state, OPEN, parno);
9189             if (!SIZE_ONLY ){
9190                 if (!RExC_nestroot)
9191                     RExC_nestroot = parno;
9192                 if (RExC_seen & REG_SEEN_RECURSE
9193                     && !RExC_open_parens[parno-1])
9194                 {
9195                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9196                         "Setting open paren #%"IVdf" to %d\n",
9197                         (IV)parno, REG_NODE_NUM(ret)));
9198                     RExC_open_parens[parno-1]= ret;
9199                 }
9200             }
9201             Set_Node_Length(ret, 1); /* MJD */
9202             Set_Node_Offset(ret, RExC_parse); /* MJD */
9203             is_open = 1;
9204         }
9205     }
9206     else                        /* ! paren */
9207         ret = NULL;
9208
9209    parse_rest:
9210     /* Pick up the branches, linking them together. */
9211     parse_start = RExC_parse;   /* MJD */
9212     br = regbranch(pRExC_state, &flags, 1,depth+1);
9213
9214     /*     branch_len = (paren != 0); */
9215
9216     if (br == NULL) {
9217         if (flags & RESTART_UTF8) {
9218             *flagp = RESTART_UTF8;
9219             return NULL;
9220         }
9221         FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9222     }
9223     if (*RExC_parse == '|') {
9224         if (!SIZE_ONLY && RExC_extralen) {
9225             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9226         }
9227         else {                  /* MJD */
9228             reginsert(pRExC_state, BRANCH, br, depth+1);
9229             Set_Node_Length(br, paren != 0);
9230             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9231         }
9232         have_branch = 1;
9233         if (SIZE_ONLY)
9234             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9235     }
9236     else if (paren == ':') {
9237         *flagp |= flags&SIMPLE;
9238     }
9239     if (is_open) {                              /* Starts with OPEN. */
9240         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9241     }
9242     else if (paren != '?')              /* Not Conditional */
9243         ret = br;
9244     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9245     lastbr = br;
9246     while (*RExC_parse == '|') {
9247         if (!SIZE_ONLY && RExC_extralen) {
9248             ender = reganode(pRExC_state, LONGJMP,0);
9249             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9250         }
9251         if (SIZE_ONLY)
9252             RExC_extralen += 2;         /* Account for LONGJMP. */
9253         nextchar(pRExC_state);
9254         if (freeze_paren) {
9255             if (RExC_npar > after_freeze)
9256                 after_freeze = RExC_npar;
9257             RExC_npar = freeze_paren;
9258         }
9259         br = regbranch(pRExC_state, &flags, 0, depth+1);
9260
9261         if (br == NULL) {
9262             if (flags & RESTART_UTF8) {
9263                 *flagp = RESTART_UTF8;
9264                 return NULL;
9265             }
9266             FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9267         }
9268         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9269         lastbr = br;
9270         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9271     }
9272
9273     if (have_branch || paren != ':') {
9274         /* Make a closing node, and hook it on the end. */
9275         switch (paren) {
9276         case ':':
9277             ender = reg_node(pRExC_state, TAIL);
9278             break;
9279         case 1:
9280             ender = reganode(pRExC_state, CLOSE, parno);
9281             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9282                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9283                         "Setting close paren #%"IVdf" to %d\n",
9284                         (IV)parno, REG_NODE_NUM(ender)));
9285                 RExC_close_parens[parno-1]= ender;
9286                 if (RExC_nestroot == parno)
9287                     RExC_nestroot = 0;
9288             }
9289             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9290             Set_Node_Length(ender,1); /* MJD */
9291             break;
9292         case '<':
9293         case ',':
9294         case '=':
9295         case '!':
9296             *flagp &= ~HASWIDTH;
9297             /* FALL THROUGH */
9298         case '>':
9299             ender = reg_node(pRExC_state, SUCCEED);
9300             break;
9301         case 0:
9302             ender = reg_node(pRExC_state, END);
9303             if (!SIZE_ONLY) {
9304                 assert(!RExC_opend); /* there can only be one! */
9305                 RExC_opend = ender;
9306             }
9307             break;
9308         }
9309         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9310             SV * const mysv_val1=sv_newmortal();
9311             SV * const mysv_val2=sv_newmortal();
9312             DEBUG_PARSE_MSG("lsbr");
9313             regprop(RExC_rx, mysv_val1, lastbr);
9314             regprop(RExC_rx, mysv_val2, ender);
9315             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9316                           SvPV_nolen_const(mysv_val1),
9317                           (IV)REG_NODE_NUM(lastbr),
9318                           SvPV_nolen_const(mysv_val2),
9319                           (IV)REG_NODE_NUM(ender),
9320                           (IV)(ender - lastbr)
9321             );
9322         });
9323         REGTAIL(pRExC_state, lastbr, ender);
9324
9325         if (have_branch && !SIZE_ONLY) {
9326             char is_nothing= 1;
9327             if (depth==1)
9328                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9329
9330             /* Hook the tails of the branches to the closing node. */
9331             for (br = ret; br; br = regnext(br)) {
9332                 const U8 op = PL_regkind[OP(br)];
9333                 if (op == BRANCH) {
9334                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9335                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9336                         is_nothing= 0;
9337                 }
9338                 else if (op == BRANCHJ) {
9339                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9340                     /* for now we always disable this optimisation * /
9341                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9342                     */
9343                         is_nothing= 0;
9344                 }
9345             }
9346             if (is_nothing) {
9347                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9348                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9349                     SV * const mysv_val1=sv_newmortal();
9350                     SV * const mysv_val2=sv_newmortal();
9351                     DEBUG_PARSE_MSG("NADA");
9352                     regprop(RExC_rx, mysv_val1, ret);
9353                     regprop(RExC_rx, mysv_val2, ender);
9354                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9355                                   SvPV_nolen_const(mysv_val1),
9356                                   (IV)REG_NODE_NUM(ret),
9357                                   SvPV_nolen_const(mysv_val2),
9358                                   (IV)REG_NODE_NUM(ender),
9359                                   (IV)(ender - ret)
9360                     );
9361                 });
9362                 OP(br)= NOTHING;
9363                 if (OP(ender) == TAIL) {
9364                     NEXT_OFF(br)= 0;
9365                     RExC_emit= br + 1;
9366                 } else {
9367                     regnode *opt;
9368                     for ( opt= br + 1; opt < ender ; opt++ )
9369                         OP(opt)= OPTIMIZED;
9370                     NEXT_OFF(br)= ender - br;
9371                 }
9372             }
9373         }
9374     }
9375
9376     {
9377         const char *p;
9378         static const char parens[] = "=!<,>";
9379
9380         if (paren && (p = strchr(parens, paren))) {
9381             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9382             int flag = (p - parens) > 1;
9383
9384             if (paren == '>')
9385                 node = SUSPEND, flag = 0;
9386             reginsert(pRExC_state, node,ret, depth+1);
9387             Set_Node_Cur_Length(ret);
9388             Set_Node_Offset(ret, parse_start + 1);
9389             ret->flags = flag;
9390             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9391         }
9392     }
9393
9394     /* Check for proper termination. */
9395     if (paren) {
9396         RExC_flags = oregflags;
9397         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9398             RExC_parse = oregcomp_parse;
9399             vFAIL("Unmatched (");
9400         }
9401     }
9402     else if (!paren && RExC_parse < RExC_end) {
9403         if (*RExC_parse == ')') {
9404             RExC_parse++;
9405             vFAIL("Unmatched )");
9406         }
9407         else
9408             FAIL("Junk on end of regexp");      /* "Can't happen". */
9409         assert(0); /* NOTREACHED */
9410     }
9411
9412     if (RExC_in_lookbehind) {
9413         RExC_in_lookbehind--;
9414     }
9415     if (after_freeze > RExC_npar)
9416         RExC_npar = after_freeze;
9417     return(ret);
9418 }
9419
9420 /*
9421  - regbranch - one alternative of an | operator
9422  *
9423  * Implements the concatenation operator.
9424  *
9425  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9426  * restarted.
9427  */
9428 STATIC regnode *
9429 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9430 {
9431     dVAR;
9432     regnode *ret;
9433     regnode *chain = NULL;
9434     regnode *latest;
9435     I32 flags = 0, c = 0;
9436     GET_RE_DEBUG_FLAGS_DECL;
9437
9438     PERL_ARGS_ASSERT_REGBRANCH;
9439
9440     DEBUG_PARSE("brnc");
9441
9442     if (first)
9443         ret = NULL;
9444     else {
9445         if (!SIZE_ONLY && RExC_extralen)
9446             ret = reganode(pRExC_state, BRANCHJ,0);
9447         else {
9448             ret = reg_node(pRExC_state, BRANCH);
9449             Set_Node_Length(ret, 1);
9450         }
9451     }
9452
9453     if (!first && SIZE_ONLY)
9454         RExC_extralen += 1;                     /* BRANCHJ */
9455
9456     *flagp = WORST;                     /* Tentatively. */
9457
9458     RExC_parse--;
9459     nextchar(pRExC_state);
9460     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9461         flags &= ~TRYAGAIN;
9462         latest = regpiece(pRExC_state, &flags,depth+1);
9463         if (latest == NULL) {
9464             if (flags & TRYAGAIN)
9465                 continue;
9466             if (flags & RESTART_UTF8) {
9467                 *flagp = RESTART_UTF8;
9468                 return NULL;
9469             }
9470             FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
9471         }
9472         else if (ret == NULL)
9473             ret = latest;
9474         *flagp |= flags&(HASWIDTH|POSTPONED);
9475         if (chain == NULL)      /* First piece. */
9476             *flagp |= flags&SPSTART;
9477         else {
9478             RExC_naughty++;
9479             REGTAIL(pRExC_state, chain, latest);
9480         }
9481         chain = latest;
9482         c++;
9483     }
9484     if (chain == NULL) {        /* Loop ran zero times. */
9485         chain = reg_node(pRExC_state, NOTHING);
9486         if (ret == NULL)
9487             ret = chain;
9488     }
9489     if (c == 1) {
9490         *flagp |= flags&SIMPLE;
9491     }
9492
9493     return ret;
9494 }
9495
9496 /*
9497  - regpiece - something followed by possible [*+?]
9498  *
9499  * Note that the branching code sequences used for ? and the general cases
9500  * of * and + are somewhat optimized:  they use the same NOTHING node as
9501  * both the endmarker for their branch list and the body of the last branch.
9502  * It might seem that this node could be dispensed with entirely, but the
9503  * endmarker role is not redundant.
9504  *
9505  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9506  * TRYAGAIN.
9507  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9508  * restarted.
9509  */
9510 STATIC regnode *
9511 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9512 {
9513     dVAR;
9514     regnode *ret;
9515     char op;
9516     char *next;
9517     I32 flags;
9518     const char * const origparse = RExC_parse;
9519     I32 min;
9520     I32 max = REG_INFTY;
9521 #ifdef RE_TRACK_PATTERN_OFFSETS
9522     char *parse_start;
9523 #endif
9524     const char *maxpos = NULL;
9525
9526     /* Save the original in case we change the emitted regop to a FAIL. */
9527     regnode * const orig_emit = RExC_emit;
9528
9529     GET_RE_DEBUG_FLAGS_DECL;
9530
9531     PERL_ARGS_ASSERT_REGPIECE;
9532
9533     DEBUG_PARSE("piec");
9534
9535     ret = regatom(pRExC_state, &flags,depth+1);
9536     if (ret == NULL) {
9537         if (flags & (TRYAGAIN|RESTART_UTF8))
9538             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9539         else
9540             FAIL2("panic: regatom returned NULL, flags=%#X", flags);
9541         return(NULL);
9542     }
9543
9544     op = *RExC_parse;
9545
9546     if (op == '{' && regcurly(RExC_parse, FALSE)) {
9547         maxpos = NULL;
9548 #ifdef RE_TRACK_PATTERN_OFFSETS
9549         parse_start = RExC_parse; /* MJD */
9550 #endif
9551         next = RExC_parse + 1;
9552         while (isDIGIT(*next) || *next == ',') {
9553             if (*next == ',') {
9554                 if (maxpos)
9555                     break;
9556                 else
9557                     maxpos = next;
9558             }
9559             next++;
9560         }
9561         if (*next == '}') {             /* got one */
9562             if (!maxpos)
9563                 maxpos = next;
9564             RExC_parse++;
9565             min = atoi(RExC_parse);
9566             if (*maxpos == ',')
9567                 maxpos++;
9568             else
9569                 maxpos = RExC_parse;
9570             max = atoi(maxpos);
9571             if (!max && *maxpos != '0')
9572                 max = REG_INFTY;                /* meaning "infinity" */
9573             else if (max >= REG_INFTY)
9574                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9575             RExC_parse = next;
9576             nextchar(pRExC_state);
9577             if (max < min) {    /* If can't match, warn and optimize to fail
9578                                    unconditionally */
9579                 if (SIZE_ONLY) {
9580                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9581
9582                     /* We can't back off the size because we have to reserve
9583                      * enough space for all the things we are about to throw
9584                      * away, but we can shrink it by the ammount we are about
9585                      * to re-use here */
9586                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9587                 }
9588                 else {
9589                     RExC_emit = orig_emit;
9590                 }
9591                 ret = reg_node(pRExC_state, OPFAIL);
9592                 return ret;
9593             }
9594             else if (max == 0) {    /* replace {0} with a nothing node */
9595                 if (SIZE_ONLY) {
9596                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9597                 }
9598                 else {
9599                     RExC_emit = orig_emit;
9600                 }
9601                 ret = reg_node(pRExC_state, NOTHING);
9602                 return ret;
9603             }
9604
9605         do_curly:
9606             if ((flags&SIMPLE)) {
9607                 RExC_naughty += 2 + RExC_naughty / 2;
9608                 reginsert(pRExC_state, CURLY, ret, depth+1);
9609                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9610                 Set_Node_Cur_Length(ret);
9611             }
9612             else {
9613                 regnode * const w = reg_node(pRExC_state, WHILEM);
9614
9615                 w->flags = 0;
9616                 REGTAIL(pRExC_state, ret, w);
9617                 if (!SIZE_ONLY && RExC_extralen) {
9618                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9619                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9620                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9621                 }
9622                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9623                                 /* MJD hk */
9624                 Set_Node_Offset(ret, parse_start+1);
9625                 Set_Node_Length(ret,
9626                                 op == '{' ? (RExC_parse - parse_start) : 1);
9627
9628                 if (!SIZE_ONLY && RExC_extralen)
9629                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9630                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9631                 if (SIZE_ONLY)
9632                     RExC_whilem_seen++, RExC_extralen += 3;
9633                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9634             }
9635             ret->flags = 0;
9636
9637             if (min > 0)
9638                 *flagp = WORST;
9639             if (max > 0)
9640                 *flagp |= HASWIDTH;
9641             if (!SIZE_ONLY) {
9642                 ARG1_SET(ret, (U16)min);
9643                 ARG2_SET(ret, (U16)max);
9644             }
9645
9646             goto nest_check;
9647         }
9648     }
9649
9650     if (!ISMULT1(op)) {
9651         *flagp = flags;
9652         return(ret);
9653     }
9654
9655 #if 0                           /* Now runtime fix should be reliable. */
9656
9657     /* if this is reinstated, don't forget to put this back into perldiag:
9658
9659             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9660
9661            (F) The part of the regexp subject to either the * or + quantifier
9662            could match an empty string. The {#} shows in the regular
9663            expression about where the problem was discovered.
9664
9665     */
9666
9667     if (!(flags&HASWIDTH) && op != '?')
9668       vFAIL("Regexp *+ operand could be empty");
9669 #endif
9670
9671 #ifdef RE_TRACK_PATTERN_OFFSETS
9672     parse_start = RExC_parse;
9673 #endif
9674     nextchar(pRExC_state);
9675
9676     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9677
9678     if (op == '*' && (flags&SIMPLE)) {
9679         reginsert(pRExC_state, STAR, ret, depth+1);
9680         ret->flags = 0;
9681         RExC_naughty += 4;
9682     }
9683     else if (op == '*') {
9684         min = 0;
9685         goto do_curly;
9686     }
9687     else if (op == '+' && (flags&SIMPLE)) {
9688         reginsert(pRExC_state, PLUS, ret, depth+1);
9689         ret->flags = 0;
9690         RExC_naughty += 3;
9691     }
9692     else if (op == '+') {
9693         min = 1;
9694         goto do_curly;
9695     }
9696     else if (op == '?') {
9697         min = 0; max = 1;
9698         goto do_curly;
9699     }
9700   nest_check:
9701     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9702         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9703         ckWARN3reg(RExC_parse,
9704                    "%.*s matches null string many times",
9705                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9706                    origparse);
9707         (void)ReREFCNT_inc(RExC_rx_sv);
9708     }
9709
9710     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9711         nextchar(pRExC_state);
9712         reginsert(pRExC_state, MINMOD, ret, depth+1);
9713         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9714     }
9715 #ifndef REG_ALLOW_MINMOD_SUSPEND
9716     else
9717 #endif
9718     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9719         regnode *ender;
9720         nextchar(pRExC_state);
9721         ender = reg_node(pRExC_state, SUCCEED);
9722         REGTAIL(pRExC_state, ret, ender);
9723         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9724         ret->flags = 0;
9725         ender = reg_node(pRExC_state, TAIL);
9726         REGTAIL(pRExC_state, ret, ender);
9727         /*ret= ender;*/
9728     }
9729
9730     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9731         RExC_parse++;
9732         vFAIL("Nested quantifiers");
9733     }
9734
9735     return(ret);
9736 }
9737
9738 STATIC bool
9739 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9740         const bool strict   /* Apply stricter parsing rules? */
9741     )
9742 {
9743
9744  /* This is expected to be called by a parser routine that has recognized '\N'
9745    and needs to handle the rest. RExC_parse is expected to point at the first
9746    char following the N at the time of the call.  On successful return,
9747    RExC_parse has been updated to point to just after the sequence identified
9748    by this routine, and <*flagp> has been updated.
9749
9750    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9751    character class.
9752
9753    \N may begin either a named sequence, or if outside a character class, mean
9754    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9755    attempted to decide which, and in the case of a named sequence, converted it
9756    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9757    where c1... are the characters in the sequence.  For single-quoted regexes,
9758    the tokenizer passes the \N sequence through unchanged; this code will not
9759    attempt to determine this nor expand those, instead raising a syntax error.
9760    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9761    or there is no '}', it signals that this \N occurrence means to match a
9762    non-newline.
9763
9764    Only the \N{U+...} form should occur in a character class, for the same
9765    reason that '.' inside a character class means to just match a period: it
9766    just doesn't make sense.
9767
9768    The function raises an error (via vFAIL), and doesn't return for various
9769    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9770    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9771    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9772    only possible if node_p is non-NULL.
9773
9774
9775    If <valuep> is non-null, it means the caller can accept an input sequence
9776    consisting of a just a single code point; <*valuep> is set to that value
9777    if the input is such.
9778
9779    If <node_p> is non-null it signifies that the caller can accept any other
9780    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9781    is set as follows:
9782     1) \N means not-a-NL: points to a newly created REG_ANY node;
9783     2) \N{}:              points to a new NOTHING node;
9784     3) otherwise:         points to a new EXACT node containing the resolved
9785                           string.
9786    Note that FALSE is returned for single code point sequences if <valuep> is
9787    null.
9788  */
9789
9790     char * endbrace;    /* '}' following the name */
9791     char* p;
9792     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9793                            stream */
9794     bool has_multiple_chars; /* true if the input stream contains a sequence of
9795                                 more than one character */
9796
9797     GET_RE_DEBUG_FLAGS_DECL;
9798
9799     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9800
9801     GET_RE_DEBUG_FLAGS;
9802
9803     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9804
9805     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9806      * modifier.  The other meaning does not */
9807     p = (RExC_flags & RXf_PMf_EXTENDED)
9808         ? regwhite( pRExC_state, RExC_parse )
9809         : RExC_parse;
9810
9811     /* Disambiguate between \N meaning a named character versus \N meaning
9812      * [^\n].  The former is assumed when it can't be the latter. */
9813     if (*p != '{' || regcurly(p, FALSE)) {
9814         RExC_parse = p;
9815         if (! node_p) {
9816             /* no bare \N in a charclass */
9817             if (in_char_class) {
9818                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9819             }
9820             return FALSE;
9821         }
9822         nextchar(pRExC_state);
9823         *node_p = reg_node(pRExC_state, REG_ANY);
9824         *flagp |= HASWIDTH|SIMPLE;
9825         RExC_naughty++;
9826         RExC_parse--;
9827         Set_Node_Length(*node_p, 1); /* MJD */
9828         return TRUE;
9829     }
9830
9831     /* Here, we have decided it should be a named character or sequence */
9832
9833     /* The test above made sure that the next real character is a '{', but
9834      * under the /x modifier, it could be separated by space (or a comment and
9835      * \n) and this is not allowed (for consistency with \x{...} and the
9836      * tokenizer handling of \N{NAME}). */
9837     if (*RExC_parse != '{') {
9838         vFAIL("Missing braces on \\N{}");
9839     }
9840
9841     RExC_parse++;       /* Skip past the '{' */
9842
9843     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9844         || ! (endbrace == RExC_parse            /* nothing between the {} */
9845               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9846                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9847     {
9848         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9849         vFAIL("\\N{NAME} must be resolved by the lexer");
9850     }
9851
9852     if (endbrace == RExC_parse) {   /* empty: \N{} */
9853         bool ret = TRUE;
9854         if (node_p) {
9855             *node_p = reg_node(pRExC_state,NOTHING);
9856         }
9857         else if (in_char_class) {
9858             if (SIZE_ONLY && in_char_class) {
9859                 if (strict) {
9860                     RExC_parse++;   /* Position after the "}" */
9861                     vFAIL("Zero length \\N{}");
9862                 }
9863                 else {
9864                     ckWARNreg(RExC_parse,
9865                               "Ignoring zero length \\N{} in character class");
9866                 }
9867             }
9868             ret = FALSE;
9869         }
9870         else {
9871             return FALSE;
9872         }
9873         nextchar(pRExC_state);
9874         return ret;
9875     }
9876
9877     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9878     RExC_parse += 2;    /* Skip past the 'U+' */
9879
9880     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9881
9882     /* Code points are separated by dots.  If none, there is only one code
9883      * point, and is terminated by the brace */
9884     has_multiple_chars = (endchar < endbrace);
9885
9886     if (valuep && (! has_multiple_chars || in_char_class)) {
9887         /* We only pay attention to the first char of
9888         multichar strings being returned in char classes. I kinda wonder
9889         if this makes sense as it does change the behaviour
9890         from earlier versions, OTOH that behaviour was broken
9891         as well. XXX Solution is to recharacterize as
9892         [rest-of-class]|multi1|multi2... */
9893
9894         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9895         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9896             | PERL_SCAN_DISALLOW_PREFIX
9897             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9898
9899         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9900
9901         /* The tokenizer should have guaranteed validity, but it's possible to
9902          * bypass it by using single quoting, so check */
9903         if (length_of_hex == 0
9904             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9905         {
9906             RExC_parse += length_of_hex;        /* Includes all the valid */
9907             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9908                             ? UTF8SKIP(RExC_parse)
9909                             : 1;
9910             /* Guard against malformed utf8 */
9911             if (RExC_parse >= endchar) {
9912                 RExC_parse = endchar;
9913             }
9914             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9915         }
9916
9917         if (in_char_class && has_multiple_chars) {
9918             if (strict) {
9919                 RExC_parse = endbrace;
9920                 vFAIL("\\N{} in character class restricted to one character");
9921             }
9922             else {
9923                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9924             }
9925         }
9926
9927         RExC_parse = endbrace + 1;
9928     }
9929     else if (! node_p || ! has_multiple_chars) {
9930
9931         /* Here, the input is legal, but not according to the caller's
9932          * options.  We fail without advancing the parse, so that the
9933          * caller can try again */
9934         RExC_parse = p;
9935         return FALSE;
9936     }
9937     else {
9938
9939         /* What is done here is to convert this to a sub-pattern of the form
9940          * (?:\x{char1}\x{char2}...)
9941          * and then call reg recursively.  That way, it retains its atomicness,
9942          * while not having to worry about special handling that some code
9943          * points may have.  toke.c has converted the original Unicode values
9944          * to native, so that we can just pass on the hex values unchanged.  We
9945          * do have to set a flag to keep recoding from happening in the
9946          * recursion */
9947
9948         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9949         STRLEN len;
9950         char *orig_end = RExC_end;
9951         I32 flags;
9952
9953         while (RExC_parse < endbrace) {
9954
9955             /* Convert to notation the rest of the code understands */
9956             sv_catpv(substitute_parse, "\\x{");
9957             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9958             sv_catpv(substitute_parse, "}");
9959
9960             /* Point to the beginning of the next character in the sequence. */
9961             RExC_parse = endchar + 1;
9962             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9963         }
9964         sv_catpv(substitute_parse, ")");
9965
9966         RExC_parse = SvPV(substitute_parse, len);
9967
9968         /* Don't allow empty number */
9969         if (len < 8) {
9970             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9971         }
9972         RExC_end = RExC_parse + len;
9973
9974         /* The values are Unicode, and therefore not subject to recoding */
9975         RExC_override_recoding = 1;
9976
9977         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
9978             if (flags & RESTART_UTF8) {
9979                 *flagp = RESTART_UTF8;
9980                 return FALSE;
9981             }
9982             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
9983                   flags);
9984         }
9985         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9986
9987         RExC_parse = endbrace;
9988         RExC_end = orig_end;
9989         RExC_override_recoding = 0;
9990
9991         nextchar(pRExC_state);
9992     }
9993
9994     return TRUE;
9995 }
9996
9997
9998 /*
9999  * reg_recode
10000  *
10001  * It returns the code point in utf8 for the value in *encp.
10002  *    value: a code value in the source encoding
10003  *    encp:  a pointer to an Encode object
10004  *
10005  * If the result from Encode is not a single character,
10006  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10007  */
10008 STATIC UV
10009 S_reg_recode(pTHX_ const char value, SV **encp)
10010 {
10011     STRLEN numlen = 1;
10012     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10013     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10014     const STRLEN newlen = SvCUR(sv);
10015     UV uv = UNICODE_REPLACEMENT;
10016
10017     PERL_ARGS_ASSERT_REG_RECODE;
10018
10019     if (newlen)
10020         uv = SvUTF8(sv)
10021              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10022              : *(U8*)s;
10023
10024     if (!newlen || numlen != newlen) {
10025         uv = UNICODE_REPLACEMENT;
10026         *encp = NULL;
10027     }
10028     return uv;
10029 }
10030
10031 PERL_STATIC_INLINE U8
10032 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10033 {
10034     U8 op;
10035
10036     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10037
10038     if (! FOLD) {
10039         return EXACT;
10040     }
10041
10042     op = get_regex_charset(RExC_flags);
10043     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10044         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10045                  been, so there is no hole */
10046     }
10047
10048     return op + EXACTF;
10049 }
10050
10051 PERL_STATIC_INLINE void
10052 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10053 {
10054     /* This knows the details about sizing an EXACTish node, setting flags for
10055      * it (by setting <*flagp>, and potentially populating it with a single
10056      * character.
10057      *
10058      * If <len> (the length in bytes) is non-zero, this function assumes that
10059      * the node has already been populated, and just does the sizing.  In this
10060      * case <code_point> should be the final code point that has already been
10061      * placed into the node.  This value will be ignored except that under some
10062      * circumstances <*flagp> is set based on it.
10063      *
10064      * If <len> is zero, the function assumes that the node is to contain only
10065      * the single character given by <code_point> and calculates what <len>
10066      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10067      * additionally will populate the node's STRING with <code_point>, if <len>
10068      * is 0.  In both cases <*flagp> is appropriately set
10069      *
10070      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
10071      * folded (the latter only when the rules indicate it can match 'ss') */
10072
10073     bool len_passed_in = cBOOL(len != 0);
10074     U8 character[UTF8_MAXBYTES_CASE+1];
10075
10076     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10077
10078     if (! len_passed_in) {
10079         if (UTF) {
10080             if (FOLD) {
10081                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10082             }
10083             else {
10084                 uvchr_to_utf8( character, code_point);
10085                 len = UTF8SKIP(character);
10086             }
10087         }
10088         else if (! FOLD
10089                  || code_point != LATIN_SMALL_LETTER_SHARP_S
10090                  || ASCII_FOLD_RESTRICTED
10091                  || ! AT_LEAST_UNI_SEMANTICS)
10092         {
10093             *character = (U8) code_point;
10094             len = 1;
10095         }
10096         else {
10097             *character = 's';
10098             *(character + 1) = 's';
10099             len = 2;
10100         }
10101     }
10102
10103     if (SIZE_ONLY) {
10104         RExC_size += STR_SZ(len);
10105     }
10106     else {
10107         RExC_emit += STR_SZ(len);
10108         STR_LEN(node) = len;
10109         if (! len_passed_in) {
10110             Copy((char *) character, STRING(node), len, char);
10111         }
10112     }
10113
10114     *flagp |= HASWIDTH;
10115
10116     /* A single character node is SIMPLE, except for the special-cased SHARP S
10117      * under /di. */
10118     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10119         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10120             || ! FOLD || ! DEPENDS_SEMANTICS))
10121     {
10122         *flagp |= SIMPLE;
10123     }
10124 }
10125
10126 /*
10127  - regatom - the lowest level
10128
10129    Try to identify anything special at the start of the pattern. If there
10130    is, then handle it as required. This may involve generating a single regop,
10131    such as for an assertion; or it may involve recursing, such as to
10132    handle a () structure.
10133
10134    If the string doesn't start with something special then we gobble up
10135    as much literal text as we can.
10136
10137    Once we have been able to handle whatever type of thing started the
10138    sequence, we return.
10139
10140    Note: we have to be careful with escapes, as they can be both literal
10141    and special, and in the case of \10 and friends, context determines which.
10142
10143    A summary of the code structure is:
10144
10145    switch (first_byte) {
10146         cases for each special:
10147             handle this special;
10148             break;
10149         case '\\':
10150             switch (2nd byte) {
10151                 cases for each unambiguous special:
10152                     handle this special;
10153                     break;
10154                 cases for each ambigous special/literal:
10155                     disambiguate;
10156                     if (special)  handle here
10157                     else goto defchar;
10158                 default: // unambiguously literal:
10159                     goto defchar;
10160             }
10161         default:  // is a literal char
10162             // FALL THROUGH
10163         defchar:
10164             create EXACTish node for literal;
10165             while (more input and node isn't full) {
10166                 switch (input_byte) {
10167                    cases for each special;
10168                        make sure parse pointer is set so that the next call to
10169                            regatom will see this special first
10170                        goto loopdone; // EXACTish node terminated by prev. char
10171                    default:
10172                        append char to EXACTISH node;
10173                 }
10174                 get next input byte;
10175             }
10176         loopdone:
10177    }
10178    return the generated node;
10179
10180    Specifically there are two separate switches for handling
10181    escape sequences, with the one for handling literal escapes requiring
10182    a dummy entry for all of the special escapes that are actually handled
10183    by the other.
10184
10185    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10186    TRYAGAIN.
10187    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10188    restarted.
10189    Otherwise does not return NULL.
10190 */
10191
10192 STATIC regnode *
10193 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10194 {
10195     dVAR;
10196     regnode *ret = NULL;
10197     I32 flags = 0;
10198     char *parse_start = RExC_parse;
10199     U8 op;
10200     int invert = 0;
10201
10202     GET_RE_DEBUG_FLAGS_DECL;
10203
10204     *flagp = WORST;             /* Tentatively. */
10205
10206     DEBUG_PARSE("atom");
10207
10208     PERL_ARGS_ASSERT_REGATOM;
10209
10210 tryagain:
10211     switch ((U8)*RExC_parse) {
10212     case '^':
10213         RExC_seen_zerolen++;
10214         nextchar(pRExC_state);
10215         if (RExC_flags & RXf_PMf_MULTILINE)
10216             ret = reg_node(pRExC_state, MBOL);
10217         else if (RExC_flags & RXf_PMf_SINGLELINE)
10218             ret = reg_node(pRExC_state, SBOL);
10219         else
10220             ret = reg_node(pRExC_state, BOL);
10221         Set_Node_Length(ret, 1); /* MJD */
10222         break;
10223     case '$':
10224         nextchar(pRExC_state);
10225         if (*RExC_parse)
10226             RExC_seen_zerolen++;
10227         if (RExC_flags & RXf_PMf_MULTILINE)
10228             ret = reg_node(pRExC_state, MEOL);
10229         else if (RExC_flags & RXf_PMf_SINGLELINE)
10230             ret = reg_node(pRExC_state, SEOL);
10231         else
10232             ret = reg_node(pRExC_state, EOL);
10233         Set_Node_Length(ret, 1); /* MJD */
10234         break;
10235     case '.':
10236         nextchar(pRExC_state);
10237         if (RExC_flags & RXf_PMf_SINGLELINE)
10238             ret = reg_node(pRExC_state, SANY);
10239         else
10240             ret = reg_node(pRExC_state, REG_ANY);
10241         *flagp |= HASWIDTH|SIMPLE;
10242         RExC_naughty++;
10243         Set_Node_Length(ret, 1); /* MJD */
10244         break;
10245     case '[':
10246     {
10247         char * const oregcomp_parse = ++RExC_parse;
10248         ret = regclass(pRExC_state, flagp,depth+1,
10249                        FALSE, /* means parse the whole char class */
10250                        TRUE, /* allow multi-char folds */
10251                        FALSE, /* don't silence non-portable warnings. */
10252                        NULL);
10253         if (*RExC_parse != ']') {
10254             RExC_parse = oregcomp_parse;
10255             vFAIL("Unmatched [");
10256         }
10257         if (ret == NULL) {
10258             if (*flagp & RESTART_UTF8)
10259                 return NULL;
10260             FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10261                   *flagp);
10262         }
10263         nextchar(pRExC_state);
10264         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10265         break;
10266     }
10267     case '(':
10268         nextchar(pRExC_state);
10269         ret = reg(pRExC_state, 1, &flags,depth+1);
10270         if (ret == NULL) {
10271                 if (flags & TRYAGAIN) {
10272                     if (RExC_parse == RExC_end) {
10273                          /* Make parent create an empty node if needed. */
10274                         *flagp |= TRYAGAIN;
10275                         return(NULL);
10276                     }
10277                     goto tryagain;
10278                 }
10279                 if (flags & RESTART_UTF8) {
10280                     *flagp = RESTART_UTF8;
10281                     return NULL;
10282                 }
10283                 FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
10284         }
10285         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10286         break;
10287     case '|':
10288     case ')':
10289         if (flags & TRYAGAIN) {
10290             *flagp |= TRYAGAIN;
10291             return NULL;
10292         }
10293         vFAIL("Internal urp");
10294                                 /* Supposed to be caught earlier. */
10295         break;
10296     case '{':
10297         if (!regcurly(RExC_parse, FALSE)) {
10298             RExC_parse++;
10299             goto defchar;
10300         }
10301         /* FALL THROUGH */
10302     case '?':
10303     case '+':
10304     case '*':
10305         RExC_parse++;
10306         vFAIL("Quantifier follows nothing");
10307         break;
10308     case '\\':
10309         /* Special Escapes
10310
10311            This switch handles escape sequences that resolve to some kind
10312            of special regop and not to literal text. Escape sequnces that
10313            resolve to literal text are handled below in the switch marked
10314            "Literal Escapes".
10315
10316            Every entry in this switch *must* have a corresponding entry
10317            in the literal escape switch. However, the opposite is not
10318            required, as the default for this switch is to jump to the
10319            literal text handling code.
10320         */
10321         switch ((U8)*++RExC_parse) {
10322             U8 arg;
10323         /* Special Escapes */
10324         case 'A':
10325             RExC_seen_zerolen++;
10326             ret = reg_node(pRExC_state, SBOL);
10327             *flagp |= SIMPLE;
10328             goto finish_meta_pat;
10329         case 'G':
10330             ret = reg_node(pRExC_state, GPOS);
10331             RExC_seen |= REG_SEEN_GPOS;
10332             *flagp |= SIMPLE;
10333             goto finish_meta_pat;
10334         case 'K':
10335             RExC_seen_zerolen++;
10336             ret = reg_node(pRExC_state, KEEPS);
10337             *flagp |= SIMPLE;
10338             /* XXX:dmq : disabling in-place substitution seems to
10339              * be necessary here to avoid cases of memory corruption, as
10340              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10341              */
10342             RExC_seen |= REG_SEEN_LOOKBEHIND;
10343             goto finish_meta_pat;
10344         case 'Z':
10345             ret = reg_node(pRExC_state, SEOL);
10346             *flagp |= SIMPLE;
10347             RExC_seen_zerolen++;                /* Do not optimize RE away */
10348             goto finish_meta_pat;
10349         case 'z':
10350             ret = reg_node(pRExC_state, EOS);
10351             *flagp |= SIMPLE;
10352             RExC_seen_zerolen++;                /* Do not optimize RE away */
10353             goto finish_meta_pat;
10354         case 'C':
10355             ret = reg_node(pRExC_state, CANY);
10356             RExC_seen |= REG_SEEN_CANY;
10357             *flagp |= HASWIDTH|SIMPLE;
10358             goto finish_meta_pat;
10359         case 'X':
10360             ret = reg_node(pRExC_state, CLUMP);
10361             *flagp |= HASWIDTH;
10362             goto finish_meta_pat;
10363
10364         case 'W':
10365             invert = 1;
10366             /* FALLTHROUGH */
10367         case 'w':
10368             arg = ANYOF_WORDCHAR;
10369             goto join_posix;
10370
10371         case 'b':
10372             RExC_seen_zerolen++;
10373             RExC_seen |= REG_SEEN_LOOKBEHIND;
10374             op = BOUND + get_regex_charset(RExC_flags);
10375             if (op > BOUNDA) {  /* /aa is same as /a */
10376                 op = BOUNDA;
10377             }
10378             ret = reg_node(pRExC_state, op);
10379             FLAGS(ret) = get_regex_charset(RExC_flags);
10380             *flagp |= SIMPLE;
10381             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10382                 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10383             }
10384             goto finish_meta_pat;
10385         case 'B':
10386             RExC_seen_zerolen++;
10387             RExC_seen |= REG_SEEN_LOOKBEHIND;
10388             op = NBOUND + get_regex_charset(RExC_flags);
10389             if (op > NBOUNDA) { /* /aa is same as /a */
10390                 op = NBOUNDA;
10391             }
10392             ret = reg_node(pRExC_state, op);
10393             FLAGS(ret) = get_regex_charset(RExC_flags);
10394             *flagp |= SIMPLE;
10395             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10396                 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10397             }
10398             goto finish_meta_pat;
10399
10400         case 'D':
10401             invert = 1;
10402             /* FALLTHROUGH */
10403         case 'd':
10404             arg = ANYOF_DIGIT;
10405             goto join_posix;
10406
10407         case 'R':
10408             ret = reg_node(pRExC_state, LNBREAK);
10409             *flagp |= HASWIDTH|SIMPLE;
10410             goto finish_meta_pat;
10411
10412         case 'H':
10413             invert = 1;
10414             /* FALLTHROUGH */
10415         case 'h':
10416             arg = ANYOF_BLANK;
10417             op = POSIXU;
10418             goto join_posix_op_known;
10419
10420         case 'V':
10421             invert = 1;
10422             /* FALLTHROUGH */
10423         case 'v':
10424             arg = ANYOF_VERTWS;
10425             op = POSIXU;
10426             goto join_posix_op_known;
10427
10428         case 'S':
10429             invert = 1;
10430             /* FALLTHROUGH */
10431         case 's':
10432             arg = ANYOF_SPACE;
10433
10434         join_posix:
10435
10436             op = POSIXD + get_regex_charset(RExC_flags);
10437             if (op > POSIXA) {  /* /aa is same as /a */
10438                 op = POSIXA;
10439             }
10440
10441         join_posix_op_known:
10442
10443             if (invert) {
10444                 op += NPOSIXD - POSIXD;
10445             }
10446
10447             ret = reg_node(pRExC_state, op);
10448             if (! SIZE_ONLY) {
10449                 FLAGS(ret) = namedclass_to_classnum(arg);
10450             }
10451
10452             *flagp |= HASWIDTH|SIMPLE;
10453             /* FALL THROUGH */
10454
10455          finish_meta_pat:
10456             nextchar(pRExC_state);
10457             Set_Node_Length(ret, 2); /* MJD */
10458             break;
10459         case 'p':
10460         case 'P':
10461             {
10462 #ifdef DEBUGGING
10463                 char* parse_start = RExC_parse - 2;
10464 #endif
10465
10466                 RExC_parse--;
10467
10468                 ret = regclass(pRExC_state, flagp,depth+1,
10469                                TRUE, /* means just parse this element */
10470                                FALSE, /* don't allow multi-char folds */
10471                                FALSE, /* don't silence non-portable warnings.
10472                                          It would be a bug if these returned
10473                                          non-portables */
10474                                NULL);
10475                 /* regclass() can only return RESTART_UTF8 if multi-char folds
10476                    are allowed.  */
10477                 if (!ret)
10478                     FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10479                           *flagp);
10480
10481                 RExC_parse--;
10482
10483                 Set_Node_Offset(ret, parse_start + 2);
10484                 Set_Node_Cur_Length(ret);
10485                 nextchar(pRExC_state);
10486             }
10487             break;
10488         case 'N':
10489             /* Handle \N and \N{NAME} with multiple code points here and not
10490              * below because it can be multicharacter. join_exact() will join
10491              * them up later on.  Also this makes sure that things like
10492              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10493              * The options to the grok function call causes it to fail if the
10494              * sequence is just a single code point.  We then go treat it as
10495              * just another character in the current EXACT node, and hence it
10496              * gets uniform treatment with all the other characters.  The
10497              * special treatment for quantifiers is not needed for such single
10498              * character sequences */
10499             ++RExC_parse;
10500             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10501                                 FALSE /* not strict */ )) {
10502                 if (*flagp & RESTART_UTF8)
10503                     return NULL;
10504                 RExC_parse--;
10505                 goto defchar;
10506             }
10507             break;
10508         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10509         parse_named_seq:
10510         {
10511             char ch= RExC_parse[1];
10512             if (ch != '<' && ch != '\'' && ch != '{') {
10513                 RExC_parse++;
10514                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10515             } else {
10516                 /* this pretty much dupes the code for (?P=...) in reg(), if
10517                    you change this make sure you change that */
10518                 char* name_start = (RExC_parse += 2);
10519                 U32 num = 0;
10520                 SV *sv_dat = reg_scan_name(pRExC_state,
10521                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10522                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10523                 if (RExC_parse == name_start || *RExC_parse != ch)
10524                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10525
10526                 if (!SIZE_ONLY) {
10527                     num = add_data( pRExC_state, 1, "S" );
10528                     RExC_rxi->data->data[num]=(void*)sv_dat;
10529                     SvREFCNT_inc_simple_void(sv_dat);
10530                 }
10531
10532                 RExC_sawback = 1;
10533                 ret = reganode(pRExC_state,
10534                                ((! FOLD)
10535                                  ? NREF
10536                                  : (ASCII_FOLD_RESTRICTED)
10537                                    ? NREFFA
10538                                    : (AT_LEAST_UNI_SEMANTICS)
10539                                      ? NREFFU
10540                                      : (LOC)
10541                                        ? NREFFL
10542                                        : NREFF),
10543                                 num);
10544                 *flagp |= HASWIDTH;
10545
10546                 /* override incorrect value set in reganode MJD */
10547                 Set_Node_Offset(ret, parse_start+1);
10548                 Set_Node_Cur_Length(ret); /* MJD */
10549                 nextchar(pRExC_state);
10550
10551             }
10552             break;
10553         }
10554         case 'g':
10555         case '1': case '2': case '3': case '4':
10556         case '5': case '6': case '7': case '8': case '9':
10557             {
10558                 I32 num;
10559                 bool isg = *RExC_parse == 'g';
10560                 bool isrel = 0;
10561                 bool hasbrace = 0;
10562                 if (isg) {
10563                     RExC_parse++;
10564                     if (*RExC_parse == '{') {
10565                         RExC_parse++;
10566                         hasbrace = 1;
10567                     }
10568                     if (*RExC_parse == '-') {
10569                         RExC_parse++;
10570                         isrel = 1;
10571                     }
10572                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10573                         if (isrel) RExC_parse--;
10574                         RExC_parse -= 2;
10575                         goto parse_named_seq;
10576                 }   }
10577                 num = atoi(RExC_parse);
10578                 if (isg && num == 0)
10579                     vFAIL("Reference to invalid group 0");
10580                 if (isrel) {
10581                     num = RExC_npar - num;
10582                     if (num < 1)
10583                         vFAIL("Reference to nonexistent or unclosed group");
10584                 }
10585                 if (!isg && num > 9 && num >= RExC_npar)
10586                     /* Probably a character specified in octal, e.g. \35 */
10587                     goto defchar;
10588                 else {
10589                     char * const parse_start = RExC_parse - 1; /* MJD */
10590                     while (isDIGIT(*RExC_parse))
10591                         RExC_parse++;
10592                     if (parse_start == RExC_parse - 1)
10593                         vFAIL("Unterminated \\g... pattern");
10594                     if (hasbrace) {
10595                         if (*RExC_parse != '}')
10596                             vFAIL("Unterminated \\g{...} pattern");
10597                         RExC_parse++;
10598                     }
10599                     if (!SIZE_ONLY) {
10600                         if (num > (I32)RExC_rx->nparens)
10601                             vFAIL("Reference to nonexistent group");
10602                     }
10603                     RExC_sawback = 1;
10604                     ret = reganode(pRExC_state,
10605                                    ((! FOLD)
10606                                      ? REF
10607                                      : (ASCII_FOLD_RESTRICTED)
10608                                        ? REFFA
10609                                        : (AT_LEAST_UNI_SEMANTICS)
10610                                          ? REFFU
10611                                          : (LOC)
10612                                            ? REFFL
10613                                            : REFF),
10614                                     num);
10615                     *flagp |= HASWIDTH;
10616
10617                     /* override incorrect value set in reganode MJD */
10618                     Set_Node_Offset(ret, parse_start+1);
10619                     Set_Node_Cur_Length(ret); /* MJD */
10620                     RExC_parse--;
10621                     nextchar(pRExC_state);
10622                 }
10623             }
10624             break;
10625         case '\0':
10626             if (RExC_parse >= RExC_end)
10627                 FAIL("Trailing \\");
10628             /* FALL THROUGH */
10629         default:
10630             /* Do not generate "unrecognized" warnings here, we fall
10631                back into the quick-grab loop below */
10632             parse_start--;
10633             goto defchar;
10634         }
10635         break;
10636
10637     case '#':
10638         if (RExC_flags & RXf_PMf_EXTENDED) {
10639             if ( reg_skipcomment( pRExC_state ) )
10640                 goto tryagain;
10641         }
10642         /* FALL THROUGH */
10643
10644     default:
10645
10646             parse_start = RExC_parse - 1;
10647
10648             RExC_parse++;
10649
10650         defchar: {
10651             STRLEN len = 0;
10652             UV ender;
10653             char *p;
10654             char *s;
10655 #define MAX_NODE_STRING_SIZE 127
10656             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10657             char *s0;
10658             U8 upper_parse = MAX_NODE_STRING_SIZE;
10659             STRLEN foldlen;
10660             U8 node_type;
10661             bool next_is_quantifier;
10662             char * oldp = NULL;
10663
10664             /* If a folding node contains only code points that don't
10665              * participate in folds, it can be changed into an EXACT node,
10666              * which allows the optimizer more things to look for */
10667             bool maybe_exact;
10668
10669             ender = 0;
10670             node_type = compute_EXACTish(pRExC_state);
10671             ret = reg_node(pRExC_state, node_type);
10672
10673             /* In pass1, folded, we use a temporary buffer instead of the
10674              * actual node, as the node doesn't exist yet */
10675             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10676
10677             s0 = s;
10678
10679         reparse:
10680
10681             /* We do the EXACTFish to EXACT node only if folding, and not if in
10682              * locale, as whether a character folds or not isn't known until
10683              * runtime */
10684             maybe_exact = FOLD && ! LOC;
10685
10686             /* XXX The node can hold up to 255 bytes, yet this only goes to
10687              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10688              * 255 allows us to not have to worry about overflow due to
10689              * converting to utf8 and fold expansion, but that value is
10690              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10691              * split up by this limit into a single one using the real max of
10692              * 255.  Even at 127, this breaks under rare circumstances.  If
10693              * folding, we do not want to split a node at a character that is a
10694              * non-final in a multi-char fold, as an input string could just
10695              * happen to want to match across the node boundary.  The join
10696              * would solve that problem if the join actually happens.  But a
10697              * series of more than two nodes in a row each of 127 would cause
10698              * the first join to succeed to get to 254, but then there wouldn't
10699              * be room for the next one, which could at be one of those split
10700              * multi-char folds.  I don't know of any fool-proof solution.  One
10701              * could back off to end with only a code point that isn't such a
10702              * non-final, but it is possible for there not to be any in the
10703              * entire node. */
10704             for (p = RExC_parse - 1;
10705                  len < upper_parse && p < RExC_end;
10706                  len++)
10707             {
10708                 oldp = p;
10709
10710                 if (RExC_flags & RXf_PMf_EXTENDED)
10711                     p = regwhite( pRExC_state, p );
10712                 switch ((U8)*p) {
10713                 case '^':
10714                 case '$':
10715                 case '.':
10716                 case '[':
10717                 case '(':
10718                 case ')':
10719                 case '|':
10720                     goto loopdone;
10721                 case '\\':
10722                     /* Literal Escapes Switch
10723
10724                        This switch is meant to handle escape sequences that
10725                        resolve to a literal character.
10726
10727                        Every escape sequence that represents something
10728                        else, like an assertion or a char class, is handled
10729                        in the switch marked 'Special Escapes' above in this
10730                        routine, but also has an entry here as anything that
10731                        isn't explicitly mentioned here will be treated as
10732                        an unescaped equivalent literal.
10733                     */
10734
10735                     switch ((U8)*++p) {
10736                     /* These are all the special escapes. */
10737                     case 'A':             /* Start assertion */
10738                     case 'b': case 'B':   /* Word-boundary assertion*/
10739                     case 'C':             /* Single char !DANGEROUS! */
10740                     case 'd': case 'D':   /* digit class */
10741                     case 'g': case 'G':   /* generic-backref, pos assertion */
10742                     case 'h': case 'H':   /* HORIZWS */
10743                     case 'k': case 'K':   /* named backref, keep marker */
10744                     case 'p': case 'P':   /* Unicode property */
10745                               case 'R':   /* LNBREAK */
10746                     case 's': case 'S':   /* space class */
10747                     case 'v': case 'V':   /* VERTWS */
10748                     case 'w': case 'W':   /* word class */
10749                     case 'X':             /* eXtended Unicode "combining character sequence" */
10750                     case 'z': case 'Z':   /* End of line/string assertion */
10751                         --p;
10752                         goto loopdone;
10753
10754                     /* Anything after here is an escape that resolves to a
10755                        literal. (Except digits, which may or may not)
10756                      */
10757                     case 'n':
10758                         ender = '\n';
10759                         p++;
10760                         break;
10761                     case 'N': /* Handle a single-code point named character. */
10762                         /* The options cause it to fail if a multiple code
10763                          * point sequence.  Handle those in the switch() above
10764                          * */
10765                         RExC_parse = p + 1;
10766                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10767                                             flagp, depth, FALSE,
10768                                             FALSE /* not strict */ ))
10769                         {
10770                             if (*flagp & RESTART_UTF8)
10771                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
10772                             RExC_parse = p = oldp;
10773                             goto loopdone;
10774                         }
10775                         p = RExC_parse;
10776                         if (ender > 0xff) {
10777                             REQUIRE_UTF8;
10778                         }
10779                         break;
10780                     case 'r':
10781                         ender = '\r';
10782                         p++;
10783                         break;
10784                     case 't':
10785                         ender = '\t';
10786                         p++;
10787                         break;
10788                     case 'f':
10789                         ender = '\f';
10790                         p++;
10791                         break;
10792                     case 'e':
10793                           ender = ASCII_TO_NATIVE('\033');
10794                         p++;
10795                         break;
10796                     case 'a':
10797                           ender = ASCII_TO_NATIVE('\007');
10798                         p++;
10799                         break;
10800                     case 'o':
10801                         {
10802                             UV result;
10803                             const char* error_msg;
10804
10805                             bool valid = grok_bslash_o(&p,
10806                                                        &result,
10807                                                        &error_msg,
10808                                                        TRUE, /* out warnings */
10809                                                        FALSE, /* not strict */
10810                                                        TRUE, /* Output warnings
10811                                                                 for non-
10812                                                                 portables */
10813                                                        UTF);
10814                             if (! valid) {
10815                                 RExC_parse = p; /* going to die anyway; point
10816                                                    to exact spot of failure */
10817                                 vFAIL(error_msg);
10818                             }
10819                             ender = result;
10820                             if (PL_encoding && ender < 0x100) {
10821                                 goto recode_encoding;
10822                             }
10823                             if (ender > 0xff) {
10824                                 REQUIRE_UTF8;
10825                             }
10826                             break;
10827                         }
10828                     case 'x':
10829                         {
10830                             UV result = UV_MAX; /* initialize to erroneous
10831                                                    value */
10832                             const char* error_msg;
10833
10834                             bool valid = grok_bslash_x(&p,
10835                                                        &result,
10836                                                        &error_msg,
10837                                                        TRUE, /* out warnings */
10838                                                        FALSE, /* not strict */
10839                                                        TRUE, /* Output warnings
10840                                                                 for non-
10841                                                                 portables */
10842                                                        UTF);
10843                             if (! valid) {
10844                                 RExC_parse = p; /* going to die anyway; point
10845                                                    to exact spot of failure */
10846                                 vFAIL(error_msg);
10847                             }
10848                             ender = result;
10849
10850                             if (PL_encoding && ender < 0x100) {
10851                                 goto recode_encoding;
10852                             }
10853                             if (ender > 0xff) {
10854                                 REQUIRE_UTF8;
10855                             }
10856                             break;
10857                         }
10858                     case 'c':
10859                         p++;
10860                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10861                         break;
10862                     case '0': case '1': case '2': case '3':case '4':
10863                     case '5': case '6': case '7':
10864                         if (*p == '0' ||
10865                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10866                         {
10867                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10868                             STRLEN numlen = 3;
10869                             ender = grok_oct(p, &numlen, &flags, NULL);
10870                             if (ender > 0xff) {
10871                                 REQUIRE_UTF8;
10872                             }
10873                             p += numlen;
10874                             if (SIZE_ONLY   /* like \08, \178 */
10875                                 && numlen < 3
10876                                 && p < RExC_end
10877                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10878                             {
10879                                 reg_warn_non_literal_string(
10880                                          p + 1,
10881                                          form_short_octal_warning(p, numlen));
10882                             }
10883                         }
10884                         else {  /* Not to be treated as an octal constant, go
10885                                    find backref */
10886                             --p;
10887                             goto loopdone;
10888                         }
10889                         if (PL_encoding && ender < 0x100)
10890                             goto recode_encoding;
10891                         break;
10892                     recode_encoding:
10893                         if (! RExC_override_recoding) {
10894                             SV* enc = PL_encoding;
10895                             ender = reg_recode((const char)(U8)ender, &enc);
10896                             if (!enc && SIZE_ONLY)
10897                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10898                             REQUIRE_UTF8;
10899                         }
10900                         break;
10901                     case '\0':
10902                         if (p >= RExC_end)
10903                             FAIL("Trailing \\");
10904                         /* FALL THROUGH */
10905                     default:
10906                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10907                             /* Include any { following the alpha to emphasize
10908                              * that it could be part of an escape at some point
10909                              * in the future */
10910                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10911                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10912                         }
10913                         goto normal_default;
10914                     } /* End of switch on '\' */
10915                     break;
10916                 default:    /* A literal character */
10917
10918                     if (! SIZE_ONLY
10919                         && RExC_flags & RXf_PMf_EXTENDED
10920                         && ckWARN(WARN_DEPRECATED)
10921                         && is_PATWS_non_low(p, UTF))
10922                     {
10923                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
10924                                 "Escape literal pattern white space under /x");
10925                     }
10926
10927                   normal_default:
10928                     if (UTF8_IS_START(*p) && UTF) {
10929                         STRLEN numlen;
10930                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10931                                                &numlen, UTF8_ALLOW_DEFAULT);
10932                         p += numlen;
10933                     }
10934                     else
10935                         ender = (U8) *p++;
10936                     break;
10937                 } /* End of switch on the literal */
10938
10939                 /* Here, have looked at the literal character and <ender>
10940                  * contains its ordinal, <p> points to the character after it
10941                  */
10942
10943                 if ( RExC_flags & RXf_PMf_EXTENDED)
10944                     p = regwhite( pRExC_state, p );
10945
10946                 /* If the next thing is a quantifier, it applies to this
10947                  * character only, which means that this character has to be in
10948                  * its own node and can't just be appended to the string in an
10949                  * existing node, so if there are already other characters in
10950                  * the node, close the node with just them, and set up to do
10951                  * this character again next time through, when it will be the
10952                  * only thing in its new node */
10953                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10954                 {
10955                     p = oldp;
10956                     goto loopdone;
10957                 }
10958
10959                 if (FOLD) {
10960                     if (UTF
10961                             /* See comments for join_exact() as to why we fold
10962                              * this non-UTF at compile time */
10963                         || (node_type == EXACTFU
10964                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10965                     {
10966
10967
10968                         /* Prime the casefolded buffer.  Locale rules, which
10969                          * apply only to code points < 256, aren't known until
10970                          * execution, so for them, just output the original
10971                          * character using utf8.  If we start to fold non-UTF
10972                          * patterns, be sure to update join_exact() */
10973                         if (LOC && ender < 256) {
10974                             if (UNI_IS_INVARIANT(ender)) {
10975                                 *s = (U8) ender;
10976                                 foldlen = 1;
10977                             } else {
10978                                 *s = UTF8_TWO_BYTE_HI(ender);
10979                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10980                                 foldlen = 2;
10981                             }
10982                         }
10983                         else {
10984                             UV folded = _to_uni_fold_flags(
10985                                            ender,
10986                                            (U8 *) s,
10987                                            &foldlen,
10988                                            FOLD_FLAGS_FULL
10989                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10990                                                     : (ASCII_FOLD_RESTRICTED)
10991                                                       ? FOLD_FLAGS_NOMIX_ASCII
10992                                                       : 0)
10993                                             );
10994
10995                             /* If this node only contains non-folding code
10996                              * points so far, see if this new one is also
10997                              * non-folding */
10998                             if (maybe_exact) {
10999                                 if (folded != ender) {
11000                                     maybe_exact = FALSE;
11001                                 }
11002                                 else {
11003                                     /* Here the fold is the original; we have
11004                                      * to check further to see if anything
11005                                      * folds to it */
11006                                     if (! PL_utf8_foldable) {
11007                                         SV* swash = swash_init("utf8",
11008                                                            "_Perl_Any_Folds",
11009                                                            &PL_sv_undef, 1, 0);
11010                                         PL_utf8_foldable =
11011                                                     _get_swash_invlist(swash);
11012                                         SvREFCNT_dec_NN(swash);
11013                                     }
11014                                     if (_invlist_contains_cp(PL_utf8_foldable,
11015                                                              ender))
11016                                     {
11017                                         maybe_exact = FALSE;
11018                                     }
11019                                 }
11020                             }
11021                             ender = folded;
11022                         }
11023                         s += foldlen;
11024
11025                         /* The loop increments <len> each time, as all but this
11026                          * path (and the one just below for UTF) through it add
11027                          * a single byte to the EXACTish node.  But this one
11028                          * has changed len to be the correct final value, so
11029                          * subtract one to cancel out the increment that
11030                          * follows */
11031                         len += foldlen - 1;
11032                     }
11033                     else {
11034                         *(s++) = (char) ender;
11035                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
11036                     }
11037                 }
11038                 else if (UTF) {
11039                     const STRLEN unilen = reguni(pRExC_state, ender, s);
11040                     if (unilen > 0) {
11041                        s   += unilen;
11042                        len += unilen;
11043                     }
11044
11045                     /* See comment just above for - 1 */
11046                     len--;
11047                 }
11048                 else {
11049                     REGC((char)ender, s++);
11050                 }
11051
11052                 if (next_is_quantifier) {
11053
11054                     /* Here, the next input is a quantifier, and to get here,
11055                      * the current character is the only one in the node.
11056                      * Also, here <len> doesn't include the final byte for this
11057                      * character */
11058                     len++;
11059                     goto loopdone;
11060                 }
11061
11062             } /* End of loop through literal characters */
11063
11064             /* Here we have either exhausted the input or ran out of room in
11065              * the node.  (If we encountered a character that can't be in the
11066              * node, transfer is made directly to <loopdone>, and so we
11067              * wouldn't have fallen off the end of the loop.)  In the latter
11068              * case, we artificially have to split the node into two, because
11069              * we just don't have enough space to hold everything.  This
11070              * creates a problem if the final character participates in a
11071              * multi-character fold in the non-final position, as a match that
11072              * should have occurred won't, due to the way nodes are matched,
11073              * and our artificial boundary.  So back off until we find a non-
11074              * problematic character -- one that isn't at the beginning or
11075              * middle of such a fold.  (Either it doesn't participate in any
11076              * folds, or appears only in the final position of all the folds it
11077              * does participate in.)  A better solution with far fewer false
11078              * positives, and that would fill the nodes more completely, would
11079              * be to actually have available all the multi-character folds to
11080              * test against, and to back-off only far enough to be sure that
11081              * this node isn't ending with a partial one.  <upper_parse> is set
11082              * further below (if we need to reparse the node) to include just
11083              * up through that final non-problematic character that this code
11084              * identifies, so when it is set to less than the full node, we can
11085              * skip the rest of this */
11086             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11087
11088                 const STRLEN full_len = len;
11089
11090                 assert(len >= MAX_NODE_STRING_SIZE);
11091
11092                 /* Here, <s> points to the final byte of the final character.
11093                  * Look backwards through the string until find a non-
11094                  * problematic character */
11095
11096                 if (! UTF) {
11097
11098                     /* These two have no multi-char folds to non-UTF characters
11099                      */
11100                     if (ASCII_FOLD_RESTRICTED || LOC) {
11101                         goto loopdone;
11102                     }
11103
11104                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11105                     len = s - s0 + 1;
11106                 }
11107                 else {
11108                     if (!  PL_NonL1NonFinalFold) {
11109                         PL_NonL1NonFinalFold = _new_invlist_C_array(
11110                                         NonL1_Perl_Non_Final_Folds_invlist);
11111                     }
11112
11113                     /* Point to the first byte of the final character */
11114                     s = (char *) utf8_hop((U8 *) s, -1);
11115
11116                     while (s >= s0) {   /* Search backwards until find
11117                                            non-problematic char */
11118                         if (UTF8_IS_INVARIANT(*s)) {
11119
11120                             /* There are no ascii characters that participate
11121                              * in multi-char folds under /aa.  In EBCDIC, the
11122                              * non-ascii invariants are all control characters,
11123                              * so don't ever participate in any folds. */
11124                             if (ASCII_FOLD_RESTRICTED
11125                                 || ! IS_NON_FINAL_FOLD(*s))
11126                             {
11127                                 break;
11128                             }
11129                         }
11130                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11131
11132                             /* No Latin1 characters participate in multi-char
11133                              * folds under /l */
11134                             if (LOC
11135                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11136                                                                 *s, *(s+1))))
11137                             {
11138                                 break;
11139                             }
11140                         }
11141                         else if (! _invlist_contains_cp(
11142                                         PL_NonL1NonFinalFold,
11143                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
11144                         {
11145                             break;
11146                         }
11147
11148                         /* Here, the current character is problematic in that
11149                          * it does occur in the non-final position of some
11150                          * fold, so try the character before it, but have to
11151                          * special case the very first byte in the string, so
11152                          * we don't read outside the string */
11153                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11154                     } /* End of loop backwards through the string */
11155
11156                     /* If there were only problematic characters in the string,
11157                      * <s> will point to before s0, in which case the length
11158                      * should be 0, otherwise include the length of the
11159                      * non-problematic character just found */
11160                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11161                 }
11162
11163                 /* Here, have found the final character, if any, that is
11164                  * non-problematic as far as ending the node without splitting
11165                  * it across a potential multi-char fold.  <len> contains the
11166                  * number of bytes in the node up-to and including that
11167                  * character, or is 0 if there is no such character, meaning
11168                  * the whole node contains only problematic characters.  In
11169                  * this case, give up and just take the node as-is.  We can't
11170                  * do any better */
11171                 if (len == 0) {
11172                     len = full_len;
11173                 } else {
11174
11175                     /* Here, the node does contain some characters that aren't
11176                      * problematic.  If one such is the final character in the
11177                      * node, we are done */
11178                     if (len == full_len) {
11179                         goto loopdone;
11180                     }
11181                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11182
11183                         /* If the final character is problematic, but the
11184                          * penultimate is not, back-off that last character to
11185                          * later start a new node with it */
11186                         p = oldp;
11187                         goto loopdone;
11188                     }
11189
11190                     /* Here, the final non-problematic character is earlier
11191                      * in the input than the penultimate character.  What we do
11192                      * is reparse from the beginning, going up only as far as
11193                      * this final ok one, thus guaranteeing that the node ends
11194                      * in an acceptable character.  The reason we reparse is
11195                      * that we know how far in the character is, but we don't
11196                      * know how to correlate its position with the input parse.
11197                      * An alternate implementation would be to build that
11198                      * correlation as we go along during the original parse,
11199                      * but that would entail extra work for every node, whereas
11200                      * this code gets executed only when the string is too
11201                      * large for the node, and the final two characters are
11202                      * problematic, an infrequent occurrence.  Yet another
11203                      * possible strategy would be to save the tail of the
11204                      * string, and the next time regatom is called, initialize
11205                      * with that.  The problem with this is that unless you
11206                      * back off one more character, you won't be guaranteed
11207                      * regatom will get called again, unless regbranch,
11208                      * regpiece ... are also changed.  If you do back off that
11209                      * extra character, so that there is input guaranteed to
11210                      * force calling regatom, you can't handle the case where
11211                      * just the first character in the node is acceptable.  I
11212                      * (khw) decided to try this method which doesn't have that
11213                      * pitfall; if performance issues are found, we can do a
11214                      * combination of the current approach plus that one */
11215                     upper_parse = len;
11216                     len = 0;
11217                     s = s0;
11218                     goto reparse;
11219                 }
11220             }   /* End of verifying node ends with an appropriate char */
11221
11222         loopdone:   /* Jumped to when encounters something that shouldn't be in
11223                        the node */
11224
11225             /* If 'maybe_exact' is still set here, means there are no
11226              * code points in the node that participate in folds */
11227             if (FOLD && maybe_exact) {
11228                 OP(ret) = EXACT;
11229             }
11230
11231             /* I (khw) don't know if you can get here with zero length, but the
11232              * old code handled this situation by creating a zero-length EXACT
11233              * node.  Might as well be NOTHING instead */
11234             if (len == 0) {
11235                 OP(ret) = NOTHING;
11236             }
11237             else{
11238                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11239             }
11240
11241             RExC_parse = p - 1;
11242             Set_Node_Cur_Length(ret); /* MJD */
11243             nextchar(pRExC_state);
11244             {
11245                 /* len is STRLEN which is unsigned, need to copy to signed */
11246                 IV iv = len;
11247                 if (iv < 0)
11248                     vFAIL("Internal disaster");
11249             }
11250
11251         } /* End of label 'defchar:' */
11252         break;
11253     } /* End of giant switch on input character */
11254
11255     return(ret);
11256 }
11257
11258 STATIC char *
11259 S_regwhite( RExC_state_t *pRExC_state, char *p )
11260 {
11261     const char *e = RExC_end;
11262
11263     PERL_ARGS_ASSERT_REGWHITE;
11264
11265     while (p < e) {
11266         if (isSPACE(*p))
11267             ++p;
11268         else if (*p == '#') {
11269             bool ended = 0;
11270             do {
11271                 if (*p++ == '\n') {
11272                     ended = 1;
11273                     break;
11274                 }
11275             } while (p < e);
11276             if (!ended)
11277                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11278         }
11279         else
11280             break;
11281     }
11282     return p;
11283 }
11284
11285 STATIC char *
11286 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11287 {
11288     /* Returns the next non-pattern-white space, non-comment character (the
11289      * latter only if 'recognize_comment is true) in the string p, which is
11290      * ended by RExC_end.  If there is no line break ending a comment,
11291      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11292     const char *e = RExC_end;
11293
11294     PERL_ARGS_ASSERT_REGPATWS;
11295
11296     while (p < e) {
11297         STRLEN len;
11298         if ((len = is_PATWS_safe(p, e, UTF))) {
11299             p += len;
11300         }
11301         else if (recognize_comment && *p == '#') {
11302             bool ended = 0;
11303             do {
11304                 p++;
11305                 if (is_LNBREAK_safe(p, e, UTF)) {
11306                     ended = 1;
11307                     break;
11308                 }
11309             } while (p < e);
11310             if (!ended)
11311                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11312         }
11313         else
11314             break;
11315     }
11316     return p;
11317 }
11318
11319 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11320    Character classes ([:foo:]) can also be negated ([:^foo:]).
11321    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11322    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11323    but trigger failures because they are currently unimplemented. */
11324
11325 #define POSIXCC_DONE(c)   ((c) == ':')
11326 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11327 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11328
11329 PERL_STATIC_INLINE I32
11330 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11331 {
11332     dVAR;
11333     I32 namedclass = OOB_NAMEDCLASS;
11334
11335     PERL_ARGS_ASSERT_REGPPOSIXCC;
11336
11337     if (value == '[' && RExC_parse + 1 < RExC_end &&
11338         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11339         POSIXCC(UCHARAT(RExC_parse)))
11340     {
11341         const char c = UCHARAT(RExC_parse);
11342         char* const s = RExC_parse++;
11343
11344         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11345             RExC_parse++;
11346         if (RExC_parse == RExC_end) {
11347             if (strict) {
11348
11349                 /* Try to give a better location for the error (than the end of
11350                  * the string) by looking for the matching ']' */
11351                 RExC_parse = s;
11352                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11353                     RExC_parse++;
11354                 }
11355                 vFAIL2("Unmatched '%c' in POSIX class", c);
11356             }
11357             /* Grandfather lone [:, [=, [. */
11358             RExC_parse = s;
11359         }
11360         else {
11361             const char* const t = RExC_parse++; /* skip over the c */
11362             assert(*t == c);
11363
11364             if (UCHARAT(RExC_parse) == ']') {
11365                 const char *posixcc = s + 1;
11366                 RExC_parse++; /* skip over the ending ] */
11367
11368                 if (*s == ':') {
11369                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11370                     const I32 skip = t - posixcc;
11371
11372                     /* Initially switch on the length of the name.  */
11373                     switch (skip) {
11374                     case 4:
11375                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11376                                                           this is the Perl \w
11377                                                         */
11378                             namedclass = ANYOF_WORDCHAR;
11379                         break;
11380                     case 5:
11381                         /* Names all of length 5.  */
11382                         /* alnum alpha ascii blank cntrl digit graph lower
11383                            print punct space upper  */
11384                         /* Offset 4 gives the best switch position.  */
11385                         switch (posixcc[4]) {
11386                         case 'a':
11387                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11388                                 namedclass = ANYOF_ALPHA;
11389                             break;
11390                         case 'e':
11391                             if (memEQ(posixcc, "spac", 4)) /* space */
11392                                 namedclass = ANYOF_PSXSPC;
11393                             break;
11394                         case 'h':
11395                             if (memEQ(posixcc, "grap", 4)) /* graph */
11396                                 namedclass = ANYOF_GRAPH;
11397                             break;
11398                         case 'i':
11399                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11400                                 namedclass = ANYOF_ASCII;
11401                             break;
11402                         case 'k':
11403                             if (memEQ(posixcc, "blan", 4)) /* blank */
11404                                 namedclass = ANYOF_BLANK;
11405                             break;
11406                         case 'l':
11407                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11408                                 namedclass = ANYOF_CNTRL;
11409                             break;
11410                         case 'm':
11411                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11412                                 namedclass = ANYOF_ALPHANUMERIC;
11413                             break;
11414                         case 'r':
11415                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11416                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11417                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11418                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11419                             break;
11420                         case 't':
11421                             if (memEQ(posixcc, "digi", 4)) /* digit */
11422                                 namedclass = ANYOF_DIGIT;
11423                             else if (memEQ(posixcc, "prin", 4)) /* print */
11424                                 namedclass = ANYOF_PRINT;
11425                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11426                                 namedclass = ANYOF_PUNCT;
11427                             break;
11428                         }
11429                         break;
11430                     case 6:
11431                         if (memEQ(posixcc, "xdigit", 6))
11432                             namedclass = ANYOF_XDIGIT;
11433                         break;
11434                     }
11435
11436                     if (namedclass == OOB_NAMEDCLASS)
11437                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11438                                       t - s - 1, s + 1);
11439
11440                     /* The #defines are structured so each complement is +1 to
11441                      * the normal one */
11442                     if (complement) {
11443                         namedclass++;
11444                     }
11445                     assert (posixcc[skip] == ':');
11446                     assert (posixcc[skip+1] == ']');
11447                 } else if (!SIZE_ONLY) {
11448                     /* [[=foo=]] and [[.foo.]] are still future. */
11449
11450                     /* adjust RExC_parse so the warning shows after
11451                        the class closes */
11452                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11453                         RExC_parse++;
11454                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11455                 }
11456             } else {
11457                 /* Maternal grandfather:
11458                  * "[:" ending in ":" but not in ":]" */
11459                 if (strict) {
11460                     vFAIL("Unmatched '[' in POSIX class");
11461                 }
11462
11463                 /* Grandfather lone [:, [=, [. */
11464                 RExC_parse = s;
11465             }
11466         }
11467     }
11468
11469     return namedclass;
11470 }
11471
11472 STATIC bool
11473 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11474 {
11475     /* This applies some heuristics at the current parse position (which should
11476      * be at a '[') to see if what follows might be intended to be a [:posix:]
11477      * class.  It returns true if it really is a posix class, of course, but it
11478      * also can return true if it thinks that what was intended was a posix
11479      * class that didn't quite make it.
11480      *
11481      * It will return true for
11482      *      [:alphanumerics:
11483      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11484      *                         ')' indicating the end of the (?[
11485      *      [:any garbage including %^&$ punctuation:]
11486      *
11487      * This is designed to be called only from S_handle_regex_sets; it could be
11488      * easily adapted to be called from the spot at the beginning of regclass()
11489      * that checks to see in a normal bracketed class if the surrounding []
11490      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11491      * change long-standing behavior, so I (khw) didn't do that */
11492     char* p = RExC_parse + 1;
11493     char first_char = *p;
11494
11495     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11496
11497     assert(*(p - 1) == '[');
11498
11499     if (! POSIXCC(first_char)) {
11500         return FALSE;
11501     }
11502
11503     p++;
11504     while (p < RExC_end && isWORDCHAR(*p)) p++;
11505
11506     if (p >= RExC_end) {
11507         return FALSE;
11508     }
11509
11510     if (p - RExC_parse > 2    /* Got at least 1 word character */
11511         && (*p == first_char
11512             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11513     {
11514         return TRUE;
11515     }
11516
11517     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11518
11519     return (p
11520             && p - RExC_parse > 2 /* [:] evaluates to colon;
11521                                       [::] is a bad posix class. */
11522             && first_char == *(p - 1));
11523 }
11524
11525 STATIC regnode *
11526 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11527                    char * const oregcomp_parse)
11528 {
11529     /* Handle the (?[...]) construct to do set operations */
11530
11531     U8 curchar;
11532     UV start, end;      /* End points of code point ranges */
11533     SV* result_string;
11534     char *save_end, *save_parse;
11535     SV* final;
11536     STRLEN len;
11537     regnode* node;
11538     AV* stack;
11539     const bool save_fold = FOLD;
11540
11541     GET_RE_DEBUG_FLAGS_DECL;
11542
11543     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11544
11545     if (LOC) {
11546         vFAIL("(?[...]) not valid in locale");
11547     }
11548     RExC_uni_semantics = 1;
11549
11550     /* This will return only an ANYOF regnode, or (unlikely) something smaller
11551      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11552      * call regclass to handle '[]' so as to not have to reinvent its parsing
11553      * rules here (throwing away the size it computes each time).  And, we exit
11554      * upon an unescaped ']' that isn't one ending a regclass.  To do both
11555      * these things, we need to realize that something preceded by a backslash
11556      * is escaped, so we have to keep track of backslashes */
11557     if (SIZE_ONLY) {
11558
11559         Perl_ck_warner_d(aTHX_
11560             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11561             "The regex_sets feature is experimental" REPORT_LOCATION,
11562             (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11563
11564         while (RExC_parse < RExC_end) {
11565             SV* current = NULL;
11566             RExC_parse = regpatws(pRExC_state, RExC_parse,
11567                                 TRUE); /* means recognize comments */
11568             switch (*RExC_parse) {
11569                 default:
11570                     break;
11571                 case '\\':
11572                     /* Skip the next byte (which could cause us to end up in
11573                      * the middle of a UTF-8 character, but since none of those
11574                      * are confusable with anything we currently handle in this
11575                      * switch (invariants all), it's safe.  We'll just hit the
11576                      * default: case next time and keep on incrementing until
11577                      * we find one of the invariants we do handle. */
11578                     RExC_parse++;
11579                     break;
11580                 case '[':
11581                 {
11582                     /* If this looks like it is a [:posix:] class, leave the
11583                      * parse pointer at the '[' to fool regclass() into
11584                      * thinking it is part of a '[[:posix:]]'.  That function
11585                      * will use strict checking to force a syntax error if it
11586                      * doesn't work out to a legitimate class */
11587                     bool is_posix_class
11588                                     = could_it_be_a_POSIX_class(pRExC_state);
11589                     if (! is_posix_class) {
11590                         RExC_parse++;
11591                     }
11592
11593                     /* regclass() can only return RESTART_UTF8 if multi-char
11594                        folds are allowed.  */
11595                     if (!regclass(pRExC_state, flagp,depth+1,
11596                                   is_posix_class, /* parse the whole char
11597                                                      class only if not a
11598                                                      posix class */
11599                                   FALSE, /* don't allow multi-char folds */
11600                                   TRUE, /* silence non-portable warnings. */
11601                                   &current))
11602                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11603                               *flagp);
11604
11605                     /* function call leaves parse pointing to the ']', except
11606                      * if we faked it */
11607                     if (is_posix_class) {
11608                         RExC_parse--;
11609                     }
11610
11611                     SvREFCNT_dec(current);   /* In case it returned something */
11612                     break;
11613                 }
11614
11615                 case ']':
11616                     RExC_parse++;
11617                     if (RExC_parse < RExC_end
11618                         && *RExC_parse == ')')
11619                     {
11620                         node = reganode(pRExC_state, ANYOF, 0);
11621                         RExC_size += ANYOF_SKIP;
11622                         nextchar(pRExC_state);
11623                         Set_Node_Length(node,
11624                                 RExC_parse - oregcomp_parse + 1); /* MJD */
11625                         return node;
11626                     }
11627                     goto no_close;
11628             }
11629             RExC_parse++;
11630         }
11631
11632         no_close:
11633         FAIL("Syntax error in (?[...])");
11634     }
11635
11636     /* Pass 2 only after this.  Everything in this construct is a
11637      * metacharacter.  Operands begin with either a '\' (for an escape
11638      * sequence), or a '[' for a bracketed character class.  Any other
11639      * character should be an operator, or parenthesis for grouping.  Both
11640      * types of operands are handled by calling regclass() to parse them.  It
11641      * is called with a parameter to indicate to return the computed inversion
11642      * list.  The parsing here is implemented via a stack.  Each entry on the
11643      * stack is a single character representing one of the operators, or the
11644      * '('; or else a pointer to an operand inversion list. */
11645
11646 #define IS_OPERAND(a)  (! SvIOK(a))
11647
11648     /* The stack starts empty.  It is a syntax error if the first thing parsed
11649      * is a binary operator; everything else is pushed on the stack.  When an
11650      * operand is parsed, the top of the stack is examined.  If it is a binary
11651      * operator, the item before it should be an operand, and both are replaced
11652      * by the result of doing that operation on the new operand and the one on
11653      * the stack.   Thus a sequence of binary operands is reduced to a single
11654      * one before the next one is parsed.
11655      *
11656      * A unary operator may immediately follow a binary in the input, for
11657      * example
11658      *      [a] + ! [b]
11659      * When an operand is parsed and the top of the stack is a unary operator,
11660      * the operation is performed, and then the stack is rechecked to see if
11661      * this new operand is part of a binary operation; if so, it is handled as
11662      * above.
11663      *
11664      * A '(' is simply pushed on the stack; it is valid only if the stack is
11665      * empty, or the top element of the stack is an operator or another '('
11666      * (for which the parenthesized expression will become an operand).  By the
11667      * time the corresponding ')' is parsed everything in between should have
11668      * been parsed and evaluated to a single operand (or else is a syntax
11669      * error), and is handled as a regular operand */
11670
11671     stack = newAV();
11672
11673     while (RExC_parse < RExC_end) {
11674         I32 top_index = av_tindex(stack);
11675         SV** top_ptr;
11676         SV* current = NULL;
11677
11678         /* Skip white space */
11679         RExC_parse = regpatws(pRExC_state, RExC_parse,
11680                                 TRUE); /* means recognize comments */
11681         if (RExC_parse >= RExC_end) {
11682             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11683         }
11684         if ((curchar = UCHARAT(RExC_parse)) == ']') {
11685             break;
11686         }
11687
11688         switch (curchar) {
11689
11690             case '?':
11691                 if (av_tindex(stack) >= 0   /* This makes sure that we can
11692                                                safely subtract 1 from
11693                                                RExC_parse in the next clause.
11694                                                If we have something on the
11695                                                stack, we have parsed something
11696                                              */
11697                     && UCHARAT(RExC_parse - 1) == '('
11698                     && RExC_parse < RExC_end)
11699                 {
11700                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11701                      * This happens when we have some thing like
11702                      *
11703                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11704                      *   ...
11705                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
11706                      *
11707                      * Here we would be handling the interpolated
11708                      * '$thai_or_lao'.  We handle this by a recursive call to
11709                      * ourselves which returns the inversion list the
11710                      * interpolated expression evaluates to.  We use the flags
11711                      * from the interpolated pattern. */
11712                     U32 save_flags = RExC_flags;
11713                     const char * const save_parse = ++RExC_parse;
11714
11715                     parse_lparen_question_flags(pRExC_state);
11716
11717                     if (RExC_parse == save_parse  /* Makes sure there was at
11718                                                      least one flag (or this
11719                                                      embedding wasn't compiled)
11720                                                    */
11721                         || RExC_parse >= RExC_end - 4
11722                         || UCHARAT(RExC_parse) != ':'
11723                         || UCHARAT(++RExC_parse) != '('
11724                         || UCHARAT(++RExC_parse) != '?'
11725                         || UCHARAT(++RExC_parse) != '[')
11726                     {
11727
11728                         /* In combination with the above, this moves the
11729                          * pointer to the point just after the first erroneous
11730                          * character (or if there are no flags, to where they
11731                          * should have been) */
11732                         if (RExC_parse >= RExC_end - 4) {
11733                             RExC_parse = RExC_end;
11734                         }
11735                         else if (RExC_parse != save_parse) {
11736                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11737                         }
11738                         vFAIL("Expecting '(?flags:(?[...'");
11739                     }
11740                     RExC_parse++;
11741                     (void) handle_regex_sets(pRExC_state, &current, flagp,
11742                                                     depth+1, oregcomp_parse);
11743
11744                     /* Here, 'current' contains the embedded expression's
11745                      * inversion list, and RExC_parse points to the trailing
11746                      * ']'; the next character should be the ')' which will be
11747                      * paired with the '(' that has been put on the stack, so
11748                      * the whole embedded expression reduces to '(operand)' */
11749                     RExC_parse++;
11750
11751                     RExC_flags = save_flags;
11752                     goto handle_operand;
11753                 }
11754                 /* FALL THROUGH */
11755
11756             default:
11757                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11758                 vFAIL("Unexpected character");
11759
11760             case '\\':
11761                 /* regclass() can only return RESTART_UTF8 if multi-char
11762                    folds are allowed.  */
11763                 if (!regclass(pRExC_state, flagp,depth+1,
11764                               TRUE, /* means parse just the next thing */
11765                               FALSE, /* don't allow multi-char folds */
11766                               FALSE, /* don't silence non-portable warnings.  */
11767                               &current))
11768                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11769                           *flagp);
11770                 /* regclass() will return with parsing just the \ sequence,
11771                  * leaving the parse pointer at the next thing to parse */
11772                 RExC_parse--;
11773                 goto handle_operand;
11774
11775             case '[':   /* Is a bracketed character class */
11776             {
11777                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11778
11779                 if (! is_posix_class) {
11780                     RExC_parse++;
11781                 }
11782
11783                 /* regclass() can only return RESTART_UTF8 if multi-char
11784                    folds are allowed.  */
11785                 if(!regclass(pRExC_state, flagp,depth+1,
11786                              is_posix_class, /* parse the whole char class
11787                                                 only if not a posix class */
11788                              FALSE, /* don't allow multi-char folds */
11789                              FALSE, /* don't silence non-portable warnings.  */
11790                              &current))
11791                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11792                           *flagp);
11793                 /* function call leaves parse pointing to the ']', except if we
11794                  * faked it */
11795                 if (is_posix_class) {
11796                     RExC_parse--;
11797                 }
11798
11799                 goto handle_operand;
11800             }
11801
11802             case '&':
11803             case '|':
11804             case '+':
11805             case '-':
11806             case '^':
11807                 if (top_index < 0
11808                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11809                     || ! IS_OPERAND(*top_ptr))
11810                 {
11811                     RExC_parse++;
11812                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11813                 }
11814                 av_push(stack, newSVuv(curchar));
11815                 break;
11816
11817             case '!':
11818                 av_push(stack, newSVuv(curchar));
11819                 break;
11820
11821             case '(':
11822                 if (top_index >= 0) {
11823                     top_ptr = av_fetch(stack, top_index, FALSE);
11824                     assert(top_ptr);
11825                     if (IS_OPERAND(*top_ptr)) {
11826                         RExC_parse++;
11827                         vFAIL("Unexpected '(' with no preceding operator");
11828                     }
11829                 }
11830                 av_push(stack, newSVuv(curchar));
11831                 break;
11832
11833             case ')':
11834             {
11835                 SV* lparen;
11836                 if (top_index < 1
11837                     || ! (current = av_pop(stack))
11838                     || ! IS_OPERAND(current)
11839                     || ! (lparen = av_pop(stack))
11840                     || IS_OPERAND(lparen)
11841                     || SvUV(lparen) != '(')
11842                 {
11843                     RExC_parse++;
11844                     vFAIL("Unexpected ')'");
11845                 }
11846                 top_index -= 2;
11847                 SvREFCNT_dec_NN(lparen);
11848
11849                 /* FALL THROUGH */
11850             }
11851
11852               handle_operand:
11853
11854                 /* Here, we have an operand to process, in 'current' */
11855
11856                 if (top_index < 0) {    /* Just push if stack is empty */
11857                     av_push(stack, current);
11858                 }
11859                 else {
11860                     SV* top = av_pop(stack);
11861                     char current_operator;
11862
11863                     if (IS_OPERAND(top)) {
11864                         vFAIL("Operand with no preceding operator");
11865                     }
11866                     current_operator = (char) SvUV(top);
11867                     switch (current_operator) {
11868                         case '(':   /* Push the '(' back on followed by the new
11869                                        operand */
11870                             av_push(stack, top);
11871                             av_push(stack, current);
11872                             SvREFCNT_inc(top);  /* Counters the '_dec' done
11873                                                    just after the 'break', so
11874                                                    it doesn't get wrongly freed
11875                                                  */
11876                             break;
11877
11878                         case '!':
11879                             _invlist_invert(current);
11880
11881                             /* Unlike binary operators, the top of the stack,
11882                              * now that this unary one has been popped off, may
11883                              * legally be an operator, and we now have operand
11884                              * for it. */
11885                             top_index--;
11886                             SvREFCNT_dec_NN(top);
11887                             goto handle_operand;
11888
11889                         case '&':
11890                             _invlist_intersection(av_pop(stack),
11891                                                    current,
11892                                                    &current);
11893                             av_push(stack, current);
11894                             break;
11895
11896                         case '|':
11897                         case '+':
11898                             _invlist_union(av_pop(stack), current, &current);
11899                             av_push(stack, current);
11900                             break;
11901
11902                         case '-':
11903                             _invlist_subtract(av_pop(stack), current, &current);
11904                             av_push(stack, current);
11905                             break;
11906
11907                         case '^':   /* The union minus the intersection */
11908                         {
11909                             SV* i = NULL;
11910                             SV* u = NULL;
11911                             SV* element;
11912
11913                             element = av_pop(stack);
11914                             _invlist_union(element, current, &u);
11915                             _invlist_intersection(element, current, &i);
11916                             _invlist_subtract(u, i, &current);
11917                             av_push(stack, current);
11918                             SvREFCNT_dec_NN(i);
11919                             SvREFCNT_dec_NN(u);
11920                             SvREFCNT_dec_NN(element);
11921                             break;
11922                         }
11923
11924                         default:
11925                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11926                 }
11927                 SvREFCNT_dec_NN(top);
11928             }
11929         }
11930
11931         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11932     }
11933
11934     if (av_tindex(stack) < 0   /* Was empty */
11935         || ((final = av_pop(stack)) == NULL)
11936         || ! IS_OPERAND(final)
11937         || av_tindex(stack) >= 0)  /* More left on stack */
11938     {
11939         vFAIL("Incomplete expression within '(?[ ])'");
11940     }
11941
11942     /* Here, 'final' is the resultant inversion list from evaluating the
11943      * expression.  Return it if so requested */
11944     if (return_invlist) {
11945         *return_invlist = final;
11946         return END;
11947     }
11948
11949     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
11950      * expecting a string of ranges and individual code points */
11951     invlist_iterinit(final);
11952     result_string = newSVpvs("");
11953     while (invlist_iternext(final, &start, &end)) {
11954         if (start == end) {
11955             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
11956         }
11957         else {
11958             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
11959                                                      start,          end);
11960         }
11961     }
11962
11963     save_parse = RExC_parse;
11964     RExC_parse = SvPV(result_string, len);
11965     save_end = RExC_end;
11966     RExC_end = RExC_parse + len;
11967
11968     /* We turn off folding around the call, as the class we have constructed
11969      * already has all folding taken into consideration, and we don't want
11970      * regclass() to add to that */
11971     RExC_flags &= ~RXf_PMf_FOLD;
11972     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
11973      */
11974     node = regclass(pRExC_state, flagp,depth+1,
11975                     FALSE, /* means parse the whole char class */
11976                     FALSE, /* don't allow multi-char folds */
11977                     TRUE, /* silence non-portable warnings.  The above may very
11978                              well have generated non-portable code points, but
11979                              they're valid on this machine */
11980                     NULL);
11981     if (!node)
11982         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X", flagp);
11983     if (save_fold) {
11984         RExC_flags |= RXf_PMf_FOLD;
11985     }
11986     RExC_parse = save_parse + 1;
11987     RExC_end = save_end;
11988     SvREFCNT_dec_NN(final);
11989     SvREFCNT_dec_NN(result_string);
11990     SvREFCNT_dec_NN(stack);
11991
11992     nextchar(pRExC_state);
11993     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
11994     return node;
11995 }
11996 #undef IS_OPERAND
11997
11998 /* The names of properties whose definitions are not known at compile time are
11999  * stored in this SV, after a constant heading.  So if the length has been
12000  * changed since initialization, then there is a run-time definition. */
12001 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12002
12003 STATIC regnode *
12004 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12005                  const bool stop_at_1,  /* Just parse the next thing, don't
12006                                            look for a full character class */
12007                  bool allow_multi_folds,
12008                  const bool silence_non_portable,   /* Don't output warnings
12009                                                        about too large
12010                                                        characters */
12011                  SV** ret_invlist)  /* Return an inversion list, not a node */
12012 {
12013     /* parse a bracketed class specification.  Most of these will produce an
12014      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12015      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12016      * under /i with multi-character folds: it will be rewritten following the
12017      * paradigm of this example, where the <multi-fold>s are characters which
12018      * fold to multiple character sequences:
12019      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12020      * gets effectively rewritten as:
12021      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12022      * reg() gets called (recursively) on the rewritten version, and this
12023      * function will return what it constructs.  (Actually the <multi-fold>s
12024      * aren't physically removed from the [abcdefghi], it's just that they are
12025      * ignored in the recursion by means of a flag:
12026      * <RExC_in_multi_char_class>.)
12027      *
12028      * ANYOF nodes contain a bit map for the first 256 characters, with the
12029      * corresponding bit set if that character is in the list.  For characters
12030      * above 255, a range list or swash is used.  There are extra bits for \w,
12031      * etc. in locale ANYOFs, as what these match is not determinable at
12032      * compile time
12033      *
12034      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12035      * to be restarted.  This can only happen if ret_invlist is non-NULL.
12036      */
12037
12038     dVAR;
12039     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12040     IV range = 0;
12041     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12042     regnode *ret;
12043     STRLEN numlen;
12044     IV namedclass = OOB_NAMEDCLASS;
12045     char *rangebegin = NULL;
12046     bool need_class = 0;
12047     SV *listsv = NULL;
12048     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12049                                       than just initialized.  */
12050     SV* properties = NULL;    /* Code points that match \p{} \P{} */
12051     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12052                                extended beyond the Latin1 range */
12053     UV element_count = 0;   /* Number of distinct elements in the class.
12054                                Optimizations may be possible if this is tiny */
12055     AV * multi_char_matches = NULL; /* Code points that fold to more than one
12056                                        character; used under /i */
12057     UV n;
12058     char * stop_ptr = RExC_end;    /* where to stop parsing */
12059     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12060                                                    space? */
12061     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12062
12063     /* Unicode properties are stored in a swash; this holds the current one
12064      * being parsed.  If this swash is the only above-latin1 component of the
12065      * character class, an optimization is to pass it directly on to the
12066      * execution engine.  Otherwise, it is set to NULL to indicate that there
12067      * are other things in the class that have to be dealt with at execution
12068      * time */
12069     SV* swash = NULL;           /* Code points that match \p{} \P{} */
12070
12071     /* Set if a component of this character class is user-defined; just passed
12072      * on to the engine */
12073     bool has_user_defined_property = FALSE;
12074
12075     /* inversion list of code points this node matches only when the target
12076      * string is in UTF-8.  (Because is under /d) */
12077     SV* depends_list = NULL;
12078
12079     /* inversion list of code points this node matches.  For much of the
12080      * function, it includes only those that match regardless of the utf8ness
12081      * of the target string */
12082     SV* cp_list = NULL;
12083
12084 #ifdef EBCDIC
12085     /* In a range, counts how many 0-2 of the ends of it came from literals,
12086      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12087     UV literal_endpoint = 0;
12088 #endif
12089     bool invert = FALSE;    /* Is this class to be complemented */
12090
12091     /* Is there any thing like \W or [:^digit:] that matches above the legal
12092      * Unicode range? */
12093     bool runtime_posix_matches_above_Unicode = FALSE;
12094
12095     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12096         case we need to change the emitted regop to an EXACT. */
12097     const char * orig_parse = RExC_parse;
12098     const I32 orig_size = RExC_size;
12099     GET_RE_DEBUG_FLAGS_DECL;
12100
12101     PERL_ARGS_ASSERT_REGCLASS;
12102 #ifndef DEBUGGING
12103     PERL_UNUSED_ARG(depth);
12104 #endif
12105
12106     DEBUG_PARSE("clas");
12107
12108     /* Assume we are going to generate an ANYOF node. */
12109     ret = reganode(pRExC_state, ANYOF, 0);
12110
12111     if (SIZE_ONLY) {
12112         RExC_size += ANYOF_SKIP;
12113         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12114     }
12115     else {
12116         ANYOF_FLAGS(ret) = 0;
12117
12118         RExC_emit += ANYOF_SKIP;
12119         if (LOC) {
12120             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12121         }
12122         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12123         initial_listsv_len = SvCUR(listsv);
12124         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12125     }
12126
12127     if (skip_white) {
12128         RExC_parse = regpatws(pRExC_state, RExC_parse,
12129                               FALSE /* means don't recognize comments */);
12130     }
12131
12132     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
12133         RExC_parse++;
12134         invert = TRUE;
12135         allow_multi_folds = FALSE;
12136         RExC_naughty++;
12137         if (skip_white) {
12138             RExC_parse = regpatws(pRExC_state, RExC_parse,
12139                                   FALSE /* means don't recognize comments */);
12140         }
12141     }
12142
12143     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12144     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12145         const char *s = RExC_parse;
12146         const char  c = *s++;
12147
12148         while (isWORDCHAR(*s))
12149             s++;
12150         if (*s && c == *s && s[1] == ']') {
12151             SAVEFREESV(RExC_rx_sv);
12152             ckWARN3reg(s+2,
12153                        "POSIX syntax [%c %c] belongs inside character classes",
12154                        c, c);
12155             (void)ReREFCNT_inc(RExC_rx_sv);
12156         }
12157     }
12158
12159     /* If the caller wants us to just parse a single element, accomplish this
12160      * by faking the loop ending condition */
12161     if (stop_at_1 && RExC_end > RExC_parse) {
12162         stop_ptr = RExC_parse + 1;
12163     }
12164
12165     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12166     if (UCHARAT(RExC_parse) == ']')
12167         goto charclassloop;
12168
12169 parseit:
12170     while (1) {
12171         if  (RExC_parse >= stop_ptr) {
12172             break;
12173         }
12174
12175         if (skip_white) {
12176             RExC_parse = regpatws(pRExC_state, RExC_parse,
12177                                   FALSE /* means don't recognize comments */);
12178         }
12179
12180         if  (UCHARAT(RExC_parse) == ']') {
12181             break;
12182         }
12183
12184     charclassloop:
12185
12186         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12187         save_value = value;
12188         save_prevvalue = prevvalue;
12189
12190         if (!range) {
12191             rangebegin = RExC_parse;
12192             element_count++;
12193         }
12194         if (UTF) {
12195             value = utf8n_to_uvchr((U8*)RExC_parse,
12196                                    RExC_end - RExC_parse,
12197                                    &numlen, UTF8_ALLOW_DEFAULT);
12198             RExC_parse += numlen;
12199         }
12200         else
12201             value = UCHARAT(RExC_parse++);
12202
12203         if (value == '['
12204             && RExC_parse < RExC_end
12205             && POSIXCC(UCHARAT(RExC_parse)))
12206         {
12207             namedclass = regpposixcc(pRExC_state, value, strict);
12208         }
12209         else if (value == '\\') {
12210             if (UTF) {
12211                 value = utf8n_to_uvchr((U8*)RExC_parse,
12212                                    RExC_end - RExC_parse,
12213                                    &numlen, UTF8_ALLOW_DEFAULT);
12214                 RExC_parse += numlen;
12215             }
12216             else
12217                 value = UCHARAT(RExC_parse++);
12218
12219             /* Some compilers cannot handle switching on 64-bit integer
12220              * values, therefore value cannot be an UV.  Yes, this will
12221              * be a problem later if we want switch on Unicode.
12222              * A similar issue a little bit later when switching on
12223              * namedclass. --jhi */
12224
12225             /* If the \ is escaping white space when white space is being
12226              * skipped, it means that that white space is wanted literally, and
12227              * is already in 'value'.  Otherwise, need to translate the escape
12228              * into what it signifies. */
12229             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12230
12231             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
12232             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
12233             case 's':   namedclass = ANYOF_SPACE;       break;
12234             case 'S':   namedclass = ANYOF_NSPACE;      break;
12235             case 'd':   namedclass = ANYOF_DIGIT;       break;
12236             case 'D':   namedclass = ANYOF_NDIGIT;      break;
12237             case 'v':   namedclass = ANYOF_VERTWS;      break;
12238             case 'V':   namedclass = ANYOF_NVERTWS;     break;
12239             case 'h':   namedclass = ANYOF_HORIZWS;     break;
12240             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
12241             case 'N':  /* Handle \N{NAME} in class */
12242                 {
12243                     /* We only pay attention to the first char of
12244                     multichar strings being returned. I kinda wonder
12245                     if this makes sense as it does change the behaviour
12246                     from earlier versions, OTOH that behaviour was broken
12247                     as well. */
12248                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12249                                       TRUE, /* => charclass */
12250                                       strict))
12251                     {
12252                         if (*flagp & RESTART_UTF8)
12253                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
12254                         goto parseit;
12255                     }
12256                 }
12257                 break;
12258             case 'p':
12259             case 'P':
12260                 {
12261                 char *e;
12262
12263                 /* We will handle any undefined properties ourselves */
12264                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12265
12266                 if (RExC_parse >= RExC_end)
12267                     vFAIL2("Empty \\%c{}", (U8)value);
12268                 if (*RExC_parse == '{') {
12269                     const U8 c = (U8)value;
12270                     e = strchr(RExC_parse++, '}');
12271                     if (!e)
12272                         vFAIL2("Missing right brace on \\%c{}", c);
12273                     while (isSPACE(UCHARAT(RExC_parse)))
12274                         RExC_parse++;
12275                     if (e == RExC_parse)
12276                         vFAIL2("Empty \\%c{}", c);
12277                     n = e - RExC_parse;
12278                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12279                         n--;
12280                 }
12281                 else {
12282                     e = RExC_parse;
12283                     n = 1;
12284                 }
12285                 if (!SIZE_ONLY) {
12286                     SV* invlist;
12287                     char* name;
12288
12289                     if (UCHARAT(RExC_parse) == '^') {
12290                          RExC_parse++;
12291                          n--;
12292                          /* toggle.  (The rhs xor gets the single bit that
12293                           * differs between P and p; the other xor inverts just
12294                           * that bit) */
12295                          value ^= 'P' ^ 'p';
12296
12297                          while (isSPACE(UCHARAT(RExC_parse))) {
12298                               RExC_parse++;
12299                               n--;
12300                          }
12301                     }
12302                     /* Try to get the definition of the property into
12303                      * <invlist>.  If /i is in effect, the effective property
12304                      * will have its name be <__NAME_i>.  The design is
12305                      * discussed in commit
12306                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12307                     Newx(name, n + sizeof("_i__\n"), char);
12308
12309                     sprintf(name, "%s%.*s%s\n",
12310                                     (FOLD) ? "__" : "",
12311                                     (int)n,
12312                                     RExC_parse,
12313                                     (FOLD) ? "_i" : ""
12314                     );
12315
12316                     /* Look up the property name, and get its swash and
12317                      * inversion list, if the property is found  */
12318                     if (swash) {
12319                         SvREFCNT_dec_NN(swash);
12320                     }
12321                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
12322                                              1, /* binary */
12323                                              0, /* not tr/// */
12324                                              NULL, /* No inversion list */
12325                                              &swash_init_flags
12326                                             );
12327                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12328                         if (swash) {
12329                             SvREFCNT_dec_NN(swash);
12330                             swash = NULL;
12331                         }
12332
12333                         /* Here didn't find it.  It could be a user-defined
12334                          * property that will be available at run-time.  If we
12335                          * accept only compile-time properties, is an error;
12336                          * otherwise add it to the list for run-time look up */
12337                         if (ret_invlist) {
12338                             RExC_parse = e + 1;
12339                             vFAIL3("Property '%.*s' is unknown", (int) n, name);
12340                         }
12341                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12342                                         (value == 'p' ? '+' : '!'),
12343                                         name);
12344                         has_user_defined_property = TRUE;
12345
12346                         /* We don't know yet, so have to assume that the
12347                          * property could match something in the Latin1 range,
12348                          * hence something that isn't utf8.  Note that this
12349                          * would cause things in <depends_list> to match
12350                          * inappropriately, except that any \p{}, including
12351                          * this one forces Unicode semantics, which means there
12352                          * is <no depends_list> */
12353                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12354                     }
12355                     else {
12356
12357                         /* Here, did get the swash and its inversion list.  If
12358                          * the swash is from a user-defined property, then this
12359                          * whole character class should be regarded as such */
12360                         has_user_defined_property =
12361                                     (swash_init_flags
12362                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12363
12364                         /* Invert if asking for the complement */
12365                         if (value == 'P') {
12366                             _invlist_union_complement_2nd(properties,
12367                                                           invlist,
12368                                                           &properties);
12369
12370                             /* The swash can't be used as-is, because we've
12371                              * inverted things; delay removing it to here after
12372                              * have copied its invlist above */
12373                             SvREFCNT_dec_NN(swash);
12374                             swash = NULL;
12375                         }
12376                         else {
12377                             _invlist_union(properties, invlist, &properties);
12378                         }
12379                     }
12380                     Safefree(name);
12381                 }
12382                 RExC_parse = e + 1;
12383                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12384                                                 named */
12385
12386                 /* \p means they want Unicode semantics */
12387                 RExC_uni_semantics = 1;
12388                 }
12389                 break;
12390             case 'n':   value = '\n';                   break;
12391             case 'r':   value = '\r';                   break;
12392             case 't':   value = '\t';                   break;
12393             case 'f':   value = '\f';                   break;
12394             case 'b':   value = '\b';                   break;
12395             case 'e':   value = ASCII_TO_NATIVE('\033');break;
12396             case 'a':   value = ASCII_TO_NATIVE('\007');break;
12397             case 'o':
12398                 RExC_parse--;   /* function expects to be pointed at the 'o' */
12399                 {
12400                     const char* error_msg;
12401                     bool valid = grok_bslash_o(&RExC_parse,
12402                                                &value,
12403                                                &error_msg,
12404                                                SIZE_ONLY,   /* warnings in pass
12405                                                                1 only */
12406                                                strict,
12407                                                silence_non_portable,
12408                                                UTF);
12409                     if (! valid) {
12410                         vFAIL(error_msg);
12411                     }
12412                 }
12413                 if (PL_encoding && value < 0x100) {
12414                     goto recode_encoding;
12415                 }
12416                 break;
12417             case 'x':
12418                 RExC_parse--;   /* function expects to be pointed at the 'x' */
12419                 {
12420                     const char* error_msg;
12421                     bool valid = grok_bslash_x(&RExC_parse,
12422                                                &value,
12423                                                &error_msg,
12424                                                TRUE, /* Output warnings */
12425                                                strict,
12426                                                silence_non_portable,
12427                                                UTF);
12428                     if (! valid) {
12429                         vFAIL(error_msg);
12430                     }
12431                 }
12432                 if (PL_encoding && value < 0x100)
12433                     goto recode_encoding;
12434                 break;
12435             case 'c':
12436                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12437                 break;
12438             case '0': case '1': case '2': case '3': case '4':
12439             case '5': case '6': case '7':
12440                 {
12441                     /* Take 1-3 octal digits */
12442                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12443                     numlen = (strict) ? 4 : 3;
12444                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12445                     RExC_parse += numlen;
12446                     if (numlen != 3) {
12447                         if (strict) {
12448                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12449                             vFAIL("Need exactly 3 octal digits");
12450                         }
12451                         else if (! SIZE_ONLY /* like \08, \178 */
12452                                  && numlen < 3
12453                                  && RExC_parse < RExC_end
12454                                  && isDIGIT(*RExC_parse)
12455                                  && ckWARN(WARN_REGEXP))
12456                         {
12457                             SAVEFREESV(RExC_rx_sv);
12458                             reg_warn_non_literal_string(
12459                                  RExC_parse + 1,
12460                                  form_short_octal_warning(RExC_parse, numlen));
12461                             (void)ReREFCNT_inc(RExC_rx_sv);
12462                         }
12463                     }
12464                     if (PL_encoding && value < 0x100)
12465                         goto recode_encoding;
12466                     break;
12467                 }
12468             recode_encoding:
12469                 if (! RExC_override_recoding) {
12470                     SV* enc = PL_encoding;
12471                     value = reg_recode((const char)(U8)value, &enc);
12472                     if (!enc) {
12473                         if (strict) {
12474                             vFAIL("Invalid escape in the specified encoding");
12475                         }
12476                         else if (SIZE_ONLY) {
12477                             ckWARNreg(RExC_parse,
12478                                   "Invalid escape in the specified encoding");
12479                         }
12480                     }
12481                     break;
12482                 }
12483             default:
12484                 /* Allow \_ to not give an error */
12485                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12486                     if (strict) {
12487                         vFAIL2("Unrecognized escape \\%c in character class",
12488                                (int)value);
12489                     }
12490                     else {
12491                         SAVEFREESV(RExC_rx_sv);
12492                         ckWARN2reg(RExC_parse,
12493                             "Unrecognized escape \\%c in character class passed through",
12494                             (int)value);
12495                         (void)ReREFCNT_inc(RExC_rx_sv);
12496                     }
12497                 }
12498                 break;
12499             }   /* End of switch on char following backslash */
12500         } /* end of handling backslash escape sequences */
12501 #ifdef EBCDIC
12502         else
12503             literal_endpoint++;
12504 #endif
12505
12506         /* Here, we have the current token in 'value' */
12507
12508         /* What matches in a locale is not known until runtime.  This includes
12509          * what the Posix classes (like \w, [:space:]) match.  Room must be
12510          * reserved (one time per class) to store such classes, either if Perl
12511          * is compiled so that locale nodes always should have this space, or
12512          * if there is such class info to be stored.  The space will contain a
12513          * bit for each named class that is to be matched against.  This isn't
12514          * needed for \p{} and pseudo-classes, as they are not affected by
12515          * locale, and hence are dealt with separately */
12516         if (LOC
12517             && ! need_class
12518             && (ANYOF_LOCALE == ANYOF_CLASS
12519                 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12520         {
12521             need_class = 1;
12522             if (SIZE_ONLY) {
12523                 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12524             }
12525             else {
12526                 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12527                 ANYOF_CLASS_ZERO(ret);
12528             }
12529             ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12530         }
12531
12532         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12533
12534             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12535              * literal, as is the character that began the false range, i.e.
12536              * the 'a' in the examples */
12537             if (range) {
12538                 if (!SIZE_ONLY) {
12539                     const int w = (RExC_parse >= rangebegin)
12540                                   ? RExC_parse - rangebegin
12541                                   : 0;
12542                     if (strict) {
12543                         vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12544                     }
12545                     else {
12546                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12547                         ckWARN4reg(RExC_parse,
12548                                 "False [] range \"%*.*s\"",
12549                                 w, w, rangebegin);
12550                         (void)ReREFCNT_inc(RExC_rx_sv);
12551                         cp_list = add_cp_to_invlist(cp_list, '-');
12552                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
12553                     }
12554                 }
12555
12556                 range = 0; /* this was not a true range */
12557                 element_count += 2; /* So counts for three values */
12558             }
12559
12560             if (! SIZE_ONLY) {
12561                 U8 classnum = namedclass_to_classnum(namedclass);
12562                 if (namedclass >= ANYOF_MAX) {  /* If a special class */
12563                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12564
12565                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12566                          * /l make a difference in what these match.  There
12567                          * would be problems if these characters had folds
12568                          * other than themselves, as cp_list is subject to
12569                          * folding. */
12570                         if (classnum != _CC_VERTSPACE) {
12571                             assert(   namedclass == ANYOF_HORIZWS
12572                                    || namedclass == ANYOF_NHORIZWS);
12573
12574                             /* It turns out that \h is just a synonym for
12575                              * XPosixBlank */
12576                             classnum = _CC_BLANK;
12577                         }
12578
12579                         _invlist_union_maybe_complement_2nd(
12580                                 cp_list,
12581                                 PL_XPosix_ptrs[classnum],
12582                                 cBOOL(namedclass % 2), /* Complement if odd
12583                                                           (NHORIZWS, NVERTWS)
12584                                                         */
12585                                 &cp_list);
12586                     }
12587                 }
12588                 else if (classnum == _CC_ASCII) {
12589 #ifdef HAS_ISASCII
12590                     if (LOC) {
12591                         ANYOF_CLASS_SET(ret, namedclass);
12592                     }
12593                     else
12594 #endif  /* Not isascii(); just use the hard-coded definition for it */
12595                         _invlist_union_maybe_complement_2nd(
12596                                 posixes,
12597                                 PL_ASCII,
12598                                 cBOOL(namedclass % 2), /* Complement if odd
12599                                                           (NASCII) */
12600                                 &posixes);
12601                 }
12602                 else {  /* Garden variety class */
12603
12604                     /* The ascii range inversion list */
12605                     SV* ascii_source = PL_Posix_ptrs[classnum];
12606
12607                     /* The full Latin1 range inversion list */
12608                     SV* l1_source = PL_L1Posix_ptrs[classnum];
12609
12610                     /* This code is structured into two major clauses.  The
12611                      * first is for classes whose complete definitions may not
12612                      * already be known.  It not, the Latin1 definition
12613                      * (guaranteed to already known) is used plus code is
12614                      * generated to load the rest at run-time (only if needed).
12615                      * If the complete definition is known, it drops down to
12616                      * the second clause, where the complete definition is
12617                      * known */
12618
12619                     if (classnum < _FIRST_NON_SWASH_CC) {
12620
12621                         /* Here, the class has a swash, which may or not
12622                          * already be loaded */
12623
12624                         /* The name of the property to use to match the full
12625                          * eXtended Unicode range swash for this character
12626                          * class */
12627                         const char *Xname = swash_property_names[classnum];
12628
12629                         /* If returning the inversion list, we can't defer
12630                          * getting this until runtime */
12631                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12632                             PL_utf8_swash_ptrs[classnum] =
12633                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
12634                                              1, /* binary */
12635                                              0, /* not tr/// */
12636                                              NULL, /* No inversion list */
12637                                              NULL  /* No flags */
12638                                             );
12639                             assert(PL_utf8_swash_ptrs[classnum]);
12640                         }
12641                         if ( !  PL_utf8_swash_ptrs[classnum]) {
12642                             if (namedclass % 2 == 0) { /* A non-complemented
12643                                                           class */
12644                                 /* If not /a matching, there are code points we
12645                                  * don't know at compile time.  Arrange for the
12646                                  * unknown matches to be loaded at run-time, if
12647                                  * needed */
12648                                 if (! AT_LEAST_ASCII_RESTRICTED) {
12649                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12650                                                                  Xname);
12651                                 }
12652                                 if (LOC) {  /* Under locale, set run-time
12653                                                lookup */
12654                                     ANYOF_CLASS_SET(ret, namedclass);
12655                                 }
12656                                 else {
12657                                     /* Add the current class's code points to
12658                                      * the running total */
12659                                     _invlist_union(posixes,
12660                                                    (AT_LEAST_ASCII_RESTRICTED)
12661                                                         ? ascii_source
12662                                                         : l1_source,
12663                                                    &posixes);
12664                                 }
12665                             }
12666                             else {  /* A complemented class */
12667                                 if (AT_LEAST_ASCII_RESTRICTED) {
12668                                     /* Under /a should match everything above
12669                                      * ASCII, plus the complement of the set's
12670                                      * ASCII matches */
12671                                     _invlist_union_complement_2nd(posixes,
12672                                                                   ascii_source,
12673                                                                   &posixes);
12674                                 }
12675                                 else {
12676                                     /* Arrange for the unknown matches to be
12677                                      * loaded at run-time, if needed */
12678                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12679                                                                  Xname);
12680                                     runtime_posix_matches_above_Unicode = TRUE;
12681                                     if (LOC) {
12682                                         ANYOF_CLASS_SET(ret, namedclass);
12683                                     }
12684                                     else {
12685
12686                                         /* We want to match everything in
12687                                          * Latin1, except those things that
12688                                          * l1_source matches */
12689                                         SV* scratch_list = NULL;
12690                                         _invlist_subtract(PL_Latin1, l1_source,
12691                                                           &scratch_list);
12692
12693                                         /* Add the list from this class to the
12694                                          * running total */
12695                                         if (! posixes) {
12696                                             posixes = scratch_list;
12697                                         }
12698                                         else {
12699                                             _invlist_union(posixes,
12700                                                            scratch_list,
12701                                                            &posixes);
12702                                             SvREFCNT_dec_NN(scratch_list);
12703                                         }
12704                                         if (DEPENDS_SEMANTICS) {
12705                                             ANYOF_FLAGS(ret)
12706                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
12707                                         }
12708                                     }
12709                                 }
12710                             }
12711                             goto namedclass_done;
12712                         }
12713
12714                         /* Here, there is a swash loaded for the class.  If no
12715                          * inversion list for it yet, get it */
12716                         if (! PL_XPosix_ptrs[classnum]) {
12717                             PL_XPosix_ptrs[classnum]
12718                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12719                         }
12720                     }
12721
12722                     /* Here there is an inversion list already loaded for the
12723                      * entire class */
12724
12725                     if (namedclass % 2 == 0) {  /* A non-complemented class,
12726                                                    like ANYOF_PUNCT */
12727                         if (! LOC) {
12728                             /* For non-locale, just add it to any existing list
12729                              * */
12730                             _invlist_union(posixes,
12731                                            (AT_LEAST_ASCII_RESTRICTED)
12732                                                ? ascii_source
12733                                                : PL_XPosix_ptrs[classnum],
12734                                            &posixes);
12735                         }
12736                         else {  /* Locale */
12737                             SV* scratch_list = NULL;
12738
12739                             /* For above Latin1 code points, we use the full
12740                              * Unicode range */
12741                             _invlist_intersection(PL_AboveLatin1,
12742                                                   PL_XPosix_ptrs[classnum],
12743                                                   &scratch_list);
12744                             /* And set the output to it, adding instead if
12745                              * there already is an output.  Checking if
12746                              * 'posixes' is NULL first saves an extra clone.
12747                              * Its reference count will be decremented at the
12748                              * next union, etc, or if this is the only
12749                              * instance, at the end of the routine */
12750                             if (! posixes) {
12751                                 posixes = scratch_list;
12752                             }
12753                             else {
12754                                 _invlist_union(posixes, scratch_list, &posixes);
12755                                 SvREFCNT_dec_NN(scratch_list);
12756                             }
12757
12758 #ifndef HAS_ISBLANK
12759                             if (namedclass != ANYOF_BLANK) {
12760 #endif
12761                                 /* Set this class in the node for runtime
12762                                  * matching */
12763                                 ANYOF_CLASS_SET(ret, namedclass);
12764 #ifndef HAS_ISBLANK
12765                             }
12766                             else {
12767                                 /* No isblank(), use the hard-coded ASCII-range
12768                                  * blanks, adding them to the running total. */
12769
12770                                 _invlist_union(posixes, ascii_source, &posixes);
12771                             }
12772 #endif
12773                         }
12774                     }
12775                     else {  /* A complemented class, like ANYOF_NPUNCT */
12776                         if (! LOC) {
12777                             _invlist_union_complement_2nd(
12778                                                 posixes,
12779                                                 (AT_LEAST_ASCII_RESTRICTED)
12780                                                     ? ascii_source
12781                                                     : PL_XPosix_ptrs[classnum],
12782                                                 &posixes);
12783                             /* Under /d, everything in the upper half of the
12784                              * Latin1 range matches this complement */
12785                             if (DEPENDS_SEMANTICS) {
12786                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12787                             }
12788                         }
12789                         else {  /* Locale */
12790                             SV* scratch_list = NULL;
12791                             _invlist_subtract(PL_AboveLatin1,
12792                                               PL_XPosix_ptrs[classnum],
12793                                               &scratch_list);
12794                             if (! posixes) {
12795                                 posixes = scratch_list;
12796                             }
12797                             else {
12798                                 _invlist_union(posixes, scratch_list, &posixes);
12799                                 SvREFCNT_dec_NN(scratch_list);
12800                             }
12801 #ifndef HAS_ISBLANK
12802                             if (namedclass != ANYOF_NBLANK) {
12803 #endif
12804                                 ANYOF_CLASS_SET(ret, namedclass);
12805 #ifndef HAS_ISBLANK
12806                             }
12807                             else {
12808                                 /* Get the list of all code points in Latin1
12809                                  * that are not ASCII blanks, and add them to
12810                                  * the running total */
12811                                 _invlist_subtract(PL_Latin1, ascii_source,
12812                                                   &scratch_list);
12813                                 _invlist_union(posixes, scratch_list, &posixes);
12814                                 SvREFCNT_dec_NN(scratch_list);
12815                             }
12816 #endif
12817                         }
12818                     }
12819                 }
12820               namedclass_done:
12821                 continue;   /* Go get next character */
12822             }
12823         } /* end of namedclass \blah */
12824
12825         /* Here, we have a single value.  If 'range' is set, it is the ending
12826          * of a range--check its validity.  Later, we will handle each
12827          * individual code point in the range.  If 'range' isn't set, this
12828          * could be the beginning of a range, so check for that by looking
12829          * ahead to see if the next real character to be processed is the range
12830          * indicator--the minus sign */
12831
12832         if (skip_white) {
12833             RExC_parse = regpatws(pRExC_state, RExC_parse,
12834                                 FALSE /* means don't recognize comments */);
12835         }
12836
12837         if (range) {
12838             if (prevvalue > value) /* b-a */ {
12839                 const int w = RExC_parse - rangebegin;
12840                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12841                 range = 0; /* not a valid range */
12842             }
12843         }
12844         else {
12845             prevvalue = value; /* save the beginning of the potential range */
12846             if (! stop_at_1     /* Can't be a range if parsing just one thing */
12847                 && *RExC_parse == '-')
12848             {
12849                 char* next_char_ptr = RExC_parse + 1;
12850                 if (skip_white) {   /* Get the next real char after the '-' */
12851                     next_char_ptr = regpatws(pRExC_state,
12852                                              RExC_parse + 1,
12853                                              FALSE); /* means don't recognize
12854                                                         comments */
12855                 }
12856
12857                 /* If the '-' is at the end of the class (just before the ']',
12858                  * it is a literal minus; otherwise it is a range */
12859                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12860                     RExC_parse = next_char_ptr;
12861
12862                     /* a bad range like \w-, [:word:]- ? */
12863                     if (namedclass > OOB_NAMEDCLASS) {
12864                         if (strict || ckWARN(WARN_REGEXP)) {
12865                             const int w =
12866                                 RExC_parse >= rangebegin ?
12867                                 RExC_parse - rangebegin : 0;
12868                             if (strict) {
12869                                 vFAIL4("False [] range \"%*.*s\"",
12870                                     w, w, rangebegin);
12871                             }
12872                             else {
12873                                 vWARN4(RExC_parse,
12874                                     "False [] range \"%*.*s\"",
12875                                     w, w, rangebegin);
12876                             }
12877                         }
12878                         if (!SIZE_ONLY) {
12879                             cp_list = add_cp_to_invlist(cp_list, '-');
12880                         }
12881                         element_count++;
12882                     } else
12883                         range = 1;      /* yeah, it's a range! */
12884                     continue;   /* but do it the next time */
12885                 }
12886             }
12887         }
12888
12889         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12890          * if not */
12891
12892         /* non-Latin1 code point implies unicode semantics.  Must be set in
12893          * pass1 so is there for the whole of pass 2 */
12894         if (value > 255) {
12895             RExC_uni_semantics = 1;
12896         }
12897
12898         /* Ready to process either the single value, or the completed range.
12899          * For single-valued non-inverted ranges, we consider the possibility
12900          * of multi-char folds.  (We made a conscious decision to not do this
12901          * for the other cases because it can often lead to non-intuitive
12902          * results.  For example, you have the peculiar case that:
12903          *  "s s" =~ /^[^\xDF]+$/i => Y
12904          *  "ss"  =~ /^[^\xDF]+$/i => N
12905          *
12906          * See [perl #89750] */
12907         if (FOLD && allow_multi_folds && value == prevvalue) {
12908             if (value == LATIN_SMALL_LETTER_SHARP_S
12909                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12910                                                         value)))
12911             {
12912                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12913
12914                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12915                 STRLEN foldlen;
12916
12917                 UV folded = _to_uni_fold_flags(
12918                                 value,
12919                                 foldbuf,
12920                                 &foldlen,
12921                                 FOLD_FLAGS_FULL
12922                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12923                                             : (ASCII_FOLD_RESTRICTED)
12924                                               ? FOLD_FLAGS_NOMIX_ASCII
12925                                               : 0)
12926                                 );
12927
12928                 /* Here, <folded> should be the first character of the
12929                  * multi-char fold of <value>, with <foldbuf> containing the
12930                  * whole thing.  But, if this fold is not allowed (because of
12931                  * the flags), <fold> will be the same as <value>, and should
12932                  * be processed like any other character, so skip the special
12933                  * handling */
12934                 if (folded != value) {
12935
12936                     /* Skip if we are recursed, currently parsing the class
12937                      * again.  Otherwise add this character to the list of
12938                      * multi-char folds. */
12939                     if (! RExC_in_multi_char_class) {
12940                         AV** this_array_ptr;
12941                         AV* this_array;
12942                         STRLEN cp_count = utf8_length(foldbuf,
12943                                                       foldbuf + foldlen);
12944                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12945
12946                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12947
12948
12949                         if (! multi_char_matches) {
12950                             multi_char_matches = newAV();
12951                         }
12952
12953                         /* <multi_char_matches> is actually an array of arrays.
12954                          * There will be one or two top-level elements: [2],
12955                          * and/or [3].  The [2] element is an array, each
12956                          * element thereof is a character which folds to two
12957                          * characters; likewise for [3].  (Unicode guarantees a
12958                          * maximum of 3 characters in any fold.)  When we
12959                          * rewrite the character class below, we will do so
12960                          * such that the longest folds are written first, so
12961                          * that it prefers the longest matching strings first.
12962                          * This is done even if it turns out that any
12963                          * quantifier is non-greedy, out of programmer
12964                          * laziness.  Tom Christiansen has agreed that this is
12965                          * ok.  This makes the test for the ligature 'ffi' come
12966                          * before the test for 'ff' */
12967                         if (av_exists(multi_char_matches, cp_count)) {
12968                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12969                                                              cp_count, FALSE);
12970                             this_array = *this_array_ptr;
12971                         }
12972                         else {
12973                             this_array = newAV();
12974                             av_store(multi_char_matches, cp_count,
12975                                      (SV*) this_array);
12976                         }
12977                         av_push(this_array, multi_fold);
12978                     }
12979
12980                     /* This element should not be processed further in this
12981                      * class */
12982                     element_count--;
12983                     value = save_value;
12984                     prevvalue = save_prevvalue;
12985                     continue;
12986                 }
12987             }
12988         }
12989
12990         /* Deal with this element of the class */
12991         if (! SIZE_ONLY) {
12992 #ifndef EBCDIC
12993             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12994 #else
12995             SV* this_range = _new_invlist(1);
12996             _append_range_to_invlist(this_range, prevvalue, value);
12997
12998             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12999              * If this range was specified using something like 'i-j', we want
13000              * to include only the 'i' and the 'j', and not anything in
13001              * between, so exclude non-ASCII, non-alphabetics from it.
13002              * However, if the range was specified with something like
13003              * [\x89-\x91] or [\x89-j], all code points within it should be
13004              * included.  literal_endpoint==2 means both ends of the range used
13005              * a literal character, not \x{foo} */
13006             if (literal_endpoint == 2
13007                 && (prevvalue >= 'a' && value <= 'z')
13008                     || (prevvalue >= 'A' && value <= 'Z'))
13009             {
13010                 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13011                                       &this_range);
13012             }
13013             _invlist_union(cp_list, this_range, &cp_list);
13014             literal_endpoint = 0;
13015 #endif
13016         }
13017
13018         range = 0; /* this range (if it was one) is done now */
13019     } /* End of loop through all the text within the brackets */
13020
13021     /* If anything in the class expands to more than one character, we have to
13022      * deal with them by building up a substitute parse string, and recursively
13023      * calling reg() on it, instead of proceeding */
13024     if (multi_char_matches) {
13025         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13026         I32 cp_count;
13027         STRLEN len;
13028         char *save_end = RExC_end;
13029         char *save_parse = RExC_parse;
13030         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13031                                        a "|" */
13032         I32 reg_flags;
13033
13034         assert(! invert);
13035 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13036            because too confusing */
13037         if (invert) {
13038             sv_catpv(substitute_parse, "(?:");
13039         }
13040 #endif
13041
13042         /* Look at the longest folds first */
13043         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13044
13045             if (av_exists(multi_char_matches, cp_count)) {
13046                 AV** this_array_ptr;
13047                 SV* this_sequence;
13048
13049                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13050                                                  cp_count, FALSE);
13051                 while ((this_sequence = av_pop(*this_array_ptr)) !=
13052                                                                 &PL_sv_undef)
13053                 {
13054                     if (! first_time) {
13055                         sv_catpv(substitute_parse, "|");
13056                     }
13057                     first_time = FALSE;
13058
13059                     sv_catpv(substitute_parse, SvPVX(this_sequence));
13060                 }
13061             }
13062         }
13063
13064         /* If the character class contains anything else besides these
13065          * multi-character folds, have to include it in recursive parsing */
13066         if (element_count) {
13067             sv_catpv(substitute_parse, "|[");
13068             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13069             sv_catpv(substitute_parse, "]");
13070         }
13071
13072         sv_catpv(substitute_parse, ")");
13073 #if 0
13074         if (invert) {
13075             /* This is a way to get the parse to skip forward a whole named
13076              * sequence instead of matching the 2nd character when it fails the
13077              * first */
13078             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13079         }
13080 #endif
13081
13082         RExC_parse = SvPV(substitute_parse, len);
13083         RExC_end = RExC_parse + len;
13084         RExC_in_multi_char_class = 1;
13085         RExC_emit = (regnode *)orig_emit;
13086
13087         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13088
13089         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13090
13091         RExC_parse = save_parse;
13092         RExC_end = save_end;
13093         RExC_in_multi_char_class = 0;
13094         SvREFCNT_dec_NN(multi_char_matches);
13095         return ret;
13096     }
13097
13098     /* If the character class contains only a single element, it may be
13099      * optimizable into another node type which is smaller and runs faster.
13100      * Check if this is the case for this class */
13101     if (element_count == 1 && ! ret_invlist) {
13102         U8 op = END;
13103         U8 arg = 0;
13104
13105         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13106                                               [:digit:] or \p{foo} */
13107
13108             /* All named classes are mapped into POSIXish nodes, with its FLAG
13109              * argument giving which class it is */
13110             switch ((I32)namedclass) {
13111                 case ANYOF_UNIPROP:
13112                     break;
13113
13114                 /* These don't depend on the charset modifiers.  They always
13115                  * match under /u rules */
13116                 case ANYOF_NHORIZWS:
13117                 case ANYOF_HORIZWS:
13118                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13119                     /* FALLTHROUGH */
13120
13121                 case ANYOF_NVERTWS:
13122                 case ANYOF_VERTWS:
13123                     op = POSIXU;
13124                     goto join_posix;
13125
13126                 /* The actual POSIXish node for all the rest depends on the
13127                  * charset modifier.  The ones in the first set depend only on
13128                  * ASCII or, if available on this platform, locale */
13129                 case ANYOF_ASCII:
13130                 case ANYOF_NASCII:
13131 #ifdef HAS_ISASCII
13132                     op = (LOC) ? POSIXL : POSIXA;
13133 #else
13134                     op = POSIXA;
13135 #endif
13136                     goto join_posix;
13137
13138                 case ANYOF_NCASED:
13139                 case ANYOF_LOWER:
13140                 case ANYOF_NLOWER:
13141                 case ANYOF_UPPER:
13142                 case ANYOF_NUPPER:
13143                     /* under /a could be alpha */
13144                     if (FOLD) {
13145                         if (ASCII_RESTRICTED) {
13146                             namedclass = ANYOF_ALPHA + (namedclass % 2);
13147                         }
13148                         else if (! LOC) {
13149                             break;
13150                         }
13151                     }
13152                     /* FALLTHROUGH */
13153
13154                 /* The rest have more possibilities depending on the charset.
13155                  * We take advantage of the enum ordering of the charset
13156                  * modifiers to get the exact node type, */
13157                 default:
13158                     op = POSIXD + get_regex_charset(RExC_flags);
13159                     if (op > POSIXA) { /* /aa is same as /a */
13160                         op = POSIXA;
13161                     }
13162 #ifndef HAS_ISBLANK
13163                     if (op == POSIXL
13164                         && (namedclass == ANYOF_BLANK
13165                             || namedclass == ANYOF_NBLANK))
13166                     {
13167                         op = POSIXA;
13168                     }
13169 #endif
13170
13171                 join_posix:
13172                     /* The odd numbered ones are the complements of the
13173                      * next-lower even number one */
13174                     if (namedclass % 2 == 1) {
13175                         invert = ! invert;
13176                         namedclass--;
13177                     }
13178                     arg = namedclass_to_classnum(namedclass);
13179                     break;
13180             }
13181         }
13182         else if (value == prevvalue) {
13183
13184             /* Here, the class consists of just a single code point */
13185
13186             if (invert) {
13187                 if (! LOC && value == '\n') {
13188                     op = REG_ANY; /* Optimize [^\n] */
13189                     *flagp |= HASWIDTH|SIMPLE;
13190                     RExC_naughty++;
13191                 }
13192             }
13193             else if (value < 256 || UTF) {
13194
13195                 /* Optimize a single value into an EXACTish node, but not if it
13196                  * would require converting the pattern to UTF-8. */
13197                 op = compute_EXACTish(pRExC_state);
13198             }
13199         } /* Otherwise is a range */
13200         else if (! LOC) {   /* locale could vary these */
13201             if (prevvalue == '0') {
13202                 if (value == '9') {
13203                     arg = _CC_DIGIT;
13204                     op = POSIXA;
13205                 }
13206             }
13207         }
13208
13209         /* Here, we have changed <op> away from its initial value iff we found
13210          * an optimization */
13211         if (op != END) {
13212
13213             /* Throw away this ANYOF regnode, and emit the calculated one,
13214              * which should correspond to the beginning, not current, state of
13215              * the parse */
13216             const char * cur_parse = RExC_parse;
13217             RExC_parse = (char *)orig_parse;
13218             if ( SIZE_ONLY) {
13219                 if (! LOC) {
13220
13221                     /* To get locale nodes to not use the full ANYOF size would
13222                      * require moving the code above that writes the portions
13223                      * of it that aren't in other nodes to after this point.
13224                      * e.g.  ANYOF_CLASS_SET */
13225                     RExC_size = orig_size;
13226                 }
13227             }
13228             else {
13229                 RExC_emit = (regnode *)orig_emit;
13230                 if (PL_regkind[op] == POSIXD) {
13231                     if (invert) {
13232                         op += NPOSIXD - POSIXD;
13233                     }
13234                 }
13235             }
13236
13237             ret = reg_node(pRExC_state, op);
13238
13239             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13240                 if (! SIZE_ONLY) {
13241                     FLAGS(ret) = arg;
13242                 }
13243                 *flagp |= HASWIDTH|SIMPLE;
13244             }
13245             else if (PL_regkind[op] == EXACT) {
13246                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13247             }
13248
13249             RExC_parse = (char *) cur_parse;
13250
13251             SvREFCNT_dec(posixes);
13252             SvREFCNT_dec(cp_list);
13253             return ret;
13254         }
13255     }
13256
13257     if (SIZE_ONLY)
13258         return ret;
13259     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13260
13261     /* If folding, we calculate all characters that could fold to or from the
13262      * ones already on the list */
13263     if (FOLD && cp_list) {
13264         UV start, end;  /* End points of code point ranges */
13265
13266         SV* fold_intersection = NULL;
13267
13268         /* If the highest code point is within Latin1, we can use the
13269          * compiled-in Alphas list, and not have to go out to disk.  This
13270          * yields two false positives, the masculine and feminine ordinal
13271          * indicators, which are weeded out below using the
13272          * IS_IN_SOME_FOLD_L1() macro */
13273         if (invlist_highest(cp_list) < 256) {
13274             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13275                                                            &fold_intersection);
13276         }
13277         else {
13278
13279             /* Here, there are non-Latin1 code points, so we will have to go
13280              * fetch the list of all the characters that participate in folds
13281              */
13282             if (! PL_utf8_foldable) {
13283                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13284                                        &PL_sv_undef, 1, 0);
13285                 PL_utf8_foldable = _get_swash_invlist(swash);
13286                 SvREFCNT_dec_NN(swash);
13287             }
13288
13289             /* This is a hash that for a particular fold gives all characters
13290              * that are involved in it */
13291             if (! PL_utf8_foldclosures) {
13292
13293                 /* If we were unable to find any folds, then we likely won't be
13294                  * able to find the closures.  So just create an empty list.
13295                  * Folding will effectively be restricted to the non-Unicode
13296                  * rules hard-coded into Perl.  (This case happens legitimately
13297                  * during compilation of Perl itself before the Unicode tables
13298                  * are generated) */
13299                 if (_invlist_len(PL_utf8_foldable) == 0) {
13300                     PL_utf8_foldclosures = newHV();
13301                 }
13302                 else {
13303                     /* If the folds haven't been read in, call a fold function
13304                      * to force that */
13305                     if (! PL_utf8_tofold) {
13306                         U8 dummy[UTF8_MAXBYTES+1];
13307
13308                         /* This string is just a short named one above \xff */
13309                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13310                         assert(PL_utf8_tofold); /* Verify that worked */
13311                     }
13312                     PL_utf8_foldclosures =
13313                                     _swash_inversion_hash(PL_utf8_tofold);
13314                 }
13315             }
13316
13317             /* Only the characters in this class that participate in folds need
13318              * be checked.  Get the intersection of this class and all the
13319              * possible characters that are foldable.  This can quickly narrow
13320              * down a large class */
13321             _invlist_intersection(PL_utf8_foldable, cp_list,
13322                                   &fold_intersection);
13323         }
13324
13325         /* Now look at the foldable characters in this class individually */
13326         invlist_iterinit(fold_intersection);
13327         while (invlist_iternext(fold_intersection, &start, &end)) {
13328             UV j;
13329
13330             /* Locale folding for Latin1 characters is deferred until runtime */
13331             if (LOC && start < 256) {
13332                 start = 256;
13333             }
13334
13335             /* Look at every character in the range */
13336             for (j = start; j <= end; j++) {
13337
13338                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13339                 STRLEN foldlen;
13340                 SV** listp;
13341
13342                 if (j < 256) {
13343
13344                     /* We have the latin1 folding rules hard-coded here so that
13345                      * an innocent-looking character class, like /[ks]/i won't
13346                      * have to go out to disk to find the possible matches.
13347                      * XXX It would be better to generate these via regen, in
13348                      * case a new version of the Unicode standard adds new
13349                      * mappings, though that is not really likely, and may be
13350                      * caught by the default: case of the switch below. */
13351
13352                     if (IS_IN_SOME_FOLD_L1(j)) {
13353
13354                         /* ASCII is always matched; non-ASCII is matched only
13355                          * under Unicode rules */
13356                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13357                             cp_list =
13358                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13359                         }
13360                         else {
13361                             depends_list =
13362                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13363                         }
13364                     }
13365
13366                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13367                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13368                     {
13369                         /* Certain Latin1 characters have matches outside
13370                          * Latin1.  To get here, <j> is one of those
13371                          * characters.   None of these matches is valid for
13372                          * ASCII characters under /aa, which is why the 'if'
13373                          * just above excludes those.  These matches only
13374                          * happen when the target string is utf8.  The code
13375                          * below adds the single fold closures for <j> to the
13376                          * inversion list. */
13377                         switch (j) {
13378                             case 'k':
13379                             case 'K':
13380                                 cp_list =
13381                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
13382                                 break;
13383                             case 's':
13384                             case 'S':
13385                                 cp_list = add_cp_to_invlist(cp_list,
13386                                                     LATIN_SMALL_LETTER_LONG_S);
13387                                 break;
13388                             case MICRO_SIGN:
13389                                 cp_list = add_cp_to_invlist(cp_list,
13390                                                     GREEK_CAPITAL_LETTER_MU);
13391                                 cp_list = add_cp_to_invlist(cp_list,
13392                                                     GREEK_SMALL_LETTER_MU);
13393                                 break;
13394                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13395                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13396                                 cp_list =
13397                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13398                                 break;
13399                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13400                                 cp_list = add_cp_to_invlist(cp_list,
13401                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13402                                 break;
13403                             case LATIN_SMALL_LETTER_SHARP_S:
13404                                 cp_list = add_cp_to_invlist(cp_list,
13405                                                 LATIN_CAPITAL_LETTER_SHARP_S);
13406                                 break;
13407                             case 'F': case 'f':
13408                             case 'I': case 'i':
13409                             case 'L': case 'l':
13410                             case 'T': case 't':
13411                             case 'A': case 'a':
13412                             case 'H': case 'h':
13413                             case 'J': case 'j':
13414                             case 'N': case 'n':
13415                             case 'W': case 'w':
13416                             case 'Y': case 'y':
13417                                 /* These all are targets of multi-character
13418                                  * folds from code points that require UTF8 to
13419                                  * express, so they can't match unless the
13420                                  * target string is in UTF-8, so no action here
13421                                  * is necessary, as regexec.c properly handles
13422                                  * the general case for UTF-8 matching and
13423                                  * multi-char folds */
13424                                 break;
13425                             default:
13426                                 /* Use deprecated warning to increase the
13427                                  * chances of this being output */
13428                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13429                                 break;
13430                         }
13431                     }
13432                     continue;
13433                 }
13434
13435                 /* Here is an above Latin1 character.  We don't have the rules
13436                  * hard-coded for it.  First, get its fold.  This is the simple
13437                  * fold, as the multi-character folds have been handled earlier
13438                  * and separated out */
13439                 _to_uni_fold_flags(j, foldbuf, &foldlen,
13440                                                ((LOC)
13441                                                ? FOLD_FLAGS_LOCALE
13442                                                : (ASCII_FOLD_RESTRICTED)
13443                                                   ? FOLD_FLAGS_NOMIX_ASCII
13444                                                   : 0));
13445
13446                 /* Single character fold of above Latin1.  Add everything in
13447                  * its fold closure to the list that this node should match.
13448                  * The fold closures data structure is a hash with the keys
13449                  * being the UTF-8 of every character that is folded to, like
13450                  * 'k', and the values each an array of all code points that
13451                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13452                  * Multi-character folds are not included */
13453                 if ((listp = hv_fetch(PL_utf8_foldclosures,
13454                                       (char *) foldbuf, foldlen, FALSE)))
13455                 {
13456                     AV* list = (AV*) *listp;
13457                     IV k;
13458                     for (k = 0; k <= av_len(list); k++) {
13459                         SV** c_p = av_fetch(list, k, FALSE);
13460                         UV c;
13461                         if (c_p == NULL) {
13462                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13463                         }
13464                         c = SvUV(*c_p);
13465
13466                         /* /aa doesn't allow folds between ASCII and non-; /l
13467                          * doesn't allow them between above and below 256 */
13468                         if ((ASCII_FOLD_RESTRICTED
13469                                   && (isASCII(c) != isASCII(j)))
13470                             || (LOC && ((c < 256) != (j < 256))))
13471                         {
13472                             continue;
13473                         }
13474
13475                         /* Folds involving non-ascii Latin1 characters
13476                          * under /d are added to a separate list */
13477                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13478                         {
13479                             cp_list = add_cp_to_invlist(cp_list, c);
13480                         }
13481                         else {
13482                           depends_list = add_cp_to_invlist(depends_list, c);
13483                         }
13484                     }
13485                 }
13486             }
13487         }
13488         SvREFCNT_dec_NN(fold_intersection);
13489     }
13490
13491     /* And combine the result (if any) with any inversion list from posix
13492      * classes.  The lists are kept separate up to now because we don't want to
13493      * fold the classes (folding of those is automatically handled by the swash
13494      * fetching code) */
13495     if (posixes) {
13496         if (! DEPENDS_SEMANTICS) {
13497             if (cp_list) {
13498                 _invlist_union(cp_list, posixes, &cp_list);
13499                 SvREFCNT_dec_NN(posixes);
13500             }
13501             else {
13502                 cp_list = posixes;
13503             }
13504         }
13505         else {
13506             /* Under /d, we put into a separate list the Latin1 things that
13507              * match only when the target string is utf8 */
13508             SV* nonascii_but_latin1_properties = NULL;
13509             _invlist_intersection(posixes, PL_Latin1,
13510                                   &nonascii_but_latin1_properties);
13511             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13512                               &nonascii_but_latin1_properties);
13513             _invlist_subtract(posixes, nonascii_but_latin1_properties,
13514                               &posixes);
13515             if (cp_list) {
13516                 _invlist_union(cp_list, posixes, &cp_list);
13517                 SvREFCNT_dec_NN(posixes);
13518             }
13519             else {
13520                 cp_list = posixes;
13521             }
13522
13523             if (depends_list) {
13524                 _invlist_union(depends_list, nonascii_but_latin1_properties,
13525                                &depends_list);
13526                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13527             }
13528             else {
13529                 depends_list = nonascii_but_latin1_properties;
13530             }
13531         }
13532     }
13533
13534     /* And combine the result (if any) with any inversion list from properties.
13535      * The lists are kept separate up to now so that we can distinguish the two
13536      * in regards to matching above-Unicode.  A run-time warning is generated
13537      * if a Unicode property is matched against a non-Unicode code point. But,
13538      * we allow user-defined properties to match anything, without any warning,
13539      * and we also suppress the warning if there is a portion of the character
13540      * class that isn't a Unicode property, and which matches above Unicode, \W
13541      * or [\x{110000}] for example.
13542      * (Note that in this case, unlike the Posix one above, there is no
13543      * <depends_list>, because having a Unicode property forces Unicode
13544      * semantics */
13545     if (properties) {
13546         bool warn_super = ! has_user_defined_property;
13547         if (cp_list) {
13548
13549             /* If it matters to the final outcome, see if a non-property
13550              * component of the class matches above Unicode.  If so, the
13551              * warning gets suppressed.  This is true even if just a single
13552              * such code point is specified, as though not strictly correct if
13553              * another such code point is matched against, the fact that they
13554              * are using above-Unicode code points indicates they should know
13555              * the issues involved */
13556             if (warn_super) {
13557                 bool non_prop_matches_above_Unicode =
13558                             runtime_posix_matches_above_Unicode
13559                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13560                 if (invert) {
13561                     non_prop_matches_above_Unicode =
13562                                             !  non_prop_matches_above_Unicode;
13563                 }
13564                 warn_super = ! non_prop_matches_above_Unicode;
13565             }
13566
13567             _invlist_union(properties, cp_list, &cp_list);
13568             SvREFCNT_dec_NN(properties);
13569         }
13570         else {
13571             cp_list = properties;
13572         }
13573
13574         if (warn_super) {
13575             OP(ret) = ANYOF_WARN_SUPER;
13576         }
13577     }
13578
13579     /* Here, we have calculated what code points should be in the character
13580      * class.
13581      *
13582      * Now we can see about various optimizations.  Fold calculation (which we
13583      * did above) needs to take place before inversion.  Otherwise /[^k]/i
13584      * would invert to include K, which under /i would match k, which it
13585      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13586      * folded until runtime */
13587
13588     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13589      * at compile time.  Besides not inverting folded locale now, we can't
13590      * invert if there are things such as \w, which aren't known until runtime
13591      * */
13592     if (invert
13593         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13594         && ! depends_list
13595         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13596     {
13597         _invlist_invert(cp_list);
13598
13599         /* Any swash can't be used as-is, because we've inverted things */
13600         if (swash) {
13601             SvREFCNT_dec_NN(swash);
13602             swash = NULL;
13603         }
13604
13605         /* Clear the invert flag since have just done it here */
13606         invert = FALSE;
13607     }
13608
13609     if (ret_invlist) {
13610         *ret_invlist = cp_list;
13611
13612         /* Discard the generated node */
13613         if (SIZE_ONLY) {
13614             RExC_size = orig_size;
13615         }
13616         else {
13617             RExC_emit = orig_emit;
13618         }
13619         return orig_emit;
13620     }
13621
13622     /* If we didn't do folding, it's because some information isn't available
13623      * until runtime; set the run-time fold flag for these.  (We don't have to
13624      * worry about properties folding, as that is taken care of by the swash
13625      * fetching) */
13626     if (FOLD && LOC)
13627     {
13628        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13629     }
13630
13631     /* Some character classes are equivalent to other nodes.  Such nodes take
13632      * up less room and generally fewer operations to execute than ANYOF nodes.
13633      * Above, we checked for and optimized into some such equivalents for
13634      * certain common classes that are easy to test.  Getting to this point in
13635      * the code means that the class didn't get optimized there.  Since this
13636      * code is only executed in Pass 2, it is too late to save space--it has
13637      * been allocated in Pass 1, and currently isn't given back.  But turning
13638      * things into an EXACTish node can allow the optimizer to join it to any
13639      * adjacent such nodes.  And if the class is equivalent to things like /./,
13640      * expensive run-time swashes can be avoided.  Now that we have more
13641      * complete information, we can find things necessarily missed by the
13642      * earlier code.  I (khw) am not sure how much to look for here.  It would
13643      * be easy, but perhaps too slow, to check any candidates against all the
13644      * node types they could possibly match using _invlistEQ(). */
13645
13646     if (cp_list
13647         && ! invert
13648         && ! depends_list
13649         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13650         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13651     {
13652         UV start, end;
13653         U8 op = END;  /* The optimzation node-type */
13654         const char * cur_parse= RExC_parse;
13655
13656         invlist_iterinit(cp_list);
13657         if (! invlist_iternext(cp_list, &start, &end)) {
13658
13659             /* Here, the list is empty.  This happens, for example, when a
13660              * Unicode property is the only thing in the character class, and
13661              * it doesn't match anything.  (perluniprops.pod notes such
13662              * properties) */
13663             op = OPFAIL;
13664             *flagp |= HASWIDTH|SIMPLE;
13665         }
13666         else if (start == end) {    /* The range is a single code point */
13667             if (! invlist_iternext(cp_list, &start, &end)
13668
13669                     /* Don't do this optimization if it would require changing
13670                      * the pattern to UTF-8 */
13671                 && (start < 256 || UTF))
13672             {
13673                 /* Here, the list contains a single code point.  Can optimize
13674                  * into an EXACT node */
13675
13676                 value = start;
13677
13678                 if (! FOLD) {
13679                     op = EXACT;
13680                 }
13681                 else if (LOC) {
13682
13683                     /* A locale node under folding with one code point can be
13684                      * an EXACTFL, as its fold won't be calculated until
13685                      * runtime */
13686                     op = EXACTFL;
13687                 }
13688                 else {
13689
13690                     /* Here, we are generally folding, but there is only one
13691                      * code point to match.  If we have to, we use an EXACT
13692                      * node, but it would be better for joining with adjacent
13693                      * nodes in the optimization pass if we used the same
13694                      * EXACTFish node that any such are likely to be.  We can
13695                      * do this iff the code point doesn't participate in any
13696                      * folds.  For example, an EXACTF of a colon is the same as
13697                      * an EXACT one, since nothing folds to or from a colon. */
13698                     if (value < 256) {
13699                         if (IS_IN_SOME_FOLD_L1(value)) {
13700                             op = EXACT;
13701                         }
13702                     }
13703                     else {
13704                         if (! PL_utf8_foldable) {
13705                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13706                                                 &PL_sv_undef, 1, 0);
13707                             PL_utf8_foldable = _get_swash_invlist(swash);
13708                             SvREFCNT_dec_NN(swash);
13709                         }
13710                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13711                             op = EXACT;
13712                         }
13713                     }
13714
13715                     /* If we haven't found the node type, above, it means we
13716                      * can use the prevailing one */
13717                     if (op == END) {
13718                         op = compute_EXACTish(pRExC_state);
13719                     }
13720                 }
13721             }
13722         }
13723         else if (start == 0) {
13724             if (end == UV_MAX) {
13725                 op = SANY;
13726                 *flagp |= HASWIDTH|SIMPLE;
13727                 RExC_naughty++;
13728             }
13729             else if (end == '\n' - 1
13730                     && invlist_iternext(cp_list, &start, &end)
13731                     && start == '\n' + 1 && end == UV_MAX)
13732             {
13733                 op = REG_ANY;
13734                 *flagp |= HASWIDTH|SIMPLE;
13735                 RExC_naughty++;
13736             }
13737         }
13738         invlist_iterfinish(cp_list);
13739
13740         if (op != END) {
13741             RExC_parse = (char *)orig_parse;
13742             RExC_emit = (regnode *)orig_emit;
13743
13744             ret = reg_node(pRExC_state, op);
13745
13746             RExC_parse = (char *)cur_parse;
13747
13748             if (PL_regkind[op] == EXACT) {
13749                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13750             }
13751
13752             SvREFCNT_dec_NN(cp_list);
13753             return ret;
13754         }
13755     }
13756
13757     /* Here, <cp_list> contains all the code points we can determine at
13758      * compile time that match under all conditions.  Go through it, and
13759      * for things that belong in the bitmap, put them there, and delete from
13760      * <cp_list>.  While we are at it, see if everything above 255 is in the
13761      * list, and if so, set a flag to speed up execution */
13762     ANYOF_BITMAP_ZERO(ret);
13763     if (cp_list) {
13764
13765         /* This gets set if we actually need to modify things */
13766         bool change_invlist = FALSE;
13767
13768         UV start, end;
13769
13770         /* Start looking through <cp_list> */
13771         invlist_iterinit(cp_list);
13772         while (invlist_iternext(cp_list, &start, &end)) {
13773             UV high;
13774             int i;
13775
13776             if (end == UV_MAX && start <= 256) {
13777                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13778             }
13779
13780             /* Quit if are above what we should change */
13781             if (start > 255) {
13782                 break;
13783             }
13784
13785             change_invlist = TRUE;
13786
13787             /* Set all the bits in the range, up to the max that we are doing */
13788             high = (end < 255) ? end : 255;
13789             for (i = start; i <= (int) high; i++) {
13790                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13791                     ANYOF_BITMAP_SET(ret, i);
13792                     prevvalue = value;
13793                     value = i;
13794                 }
13795             }
13796         }
13797         invlist_iterfinish(cp_list);
13798
13799         /* Done with loop; remove any code points that are in the bitmap from
13800          * <cp_list> */
13801         if (change_invlist) {
13802             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13803         }
13804
13805         /* If have completely emptied it, remove it completely */
13806         if (_invlist_len(cp_list) == 0) {
13807             SvREFCNT_dec_NN(cp_list);
13808             cp_list = NULL;
13809         }
13810     }
13811
13812     if (invert) {
13813         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13814     }
13815
13816     /* Here, the bitmap has been populated with all the Latin1 code points that
13817      * always match.  Can now add to the overall list those that match only
13818      * when the target string is UTF-8 (<depends_list>). */
13819     if (depends_list) {
13820         if (cp_list) {
13821             _invlist_union(cp_list, depends_list, &cp_list);
13822             SvREFCNT_dec_NN(depends_list);
13823         }
13824         else {
13825             cp_list = depends_list;
13826         }
13827     }
13828
13829     /* If there is a swash and more than one element, we can't use the swash in
13830      * the optimization below. */
13831     if (swash && element_count > 1) {
13832         SvREFCNT_dec_NN(swash);
13833         swash = NULL;
13834     }
13835
13836     if (! cp_list
13837         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13838     {
13839         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13840     }
13841     else {
13842         /* av[0] stores the character class description in its textual form:
13843          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13844          *       appropriate swash, and is also useful for dumping the regnode.
13845          * av[1] if NULL, is a placeholder to later contain the swash computed
13846          *       from av[0].  But if no further computation need be done, the
13847          *       swash is stored there now.
13848          * av[2] stores the cp_list inversion list for use in addition or
13849          *       instead of av[0]; used only if av[1] is NULL
13850          * av[3] is set if any component of the class is from a user-defined
13851          *       property; used only if av[1] is NULL */
13852         AV * const av = newAV();
13853         SV *rv;
13854
13855         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13856                         ? SvREFCNT_inc(listsv) : &PL_sv_undef);
13857         if (swash) {
13858             av_store(av, 1, swash);
13859             SvREFCNT_dec_NN(cp_list);
13860         }
13861         else {
13862             av_store(av, 1, NULL);
13863             if (cp_list) {
13864                 av_store(av, 2, cp_list);
13865                 av_store(av, 3, newSVuv(has_user_defined_property));
13866             }
13867         }
13868
13869         rv = newRV_noinc(MUTABLE_SV(av));
13870         n = add_data(pRExC_state, 1, "s");
13871         RExC_rxi->data->data[n] = (void*)rv;
13872         ARG_SET(ret, n);
13873     }
13874
13875     *flagp |= HASWIDTH|SIMPLE;
13876     return ret;
13877 }
13878 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13879
13880
13881 /* reg_skipcomment()
13882
13883    Absorbs an /x style # comments from the input stream.
13884    Returns true if there is more text remaining in the stream.
13885    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13886    terminates the pattern without including a newline.
13887
13888    Note its the callers responsibility to ensure that we are
13889    actually in /x mode
13890
13891 */
13892
13893 STATIC bool
13894 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13895 {
13896     bool ended = 0;
13897
13898     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13899
13900     while (RExC_parse < RExC_end)
13901         if (*RExC_parse++ == '\n') {
13902             ended = 1;
13903             break;
13904         }
13905     if (!ended) {
13906         /* we ran off the end of the pattern without ending
13907            the comment, so we have to add an \n when wrapping */
13908         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13909         return 0;
13910     } else
13911         return 1;
13912 }
13913
13914 /* nextchar()
13915
13916    Advances the parse position, and optionally absorbs
13917    "whitespace" from the inputstream.
13918
13919    Without /x "whitespace" means (?#...) style comments only,
13920    with /x this means (?#...) and # comments and whitespace proper.
13921
13922    Returns the RExC_parse point from BEFORE the scan occurs.
13923
13924    This is the /x friendly way of saying RExC_parse++.
13925 */
13926
13927 STATIC char*
13928 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13929 {
13930     char* const retval = RExC_parse++;
13931
13932     PERL_ARGS_ASSERT_NEXTCHAR;
13933
13934     for (;;) {
13935         if (RExC_end - RExC_parse >= 3
13936             && *RExC_parse == '('
13937             && RExC_parse[1] == '?'
13938             && RExC_parse[2] == '#')
13939         {
13940             while (*RExC_parse != ')') {
13941                 if (RExC_parse == RExC_end)
13942                     FAIL("Sequence (?#... not terminated");
13943                 RExC_parse++;
13944             }
13945             RExC_parse++;
13946             continue;
13947         }
13948         if (RExC_flags & RXf_PMf_EXTENDED) {
13949             if (isSPACE(*RExC_parse)) {
13950                 RExC_parse++;
13951                 continue;
13952             }
13953             else if (*RExC_parse == '#') {
13954                 if ( reg_skipcomment( pRExC_state ) )
13955                     continue;
13956             }
13957         }
13958         return retval;
13959     }
13960 }
13961
13962 /*
13963 - reg_node - emit a node
13964 */
13965 STATIC regnode *                        /* Location. */
13966 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13967 {
13968     dVAR;
13969     regnode *ptr;
13970     regnode * const ret = RExC_emit;
13971     GET_RE_DEBUG_FLAGS_DECL;
13972
13973     PERL_ARGS_ASSERT_REG_NODE;
13974
13975     if (SIZE_ONLY) {
13976         SIZE_ALIGN(RExC_size);
13977         RExC_size += 1;
13978         return(ret);
13979     }
13980     if (RExC_emit >= RExC_emit_bound)
13981         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13982                    op, RExC_emit, RExC_emit_bound);
13983
13984     NODE_ALIGN_FILL(ret);
13985     ptr = ret;
13986     FILL_ADVANCE_NODE(ptr, op);
13987 #ifdef RE_TRACK_PATTERN_OFFSETS
13988     if (RExC_offsets) {         /* MJD */
13989         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13990               "reg_node", __LINE__,
13991               PL_reg_name[op],
13992               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13993                 ? "Overwriting end of array!\n" : "OK",
13994               (UV)(RExC_emit - RExC_emit_start),
13995               (UV)(RExC_parse - RExC_start),
13996               (UV)RExC_offsets[0]));
13997         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13998     }
13999 #endif
14000     RExC_emit = ptr;
14001     return(ret);
14002 }
14003
14004 /*
14005 - reganode - emit a node with an argument
14006 */
14007 STATIC regnode *                        /* Location. */
14008 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14009 {
14010     dVAR;
14011     regnode *ptr;
14012     regnode * const ret = RExC_emit;
14013     GET_RE_DEBUG_FLAGS_DECL;
14014
14015     PERL_ARGS_ASSERT_REGANODE;
14016
14017     if (SIZE_ONLY) {
14018         SIZE_ALIGN(RExC_size);
14019         RExC_size += 2;
14020         /*
14021            We can't do this:
14022
14023            assert(2==regarglen[op]+1);
14024
14025            Anything larger than this has to allocate the extra amount.
14026            If we changed this to be:
14027
14028            RExC_size += (1 + regarglen[op]);
14029
14030            then it wouldn't matter. Its not clear what side effect
14031            might come from that so its not done so far.
14032            -- dmq
14033         */
14034         return(ret);
14035     }
14036     if (RExC_emit >= RExC_emit_bound)
14037         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14038                    op, RExC_emit, RExC_emit_bound);
14039
14040     NODE_ALIGN_FILL(ret);
14041     ptr = ret;
14042     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14043 #ifdef RE_TRACK_PATTERN_OFFSETS
14044     if (RExC_offsets) {         /* MJD */
14045         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14046               "reganode",
14047               __LINE__,
14048               PL_reg_name[op],
14049               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14050               "Overwriting end of array!\n" : "OK",
14051               (UV)(RExC_emit - RExC_emit_start),
14052               (UV)(RExC_parse - RExC_start),
14053               (UV)RExC_offsets[0]));
14054         Set_Cur_Node_Offset;
14055     }
14056 #endif
14057     RExC_emit = ptr;
14058     return(ret);
14059 }
14060
14061 /*
14062 - reguni - emit (if appropriate) a Unicode character
14063 */
14064 STATIC STRLEN
14065 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14066 {
14067     dVAR;
14068
14069     PERL_ARGS_ASSERT_REGUNI;
14070
14071     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14072 }
14073
14074 /*
14075 - reginsert - insert an operator in front of already-emitted operand
14076 *
14077 * Means relocating the operand.
14078 */
14079 STATIC void
14080 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14081 {
14082     dVAR;
14083     regnode *src;
14084     regnode *dst;
14085     regnode *place;
14086     const int offset = regarglen[(U8)op];
14087     const int size = NODE_STEP_REGNODE + offset;
14088     GET_RE_DEBUG_FLAGS_DECL;
14089
14090     PERL_ARGS_ASSERT_REGINSERT;
14091     PERL_UNUSED_ARG(depth);
14092 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14093     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14094     if (SIZE_ONLY) {
14095         RExC_size += size;
14096         return;
14097     }
14098
14099     src = RExC_emit;
14100     RExC_emit += size;
14101     dst = RExC_emit;
14102     if (RExC_open_parens) {
14103         int paren;
14104         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14105         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14106             if ( RExC_open_parens[paren] >= opnd ) {
14107                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14108                 RExC_open_parens[paren] += size;
14109             } else {
14110                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14111             }
14112             if ( RExC_close_parens[paren] >= opnd ) {
14113                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14114                 RExC_close_parens[paren] += size;
14115             } else {
14116                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14117             }
14118         }
14119     }
14120
14121     while (src > opnd) {
14122         StructCopy(--src, --dst, regnode);
14123 #ifdef RE_TRACK_PATTERN_OFFSETS
14124         if (RExC_offsets) {     /* MJD 20010112 */
14125             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14126                   "reg_insert",
14127                   __LINE__,
14128                   PL_reg_name[op],
14129                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14130                     ? "Overwriting end of array!\n" : "OK",
14131                   (UV)(src - RExC_emit_start),
14132                   (UV)(dst - RExC_emit_start),
14133                   (UV)RExC_offsets[0]));
14134             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14135             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14136         }
14137 #endif
14138     }
14139
14140
14141     place = opnd;               /* Op node, where operand used to be. */
14142 #ifdef RE_TRACK_PATTERN_OFFSETS
14143     if (RExC_offsets) {         /* MJD */
14144         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14145               "reginsert",
14146               __LINE__,
14147               PL_reg_name[op],
14148               (UV)(place - RExC_emit_start) > RExC_offsets[0]
14149               ? "Overwriting end of array!\n" : "OK",
14150               (UV)(place - RExC_emit_start),
14151               (UV)(RExC_parse - RExC_start),
14152               (UV)RExC_offsets[0]));
14153         Set_Node_Offset(place, RExC_parse);
14154         Set_Node_Length(place, 1);
14155     }
14156 #endif
14157     src = NEXTOPER(place);
14158     FILL_ADVANCE_NODE(place, op);
14159     Zero(src, offset, regnode);
14160 }
14161
14162 /*
14163 - regtail - set the next-pointer at the end of a node chain of p to val.
14164 - SEE ALSO: regtail_study
14165 */
14166 /* TODO: All three parms should be const */
14167 STATIC void
14168 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14169 {
14170     dVAR;
14171     regnode *scan;
14172     GET_RE_DEBUG_FLAGS_DECL;
14173
14174     PERL_ARGS_ASSERT_REGTAIL;
14175 #ifndef DEBUGGING
14176     PERL_UNUSED_ARG(depth);
14177 #endif
14178
14179     if (SIZE_ONLY)
14180         return;
14181
14182     /* Find last node. */
14183     scan = p;
14184     for (;;) {
14185         regnode * const temp = regnext(scan);
14186         DEBUG_PARSE_r({
14187             SV * const mysv=sv_newmortal();
14188             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14189             regprop(RExC_rx, mysv, scan);
14190             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14191                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14192                     (temp == NULL ? "->" : ""),
14193                     (temp == NULL ? PL_reg_name[OP(val)] : "")
14194             );
14195         });
14196         if (temp == NULL)
14197             break;
14198         scan = temp;
14199     }
14200
14201     if (reg_off_by_arg[OP(scan)]) {
14202         ARG_SET(scan, val - scan);
14203     }
14204     else {
14205         NEXT_OFF(scan) = val - scan;
14206     }
14207 }
14208
14209 #ifdef DEBUGGING
14210 /*
14211 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14212 - Look for optimizable sequences at the same time.
14213 - currently only looks for EXACT chains.
14214
14215 This is experimental code. The idea is to use this routine to perform
14216 in place optimizations on branches and groups as they are constructed,
14217 with the long term intention of removing optimization from study_chunk so
14218 that it is purely analytical.
14219
14220 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14221 to control which is which.
14222
14223 */
14224 /* TODO: All four parms should be const */
14225
14226 STATIC U8
14227 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14228 {
14229     dVAR;
14230     regnode *scan;
14231     U8 exact = PSEUDO;
14232 #ifdef EXPERIMENTAL_INPLACESCAN
14233     I32 min = 0;
14234 #endif
14235     GET_RE_DEBUG_FLAGS_DECL;
14236
14237     PERL_ARGS_ASSERT_REGTAIL_STUDY;
14238
14239
14240     if (SIZE_ONLY)
14241         return exact;
14242
14243     /* Find last node. */
14244
14245     scan = p;
14246     for (;;) {
14247         regnode * const temp = regnext(scan);
14248 #ifdef EXPERIMENTAL_INPLACESCAN
14249         if (PL_regkind[OP(scan)] == EXACT) {
14250             bool has_exactf_sharp_s;    /* Unexamined in this routine */
14251             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14252                 return EXACT;
14253         }
14254 #endif
14255         if ( exact ) {
14256             switch (OP(scan)) {
14257                 case EXACT:
14258                 case EXACTF:
14259                 case EXACTFA:
14260                 case EXACTFU:
14261                 case EXACTFU_SS:
14262                 case EXACTFU_TRICKYFOLD:
14263                 case EXACTFL:
14264                         if( exact == PSEUDO )
14265                             exact= OP(scan);
14266                         else if ( exact != OP(scan) )
14267                             exact= 0;
14268                 case NOTHING:
14269                     break;
14270                 default:
14271                     exact= 0;
14272             }
14273         }
14274         DEBUG_PARSE_r({
14275             SV * const mysv=sv_newmortal();
14276             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14277             regprop(RExC_rx, mysv, scan);
14278             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14279                 SvPV_nolen_const(mysv),
14280                 REG_NODE_NUM(scan),
14281                 PL_reg_name[exact]);
14282         });
14283         if (temp == NULL)
14284             break;
14285         scan = temp;
14286     }
14287     DEBUG_PARSE_r({
14288         SV * const mysv_val=sv_newmortal();
14289         DEBUG_PARSE_MSG("");
14290         regprop(RExC_rx, mysv_val, val);
14291         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14292                       SvPV_nolen_const(mysv_val),
14293                       (IV)REG_NODE_NUM(val),
14294                       (IV)(val - scan)
14295         );
14296     });
14297     if (reg_off_by_arg[OP(scan)]) {
14298         ARG_SET(scan, val - scan);
14299     }
14300     else {
14301         NEXT_OFF(scan) = val - scan;
14302     }
14303
14304     return exact;
14305 }
14306 #endif
14307
14308 /*
14309  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14310  */
14311 #ifdef DEBUGGING
14312 static void
14313 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14314 {
14315     int bit;
14316     int set=0;
14317     regex_charset cs;
14318
14319     for (bit=0; bit<32; bit++) {
14320         if (flags & (1<<bit)) {
14321             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
14322                 continue;
14323             }
14324             if (!set++ && lead)
14325                 PerlIO_printf(Perl_debug_log, "%s",lead);
14326             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14327         }
14328     }
14329     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14330             if (!set++ && lead) {
14331                 PerlIO_printf(Perl_debug_log, "%s",lead);
14332             }
14333             switch (cs) {
14334                 case REGEX_UNICODE_CHARSET:
14335                     PerlIO_printf(Perl_debug_log, "UNICODE");
14336                     break;
14337                 case REGEX_LOCALE_CHARSET:
14338                     PerlIO_printf(Perl_debug_log, "LOCALE");
14339                     break;
14340                 case REGEX_ASCII_RESTRICTED_CHARSET:
14341                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14342                     break;
14343                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14344                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14345                     break;
14346                 default:
14347                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14348                     break;
14349             }
14350     }
14351     if (lead)  {
14352         if (set)
14353             PerlIO_printf(Perl_debug_log, "\n");
14354         else
14355             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14356     }
14357 }
14358 #endif
14359
14360 void
14361 Perl_regdump(pTHX_ const regexp *r)
14362 {
14363 #ifdef DEBUGGING
14364     dVAR;
14365     SV * const sv = sv_newmortal();
14366     SV *dsv= sv_newmortal();
14367     RXi_GET_DECL(r,ri);
14368     GET_RE_DEBUG_FLAGS_DECL;
14369
14370     PERL_ARGS_ASSERT_REGDUMP;
14371
14372     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14373
14374     /* Header fields of interest. */
14375     if (r->anchored_substr) {
14376         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14377             RE_SV_DUMPLEN(r->anchored_substr), 30);
14378         PerlIO_printf(Perl_debug_log,
14379                       "anchored %s%s at %"IVdf" ",
14380                       s, RE_SV_TAIL(r->anchored_substr),
14381                       (IV)r->anchored_offset);
14382     } else if (r->anchored_utf8) {
14383         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14384             RE_SV_DUMPLEN(r->anchored_utf8), 30);
14385         PerlIO_printf(Perl_debug_log,
14386                       "anchored utf8 %s%s at %"IVdf" ",
14387                       s, RE_SV_TAIL(r->anchored_utf8),
14388                       (IV)r->anchored_offset);
14389     }
14390     if (r->float_substr) {
14391         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14392             RE_SV_DUMPLEN(r->float_substr), 30);
14393         PerlIO_printf(Perl_debug_log,
14394                       "floating %s%s at %"IVdf"..%"UVuf" ",
14395                       s, RE_SV_TAIL(r->float_substr),
14396                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14397     } else if (r->float_utf8) {
14398         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14399             RE_SV_DUMPLEN(r->float_utf8), 30);
14400         PerlIO_printf(Perl_debug_log,
14401                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14402                       s, RE_SV_TAIL(r->float_utf8),
14403                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14404     }
14405     if (r->check_substr || r->check_utf8)
14406         PerlIO_printf(Perl_debug_log,
14407                       (const char *)
14408                       (r->check_substr == r->float_substr
14409                        && r->check_utf8 == r->float_utf8
14410                        ? "(checking floating" : "(checking anchored"));
14411     if (r->extflags & RXf_NOSCAN)
14412         PerlIO_printf(Perl_debug_log, " noscan");
14413     if (r->extflags & RXf_CHECK_ALL)
14414         PerlIO_printf(Perl_debug_log, " isall");
14415     if (r->check_substr || r->check_utf8)
14416         PerlIO_printf(Perl_debug_log, ") ");
14417
14418     if (ri->regstclass) {
14419         regprop(r, sv, ri->regstclass);
14420         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14421     }
14422     if (r->extflags & RXf_ANCH) {
14423         PerlIO_printf(Perl_debug_log, "anchored");
14424         if (r->extflags & RXf_ANCH_BOL)
14425             PerlIO_printf(Perl_debug_log, "(BOL)");
14426         if (r->extflags & RXf_ANCH_MBOL)
14427             PerlIO_printf(Perl_debug_log, "(MBOL)");
14428         if (r->extflags & RXf_ANCH_SBOL)
14429             PerlIO_printf(Perl_debug_log, "(SBOL)");
14430         if (r->extflags & RXf_ANCH_GPOS)
14431             PerlIO_printf(Perl_debug_log, "(GPOS)");
14432         PerlIO_putc(Perl_debug_log, ' ');
14433     }
14434     if (r->extflags & RXf_GPOS_SEEN)
14435         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14436     if (r->intflags & PREGf_SKIP)
14437         PerlIO_printf(Perl_debug_log, "plus ");
14438     if (r->intflags & PREGf_IMPLICIT)
14439         PerlIO_printf(Perl_debug_log, "implicit ");
14440     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14441     if (r->extflags & RXf_EVAL_SEEN)
14442         PerlIO_printf(Perl_debug_log, "with eval ");
14443     PerlIO_printf(Perl_debug_log, "\n");
14444     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14445 #else
14446     PERL_ARGS_ASSERT_REGDUMP;
14447     PERL_UNUSED_CONTEXT;
14448     PERL_UNUSED_ARG(r);
14449 #endif  /* DEBUGGING */
14450 }
14451
14452 /*
14453 - regprop - printable representation of opcode
14454 */
14455 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14456 STMT_START { \
14457         if (do_sep) {                           \
14458             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14459             if (flags & ANYOF_INVERT)           \
14460                 /*make sure the invert info is in each */ \
14461                 sv_catpvs(sv, "^");             \
14462             do_sep = 0;                         \
14463         }                                       \
14464 } STMT_END
14465
14466 void
14467 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14468 {
14469 #ifdef DEBUGGING
14470     dVAR;
14471     int k;
14472
14473     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14474     static const char * const anyofs[] = {
14475 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14476     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14477     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14478     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14479     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14480     || _CC_VERTSPACE != 16
14481   #error Need to adjust order of anyofs[]
14482 #endif
14483         "[\\w]",
14484         "[\\W]",
14485         "[\\d]",
14486         "[\\D]",
14487         "[:alpha:]",
14488         "[:^alpha:]",
14489         "[:lower:]",
14490         "[:^lower:]",
14491         "[:upper:]",
14492         "[:^upper:]",
14493         "[:punct:]",
14494         "[:^punct:]",
14495         "[:print:]",
14496         "[:^print:]",
14497         "[:alnum:]",
14498         "[:^alnum:]",
14499         "[:graph:]",
14500         "[:^graph:]",
14501         "[:cased:]",
14502         "[:^cased:]",
14503         "[\\s]",
14504         "[\\S]",
14505         "[:blank:]",
14506         "[:^blank:]",
14507         "[:xdigit:]",
14508         "[:^xdigit:]",
14509         "[:space:]",
14510         "[:^space:]",
14511         "[:cntrl:]",
14512         "[:^cntrl:]",
14513         "[:ascii:]",
14514         "[:^ascii:]",
14515         "[\\v]",
14516         "[\\V]"
14517     };
14518     RXi_GET_DECL(prog,progi);
14519     GET_RE_DEBUG_FLAGS_DECL;
14520
14521     PERL_ARGS_ASSERT_REGPROP;
14522
14523     sv_setpvs(sv, "");
14524
14525     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
14526         /* It would be nice to FAIL() here, but this may be called from
14527            regexec.c, and it would be hard to supply pRExC_state. */
14528         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14529     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14530
14531     k = PL_regkind[OP(o)];
14532
14533     if (k == EXACT) {
14534         sv_catpvs(sv, " ");
14535         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14536          * is a crude hack but it may be the best for now since
14537          * we have no flag "this EXACTish node was UTF-8"
14538          * --jhi */
14539         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14540                   PERL_PV_ESCAPE_UNI_DETECT |
14541                   PERL_PV_ESCAPE_NONASCII   |
14542                   PERL_PV_PRETTY_ELLIPSES   |
14543                   PERL_PV_PRETTY_LTGT       |
14544                   PERL_PV_PRETTY_NOCLEAR
14545                   );
14546     } else if (k == TRIE) {
14547         /* print the details of the trie in dumpuntil instead, as
14548          * progi->data isn't available here */
14549         const char op = OP(o);
14550         const U32 n = ARG(o);
14551         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14552                (reg_ac_data *)progi->data->data[n] :
14553                NULL;
14554         const reg_trie_data * const trie
14555             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14556
14557         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14558         DEBUG_TRIE_COMPILE_r(
14559             Perl_sv_catpvf(aTHX_ sv,
14560                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14561                 (UV)trie->startstate,
14562                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14563                 (UV)trie->wordcount,
14564                 (UV)trie->minlen,
14565                 (UV)trie->maxlen,
14566                 (UV)TRIE_CHARCOUNT(trie),
14567                 (UV)trie->uniquecharcount
14568             )
14569         );
14570         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14571             int i;
14572             int rangestart = -1;
14573             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14574             sv_catpvs(sv, "[");
14575             for (i = 0; i <= 256; i++) {
14576                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
14577                     if (rangestart == -1)
14578                         rangestart = i;
14579                 } else if (rangestart != -1) {
14580                     if (i <= rangestart + 3)
14581                         for (; rangestart < i; rangestart++)
14582                             put_byte(sv, rangestart);
14583                     else {
14584                         put_byte(sv, rangestart);
14585                         sv_catpvs(sv, "-");
14586                         put_byte(sv, i - 1);
14587                     }
14588                     rangestart = -1;
14589                 }
14590             }
14591             sv_catpvs(sv, "]");
14592         }
14593
14594     } else if (k == CURLY) {
14595         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14596             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14597         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14598     }
14599     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
14600         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14601     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14602         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
14603         if ( RXp_PAREN_NAMES(prog) ) {
14604             if ( k != REF || (OP(o) < NREF)) {
14605                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14606                 SV **name= av_fetch(list, ARG(o), 0 );
14607                 if (name)
14608                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14609             }
14610             else {
14611                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14612                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14613                 I32 *nums=(I32*)SvPVX(sv_dat);
14614                 SV **name= av_fetch(list, nums[0], 0 );
14615                 I32 n;
14616                 if (name) {
14617                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
14618                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14619                                     (n ? "," : ""), (IV)nums[n]);
14620                     }
14621                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14622                 }
14623             }
14624         }
14625     } else if (k == GOSUB)
14626         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14627     else if (k == VERB) {
14628         if (!o->flags)
14629             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14630                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14631     } else if (k == LOGICAL)
14632         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
14633     else if (k == ANYOF) {
14634         int i, rangestart = -1;
14635         const U8 flags = ANYOF_FLAGS(o);
14636         int do_sep = 0;
14637
14638
14639         if (flags & ANYOF_LOCALE)
14640             sv_catpvs(sv, "{loc}");
14641         if (flags & ANYOF_LOC_FOLD)
14642             sv_catpvs(sv, "{i}");
14643         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14644         if (flags & ANYOF_INVERT)
14645             sv_catpvs(sv, "^");
14646
14647         /* output what the standard cp 0-255 bitmap matches */
14648         for (i = 0; i <= 256; i++) {
14649             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14650                 if (rangestart == -1)
14651                     rangestart = i;
14652             } else if (rangestart != -1) {
14653                 if (i <= rangestart + 3)
14654                     for (; rangestart < i; rangestart++)
14655                         put_byte(sv, rangestart);
14656                 else {
14657                     put_byte(sv, rangestart);
14658                     sv_catpvs(sv, "-");
14659                     put_byte(sv, i - 1);
14660                 }
14661                 do_sep = 1;
14662                 rangestart = -1;
14663             }
14664         }
14665
14666         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14667         /* output any special charclass tests (used entirely under use locale) */
14668         if (ANYOF_CLASS_TEST_ANY_SET(o))
14669             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14670                 if (ANYOF_CLASS_TEST(o,i)) {
14671                     sv_catpv(sv, anyofs[i]);
14672                     do_sep = 1;
14673                 }
14674
14675         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14676
14677         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14678             sv_catpvs(sv, "{non-utf8-latin1-all}");
14679         }
14680
14681         /* output information about the unicode matching */
14682         if (flags & ANYOF_UNICODE_ALL)
14683             sv_catpvs(sv, "{unicode_all}");
14684         else if (ANYOF_NONBITMAP(o))
14685             sv_catpvs(sv, "{unicode}");
14686         if (flags & ANYOF_NONBITMAP_NON_UTF8)
14687             sv_catpvs(sv, "{outside bitmap}");
14688
14689         if (ANYOF_NONBITMAP(o)) {
14690             SV *lv; /* Set if there is something outside the bit map */
14691             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14692             bool byte_output = FALSE;   /* If something in the bitmap has been
14693                                            output */
14694
14695             if (lv && lv != &PL_sv_undef) {
14696                 if (sw) {
14697                     U8 s[UTF8_MAXBYTES_CASE+1];
14698
14699                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14700                         uvchr_to_utf8(s, i);
14701
14702                         if (i < 256
14703                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14704                                                                things already
14705                                                                output as part
14706                                                                of the bitmap */
14707                             && swash_fetch(sw, s, TRUE))
14708                         {
14709                             if (rangestart == -1)
14710                                 rangestart = i;
14711                         } else if (rangestart != -1) {
14712                             byte_output = TRUE;
14713                             if (i <= rangestart + 3)
14714                                 for (; rangestart < i; rangestart++) {
14715                                     put_byte(sv, rangestart);
14716                                 }
14717                             else {
14718                                 put_byte(sv, rangestart);
14719                                 sv_catpvs(sv, "-");
14720                                 put_byte(sv, i-1);
14721                             }
14722                             rangestart = -1;
14723                         }
14724                     }
14725                 }
14726
14727                 {
14728                     char *s = savesvpv(lv);
14729                     char * const origs = s;
14730
14731                     while (*s && *s != '\n')
14732                         s++;
14733
14734                     if (*s == '\n') {
14735                         const char * const t = ++s;
14736
14737                         if (byte_output) {
14738                             sv_catpvs(sv, " ");
14739                         }
14740
14741                         while (*s) {
14742                             if (*s == '\n') {
14743
14744                                 /* Truncate very long output */
14745                                 if (s - origs > 256) {
14746                                     Perl_sv_catpvf(aTHX_ sv,
14747                                                    "%.*s...",
14748                                                    (int) (s - origs - 1),
14749                                                    t);
14750                                     goto out_dump;
14751                                 }
14752                                 *s = ' ';
14753                             }
14754                             else if (*s == '\t') {
14755                                 *s = '-';
14756                             }
14757                             s++;
14758                         }
14759                         if (s[-1] == ' ')
14760                             s[-1] = 0;
14761
14762                         sv_catpv(sv, t);
14763                     }
14764
14765                 out_dump:
14766
14767                     Safefree(origs);
14768                 }
14769                 SvREFCNT_dec_NN(lv);
14770             }
14771         }
14772
14773         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14774     }
14775     else if (k == POSIXD || k == NPOSIXD) {
14776         U8 index = FLAGS(o) * 2;
14777         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14778             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14779         }
14780         else {
14781             sv_catpv(sv, anyofs[index]);
14782         }
14783     }
14784     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14785         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14786 #else
14787     PERL_UNUSED_CONTEXT;
14788     PERL_UNUSED_ARG(sv);
14789     PERL_UNUSED_ARG(o);
14790     PERL_UNUSED_ARG(prog);
14791 #endif  /* DEBUGGING */
14792 }
14793
14794 SV *
14795 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14796 {                               /* Assume that RE_INTUIT is set */
14797     dVAR;
14798     struct regexp *const prog = ReANY(r);
14799     GET_RE_DEBUG_FLAGS_DECL;
14800
14801     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14802     PERL_UNUSED_CONTEXT;
14803
14804     DEBUG_COMPILE_r(
14805         {
14806             const char * const s = SvPV_nolen_const(prog->check_substr
14807                       ? prog->check_substr : prog->check_utf8);
14808
14809             if (!PL_colorset) reginitcolors();
14810             PerlIO_printf(Perl_debug_log,
14811                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14812                       PL_colors[4],
14813                       prog->check_substr ? "" : "utf8 ",
14814                       PL_colors[5],PL_colors[0],
14815                       s,
14816                       PL_colors[1],
14817                       (strlen(s) > 60 ? "..." : ""));
14818         } );
14819
14820     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14821 }
14822
14823 /*
14824    pregfree()
14825
14826    handles refcounting and freeing the perl core regexp structure. When
14827    it is necessary to actually free the structure the first thing it
14828    does is call the 'free' method of the regexp_engine associated to
14829    the regexp, allowing the handling of the void *pprivate; member
14830    first. (This routine is not overridable by extensions, which is why
14831    the extensions free is called first.)
14832
14833    See regdupe and regdupe_internal if you change anything here.
14834 */
14835 #ifndef PERL_IN_XSUB_RE
14836 void
14837 Perl_pregfree(pTHX_ REGEXP *r)
14838 {
14839     SvREFCNT_dec(r);
14840 }
14841
14842 void
14843 Perl_pregfree2(pTHX_ REGEXP *rx)
14844 {
14845     dVAR;
14846     struct regexp *const r = ReANY(rx);
14847     GET_RE_DEBUG_FLAGS_DECL;
14848
14849     PERL_ARGS_ASSERT_PREGFREE2;
14850
14851     if (r->mother_re) {
14852         ReREFCNT_dec(r->mother_re);
14853     } else {
14854         CALLREGFREE_PVT(rx); /* free the private data */
14855         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14856         Safefree(r->xpv_len_u.xpvlenu_pv);
14857     }
14858     if (r->substrs) {
14859         SvREFCNT_dec(r->anchored_substr);
14860         SvREFCNT_dec(r->anchored_utf8);
14861         SvREFCNT_dec(r->float_substr);
14862         SvREFCNT_dec(r->float_utf8);
14863         Safefree(r->substrs);
14864     }
14865     RX_MATCH_COPY_FREE(rx);
14866 #ifdef PERL_ANY_COW
14867     SvREFCNT_dec(r->saved_copy);
14868 #endif
14869     Safefree(r->offs);
14870     SvREFCNT_dec(r->qr_anoncv);
14871     rx->sv_u.svu_rx = 0;
14872 }
14873
14874 /*  reg_temp_copy()
14875
14876     This is a hacky workaround to the structural issue of match results
14877     being stored in the regexp structure which is in turn stored in
14878     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14879     could be PL_curpm in multiple contexts, and could require multiple
14880     result sets being associated with the pattern simultaneously, such
14881     as when doing a recursive match with (??{$qr})
14882
14883     The solution is to make a lightweight copy of the regexp structure
14884     when a qr// is returned from the code executed by (??{$qr}) this
14885     lightweight copy doesn't actually own any of its data except for
14886     the starp/end and the actual regexp structure itself.
14887
14888 */
14889
14890
14891 REGEXP *
14892 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14893 {
14894     struct regexp *ret;
14895     struct regexp *const r = ReANY(rx);
14896     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14897
14898     PERL_ARGS_ASSERT_REG_TEMP_COPY;
14899
14900     if (!ret_x)
14901         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14902     else {
14903         SvOK_off((SV *)ret_x);
14904         if (islv) {
14905             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14906                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
14907                made both spots point to the same regexp body.) */
14908             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14909             assert(!SvPVX(ret_x));
14910             ret_x->sv_u.svu_rx = temp->sv_any;
14911             temp->sv_any = NULL;
14912             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14913             SvREFCNT_dec_NN(temp);
14914             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14915                ing below will not set it. */
14916             SvCUR_set(ret_x, SvCUR(rx));
14917         }
14918     }
14919     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14920        sv_force_normal(sv) is called.  */
14921     SvFAKE_on(ret_x);
14922     ret = ReANY(ret_x);
14923
14924     SvFLAGS(ret_x) |= SvUTF8(rx);
14925     /* We share the same string buffer as the original regexp, on which we
14926        hold a reference count, incremented when mother_re is set below.
14927        The string pointer is copied here, being part of the regexp struct.
14928      */
14929     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14930            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14931     if (r->offs) {
14932         const I32 npar = r->nparens+1;
14933         Newx(ret->offs, npar, regexp_paren_pair);
14934         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14935     }
14936     if (r->substrs) {
14937         Newx(ret->substrs, 1, struct reg_substr_data);
14938         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14939
14940         SvREFCNT_inc_void(ret->anchored_substr);
14941         SvREFCNT_inc_void(ret->anchored_utf8);
14942         SvREFCNT_inc_void(ret->float_substr);
14943         SvREFCNT_inc_void(ret->float_utf8);
14944
14945         /* check_substr and check_utf8, if non-NULL, point to either their
14946            anchored or float namesakes, and don't hold a second reference.  */
14947     }
14948     RX_MATCH_COPIED_off(ret_x);
14949 #ifdef PERL_ANY_COW
14950     ret->saved_copy = NULL;
14951 #endif
14952     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14953     SvREFCNT_inc_void(ret->qr_anoncv);
14954
14955     return ret_x;
14956 }
14957 #endif
14958
14959 /* regfree_internal()
14960
14961    Free the private data in a regexp. This is overloadable by
14962    extensions. Perl takes care of the regexp structure in pregfree(),
14963    this covers the *pprivate pointer which technically perl doesn't
14964    know about, however of course we have to handle the
14965    regexp_internal structure when no extension is in use.
14966
14967    Note this is called before freeing anything in the regexp
14968    structure.
14969  */
14970
14971 void
14972 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14973 {
14974     dVAR;
14975     struct regexp *const r = ReANY(rx);
14976     RXi_GET_DECL(r,ri);
14977     GET_RE_DEBUG_FLAGS_DECL;
14978
14979     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14980
14981     DEBUG_COMPILE_r({
14982         if (!PL_colorset)
14983             reginitcolors();
14984         {
14985             SV *dsv= sv_newmortal();
14986             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14987                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14988             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14989                 PL_colors[4],PL_colors[5],s);
14990         }
14991     });
14992 #ifdef RE_TRACK_PATTERN_OFFSETS
14993     if (ri->u.offsets)
14994         Safefree(ri->u.offsets);             /* 20010421 MJD */
14995 #endif
14996     if (ri->code_blocks) {
14997         int n;
14998         for (n = 0; n < ri->num_code_blocks; n++)
14999             SvREFCNT_dec(ri->code_blocks[n].src_regex);
15000         Safefree(ri->code_blocks);
15001     }
15002
15003     if (ri->data) {
15004         int n = ri->data->count;
15005
15006         while (--n >= 0) {
15007           /* If you add a ->what type here, update the comment in regcomp.h */
15008             switch (ri->data->what[n]) {
15009             case 'a':
15010             case 'r':
15011             case 's':
15012             case 'S':
15013             case 'u':
15014                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15015                 break;
15016             case 'f':
15017                 Safefree(ri->data->data[n]);
15018                 break;
15019             case 'l':
15020             case 'L':
15021                 break;
15022             case 'T':
15023                 { /* Aho Corasick add-on structure for a trie node.
15024                      Used in stclass optimization only */
15025                     U32 refcount;
15026                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15027                     OP_REFCNT_LOCK;
15028                     refcount = --aho->refcount;
15029                     OP_REFCNT_UNLOCK;
15030                     if ( !refcount ) {
15031                         PerlMemShared_free(aho->states);
15032                         PerlMemShared_free(aho->fail);
15033                          /* do this last!!!! */
15034                         PerlMemShared_free(ri->data->data[n]);
15035                         PerlMemShared_free(ri->regstclass);
15036                     }
15037                 }
15038                 break;
15039             case 't':
15040                 {
15041                     /* trie structure. */
15042                     U32 refcount;
15043                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15044                     OP_REFCNT_LOCK;
15045                     refcount = --trie->refcount;
15046                     OP_REFCNT_UNLOCK;
15047                     if ( !refcount ) {
15048                         PerlMemShared_free(trie->charmap);
15049                         PerlMemShared_free(trie->states);
15050                         PerlMemShared_free(trie->trans);
15051                         if (trie->bitmap)
15052                             PerlMemShared_free(trie->bitmap);
15053                         if (trie->jump)
15054                             PerlMemShared_free(trie->jump);
15055                         PerlMemShared_free(trie->wordinfo);
15056                         /* do this last!!!! */
15057                         PerlMemShared_free(ri->data->data[n]);
15058                     }
15059                 }
15060                 break;
15061             default:
15062                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15063             }
15064         }
15065         Safefree(ri->data->what);
15066         Safefree(ri->data);
15067     }
15068
15069     Safefree(ri);
15070 }
15071
15072 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15073 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15074 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
15075
15076 /*
15077    re_dup - duplicate a regexp.
15078
15079    This routine is expected to clone a given regexp structure. It is only
15080    compiled under USE_ITHREADS.
15081
15082    After all of the core data stored in struct regexp is duplicated
15083    the regexp_engine.dupe method is used to copy any private data
15084    stored in the *pprivate pointer. This allows extensions to handle
15085    any duplication it needs to do.
15086
15087    See pregfree() and regfree_internal() if you change anything here.
15088 */
15089 #if defined(USE_ITHREADS)
15090 #ifndef PERL_IN_XSUB_RE
15091 void
15092 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15093 {
15094     dVAR;
15095     I32 npar;
15096     const struct regexp *r = ReANY(sstr);
15097     struct regexp *ret = ReANY(dstr);
15098
15099     PERL_ARGS_ASSERT_RE_DUP_GUTS;
15100
15101     npar = r->nparens+1;
15102     Newx(ret->offs, npar, regexp_paren_pair);
15103     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15104
15105     if (ret->substrs) {
15106         /* Do it this way to avoid reading from *r after the StructCopy().
15107            That way, if any of the sv_dup_inc()s dislodge *r from the L1
15108            cache, it doesn't matter.  */
15109         const bool anchored = r->check_substr
15110             ? r->check_substr == r->anchored_substr
15111             : r->check_utf8 == r->anchored_utf8;
15112         Newx(ret->substrs, 1, struct reg_substr_data);
15113         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15114
15115         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15116         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15117         ret->float_substr = sv_dup_inc(ret->float_substr, param);
15118         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15119
15120         /* check_substr and check_utf8, if non-NULL, point to either their
15121            anchored or float namesakes, and don't hold a second reference.  */
15122
15123         if (ret->check_substr) {
15124             if (anchored) {
15125                 assert(r->check_utf8 == r->anchored_utf8);
15126                 ret->check_substr = ret->anchored_substr;
15127                 ret->check_utf8 = ret->anchored_utf8;
15128             } else {
15129                 assert(r->check_substr == r->float_substr);
15130                 assert(r->check_utf8 == r->float_utf8);
15131                 ret->check_substr = ret->float_substr;
15132                 ret->check_utf8 = ret->float_utf8;
15133             }
15134         } else if (ret->check_utf8) {
15135             if (anchored) {
15136                 ret->check_utf8 = ret->anchored_utf8;
15137             } else {
15138                 ret->check_utf8 = ret->float_utf8;
15139             }
15140         }
15141     }
15142
15143     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15144     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15145
15146     if (ret->pprivate)
15147         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15148
15149     if (RX_MATCH_COPIED(dstr))
15150         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15151     else
15152         ret->subbeg = NULL;
15153 #ifdef PERL_ANY_COW
15154     ret->saved_copy = NULL;
15155 #endif
15156
15157     /* Whether mother_re be set or no, we need to copy the string.  We
15158        cannot refrain from copying it when the storage points directly to
15159        our mother regexp, because that's
15160                1: a buffer in a different thread
15161                2: something we no longer hold a reference on
15162                so we need to copy it locally.  */
15163     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15164     ret->mother_re   = NULL;
15165     ret->gofs = 0;
15166 }
15167 #endif /* PERL_IN_XSUB_RE */
15168
15169 /*
15170    regdupe_internal()
15171
15172    This is the internal complement to regdupe() which is used to copy
15173    the structure pointed to by the *pprivate pointer in the regexp.
15174    This is the core version of the extension overridable cloning hook.
15175    The regexp structure being duplicated will be copied by perl prior
15176    to this and will be provided as the regexp *r argument, however
15177    with the /old/ structures pprivate pointer value. Thus this routine
15178    may override any copying normally done by perl.
15179
15180    It returns a pointer to the new regexp_internal structure.
15181 */
15182
15183 void *
15184 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15185 {
15186     dVAR;
15187     struct regexp *const r = ReANY(rx);
15188     regexp_internal *reti;
15189     int len;
15190     RXi_GET_DECL(r,ri);
15191
15192     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15193
15194     len = ProgLen(ri);
15195
15196     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15197     Copy(ri->program, reti->program, len+1, regnode);
15198
15199     reti->num_code_blocks = ri->num_code_blocks;
15200     if (ri->code_blocks) {
15201         int n;
15202         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15203                 struct reg_code_block);
15204         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15205                 struct reg_code_block);
15206         for (n = 0; n < ri->num_code_blocks; n++)
15207              reti->code_blocks[n].src_regex = (REGEXP*)
15208                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15209     }
15210     else
15211         reti->code_blocks = NULL;
15212
15213     reti->regstclass = NULL;
15214
15215     if (ri->data) {
15216         struct reg_data *d;
15217         const int count = ri->data->count;
15218         int i;
15219
15220         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15221                 char, struct reg_data);
15222         Newx(d->what, count, U8);
15223
15224         d->count = count;
15225         for (i = 0; i < count; i++) {
15226             d->what[i] = ri->data->what[i];
15227             switch (d->what[i]) {
15228                 /* see also regcomp.h and regfree_internal() */
15229             case 'a': /* actually an AV, but the dup function is identical.  */
15230             case 'r':
15231             case 's':
15232             case 'S':
15233             case 'u': /* actually an HV, but the dup function is identical.  */
15234                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15235                 break;
15236             case 'f':
15237                 /* This is cheating. */
15238                 Newx(d->data[i], 1, struct regnode_charclass_class);
15239                 StructCopy(ri->data->data[i], d->data[i],
15240                             struct regnode_charclass_class);
15241                 reti->regstclass = (regnode*)d->data[i];
15242                 break;
15243             case 'T':
15244                 /* Trie stclasses are readonly and can thus be shared
15245                  * without duplication. We free the stclass in pregfree
15246                  * when the corresponding reg_ac_data struct is freed.
15247                  */
15248                 reti->regstclass= ri->regstclass;
15249                 /* Fall through */
15250             case 't':
15251                 OP_REFCNT_LOCK;
15252                 ((reg_trie_data*)ri->data->data[i])->refcount++;
15253                 OP_REFCNT_UNLOCK;
15254                 /* Fall through */
15255             case 'l':
15256             case 'L':
15257                 d->data[i] = ri->data->data[i];
15258                 break;
15259             default:
15260                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15261             }
15262         }
15263
15264         reti->data = d;
15265     }
15266     else
15267         reti->data = NULL;
15268
15269     reti->name_list_idx = ri->name_list_idx;
15270
15271 #ifdef RE_TRACK_PATTERN_OFFSETS
15272     if (ri->u.offsets) {
15273         Newx(reti->u.offsets, 2*len+1, U32);
15274         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15275     }
15276 #else
15277     SetProgLen(reti,len);
15278 #endif
15279
15280     return (void*)reti;
15281 }
15282
15283 #endif    /* USE_ITHREADS */
15284
15285 #ifndef PERL_IN_XSUB_RE
15286
15287 /*
15288  - regnext - dig the "next" pointer out of a node
15289  */
15290 regnode *
15291 Perl_regnext(pTHX_ regnode *p)
15292 {
15293     dVAR;
15294     I32 offset;
15295
15296     if (!p)
15297         return(NULL);
15298
15299     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
15300         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15301     }
15302
15303     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15304     if (offset == 0)
15305         return(NULL);
15306
15307     return(p+offset);
15308 }
15309 #endif
15310
15311 STATIC void
15312 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15313 {
15314     va_list args;
15315     STRLEN l1 = strlen(pat1);
15316     STRLEN l2 = strlen(pat2);
15317     char buf[512];
15318     SV *msv;
15319     const char *message;
15320
15321     PERL_ARGS_ASSERT_RE_CROAK2;
15322
15323     if (l1 > 510)
15324         l1 = 510;
15325     if (l1 + l2 > 510)
15326         l2 = 510 - l1;
15327     Copy(pat1, buf, l1 , char);
15328     Copy(pat2, buf + l1, l2 , char);
15329     buf[l1 + l2] = '\n';
15330     buf[l1 + l2 + 1] = '\0';
15331 #ifdef I_STDARG
15332     /* ANSI variant takes additional second argument */
15333     va_start(args, pat2);
15334 #else
15335     va_start(args);
15336 #endif
15337     msv = vmess(buf, &args);
15338     va_end(args);
15339     message = SvPV_const(msv,l1);
15340     if (l1 > 512)
15341         l1 = 512;
15342     Copy(message, buf, l1 , char);
15343     buf[l1-1] = '\0';                   /* Overwrite \n */
15344     Perl_croak(aTHX_ "%s", buf);
15345 }
15346
15347 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15348
15349 #ifndef PERL_IN_XSUB_RE
15350 void
15351 Perl_save_re_context(pTHX)
15352 {
15353     dVAR;
15354
15355     struct re_save_state *state;
15356
15357     SAVEVPTR(PL_curcop);
15358     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15359
15360     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15361     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15362     SSPUSHUV(SAVEt_RE_STATE);
15363
15364     Copy(&PL_reg_state, state, 1, struct re_save_state);
15365
15366     PL_reg_oldsaved = NULL;
15367     PL_reg_oldsavedlen = 0;
15368     PL_reg_oldsavedoffset = 0;
15369     PL_reg_oldsavedcoffset = 0;
15370     PL_reg_maxiter = 0;
15371     PL_reg_leftiter = 0;
15372     PL_reg_poscache = NULL;
15373     PL_reg_poscache_size = 0;
15374 #ifdef PERL_ANY_COW
15375     PL_nrs = NULL;
15376 #endif
15377
15378     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15379     if (PL_curpm) {
15380         const REGEXP * const rx = PM_GETRE(PL_curpm);
15381         if (rx) {
15382             U32 i;
15383             for (i = 1; i <= RX_NPARENS(rx); i++) {
15384                 char digits[TYPE_CHARS(long)];
15385                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15386                 GV *const *const gvp
15387                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15388
15389                 if (gvp) {
15390                     GV * const gv = *gvp;
15391                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15392                         save_scalar(gv);
15393                 }
15394             }
15395         }
15396     }
15397 }
15398 #endif
15399
15400 #ifdef DEBUGGING
15401
15402 STATIC void
15403 S_put_byte(pTHX_ SV *sv, int c)
15404 {
15405     PERL_ARGS_ASSERT_PUT_BYTE;
15406
15407     /* Our definition of isPRINT() ignores locales, so only bytes that are
15408        not part of UTF-8 are considered printable. I assume that the same
15409        holds for UTF-EBCDIC.
15410        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15411        which Wikipedia says:
15412
15413        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15414        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15415        identical, to the ASCII delete (DEL) or rubout control character. ...
15416        it is typically mapped to hexadecimal code 9F, in order to provide a
15417        unique character mapping in both directions)
15418
15419        So the old condition can be simplified to !isPRINT(c)  */
15420     if (!isPRINT(c)) {
15421         if (c < 256) {
15422             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15423         }
15424         else {
15425             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15426         }
15427     }
15428     else {
15429         const char string = c;
15430         if (c == '-' || c == ']' || c == '\\' || c == '^')
15431             sv_catpvs(sv, "\\");
15432         sv_catpvn(sv, &string, 1);
15433     }
15434 }
15435
15436
15437 #define CLEAR_OPTSTART \
15438     if (optstart) STMT_START { \
15439             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15440             optstart=NULL; \
15441     } STMT_END
15442
15443 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15444
15445 STATIC const regnode *
15446 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15447             const regnode *last, const regnode *plast,
15448             SV* sv, I32 indent, U32 depth)
15449 {
15450     dVAR;
15451     U8 op = PSEUDO;     /* Arbitrary non-END op. */
15452     const regnode *next;
15453     const regnode *optstart= NULL;
15454
15455     RXi_GET_DECL(r,ri);
15456     GET_RE_DEBUG_FLAGS_DECL;
15457
15458     PERL_ARGS_ASSERT_DUMPUNTIL;
15459
15460 #ifdef DEBUG_DUMPUNTIL
15461     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15462         last ? last-start : 0,plast ? plast-start : 0);
15463 #endif
15464
15465     if (plast && plast < last)
15466         last= plast;
15467
15468     while (PL_regkind[op] != END && (!last || node < last)) {
15469         /* While that wasn't END last time... */
15470         NODE_ALIGN(node);
15471         op = OP(node);
15472         if (op == CLOSE || op == WHILEM)
15473             indent--;
15474         next = regnext((regnode *)node);
15475
15476         /* Where, what. */
15477         if (OP(node) == OPTIMIZED) {
15478             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15479                 optstart = node;
15480             else
15481                 goto after_print;
15482         } else
15483             CLEAR_OPTSTART;
15484
15485         regprop(r, sv, node);
15486         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15487                       (int)(2*indent + 1), "", SvPVX_const(sv));
15488
15489         if (OP(node) != OPTIMIZED) {
15490             if (next == NULL)           /* Next ptr. */
15491                 PerlIO_printf(Perl_debug_log, " (0)");
15492             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15493                 PerlIO_printf(Perl_debug_log, " (FAIL)");
15494             else
15495                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15496             (void)PerlIO_putc(Perl_debug_log, '\n');
15497         }
15498
15499       after_print:
15500         if (PL_regkind[(U8)op] == BRANCHJ) {
15501             assert(next);
15502             {
15503                 const regnode *nnode = (OP(next) == LONGJMP
15504                                        ? regnext((regnode *)next)
15505                                        : next);
15506                 if (last && nnode > last)
15507                     nnode = last;
15508                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15509             }
15510         }
15511         else if (PL_regkind[(U8)op] == BRANCH) {
15512             assert(next);
15513             DUMPUNTIL(NEXTOPER(node), next);
15514         }
15515         else if ( PL_regkind[(U8)op]  == TRIE ) {
15516             const regnode *this_trie = node;
15517             const char op = OP(node);
15518             const U32 n = ARG(node);
15519             const reg_ac_data * const ac = op>=AHOCORASICK ?
15520                (reg_ac_data *)ri->data->data[n] :
15521                NULL;
15522             const reg_trie_data * const trie =
15523                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15524 #ifdef DEBUGGING
15525             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15526 #endif
15527             const regnode *nextbranch= NULL;
15528             I32 word_idx;
15529             sv_setpvs(sv, "");
15530             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15531                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15532
15533                 PerlIO_printf(Perl_debug_log, "%*s%s ",
15534                    (int)(2*(indent+3)), "",
15535                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15536                             PL_colors[0], PL_colors[1],
15537                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15538                             PERL_PV_PRETTY_ELLIPSES    |
15539                             PERL_PV_PRETTY_LTGT
15540                             )
15541                             : "???"
15542                 );
15543                 if (trie->jump) {
15544                     U16 dist= trie->jump[word_idx+1];
15545                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15546                                   (UV)((dist ? this_trie + dist : next) - start));
15547                     if (dist) {
15548                         if (!nextbranch)
15549                             nextbranch= this_trie + trie->jump[0];
15550                         DUMPUNTIL(this_trie + dist, nextbranch);
15551                     }
15552                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15553                         nextbranch= regnext((regnode *)nextbranch);
15554                 } else {
15555                     PerlIO_printf(Perl_debug_log, "\n");
15556                 }
15557             }
15558             if (last && next > last)
15559                 node= last;
15560             else
15561                 node= next;
15562         }
15563         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15564             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15565                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15566         }
15567         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15568             assert(next);
15569             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15570         }
15571         else if ( op == PLUS || op == STAR) {
15572             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15573         }
15574         else if (PL_regkind[(U8)op] == ANYOF) {
15575             /* arglen 1 + class block */
15576             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15577                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15578             node = NEXTOPER(node);
15579         }
15580         else if (PL_regkind[(U8)op] == EXACT) {
15581             /* Literal string, where present. */
15582             node += NODE_SZ_STR(node) - 1;
15583             node = NEXTOPER(node);
15584         }
15585         else {
15586             node = NEXTOPER(node);
15587             node += regarglen[(U8)op];
15588         }
15589         if (op == CURLYX || op == OPEN)
15590             indent++;
15591     }
15592     CLEAR_OPTSTART;
15593 #ifdef DEBUG_DUMPUNTIL
15594     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15595 #endif
15596     return node;
15597 }
15598
15599 #endif  /* DEBUGGING */
15600
15601 /*
15602  * Local variables:
15603  * c-indentation-style: bsd
15604  * c-basic-offset: 4
15605  * indent-tabs-mode: nil
15606  * End:
15607  *
15608  * ex: set ts=8 sts=4 sw=4 et:
15609  */