resource/csdk/connectivity/lib/android/glib-master/glib/pcre/pcre_compile.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2010 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_compile(), along with
  42 supporting internal functions that are not used by other modules. */
  43
  44
  45 #ifdef HAVE_CONFIG_H
  46 #include "config.h"
  47 #endif
  48
  49 #define NLBLOCK cd             /* Block containing newline information */
  50 #define PSSTART start_pattern  /* Field containing processed string start */
  51 #define PSEND   end_pattern    /* Field containing processed string end */
  52
  53 #include "pcre_internal.h"
  54
  55
  56 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
  57 also used by pcretest. PCRE_DEBUG is not defined when building a production
  58 library. */
  59
  60 #ifdef PCRE_DEBUG
  61 #include "pcre_printint.src"
  62 #endif
  63
  64
  65 /* Macro for setting individual bits in class bitmaps. */
  66
  67 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
  68
  69 /* Maximum length value to check against when making sure that the integer that
  70 holds the compiled pattern length does not overflow. We make it a bit less than
  71 INT_MAX to allow for adding in group terminating bytes, so that we don't have
  72 to check them every time. */
  73
  74 #define OFLOW_MAX (INT_MAX - 20)
  75
  76
  77 /*************************************************
  78 *      Code parameters and static tables         *
  79 *************************************************/
  80
  81 /* This value specifies the size of stack workspace that is used during the
  82 first pre-compile phase that determines how much memory is required. The regex
  83 is partly compiled into this space, but the compiled parts are discarded as
  84 soon as they can be, so that hopefully there will never be an overrun. The code
  85 does, however, check for an overrun. The largest amount I've seen used is 218,
  86 so this number is very generous.
  87
  88 The same workspace is used during the second, actual compile phase for
  89 remembering forward references to groups so that they can be filled in at the
  90 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
  91 is 4 there is plenty of room. */
  92
  93 #define COMPILE_WORK_SIZE (4096)
  94
  95 /* The overrun tests check for a slightly smaller size so that they detect the
  96 overrun before it actually does run off the end of the data block. */
  97
  98 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
  99
 100
 101 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 102 are simple data values; negative values are for special things like \d and so
 103 on. Zero means further processing is needed (for things like \x), or the escape
 104 is invalid. */
 105
 106 #ifndef EBCDIC
 107
 108 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
 109 in UTF-8 mode. */
 110
 111 static const short int escapes[] = {
 112      0,                       0,
 113      0,                       0,
 114      0,                       0,
 115      0,                       0,
 116      0,                       0,
 117      CHAR_COLON,              CHAR_SEMICOLON,
 118      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
 119      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
 120      CHAR_COMMERCIAL_AT,      -ESC_A,
 121      -ESC_B,                  -ESC_C,
 122      -ESC_D,                  -ESC_E,
 123      0,                       -ESC_G,
 124      -ESC_H,                  0,
 125      0,                       -ESC_K,
 126      0,                       0,
 127      0,                       0,
 128      -ESC_P,                  -ESC_Q,
 129      -ESC_R,                  -ESC_S,
 130      0,                       0,
 131      -ESC_V,                  -ESC_W,
 132      -ESC_X,                  0,
 133      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
 134      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
 135      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
 136      CHAR_GRAVE_ACCENT,       7,
 137      -ESC_b,                  0,
 138      -ESC_d,                  ESC_e,
 139      ESC_f,                   0,
 140      -ESC_h,                  0,
 141      0,                       -ESC_k,
 142      0,                       0,
 143      ESC_n,                   0,
 144      -ESC_p,                  0,
 145      ESC_r,                   -ESC_s,
 146      ESC_tee,                 0,
 147      -ESC_v,                  -ESC_w,
 148      0,                       0,
 149      -ESC_z
 150 };
 151
 152 #else
 153
 154 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
 155
 156 static const short int escapes[] = {
 157 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
 158 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
 159 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
 160 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
 161 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
 162 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
 163 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
 164 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
 165 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
 166 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
 167 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
 168 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
 169 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
 170 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
 171 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
 172 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
 173 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
 174 /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
 175 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
 176 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
 177 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
 178 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
 179 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
 180 };
 181 #endif
 182
 183
 184 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
 185 searched linearly. Put all the names into a single string, in order to reduce
 186 the number of relocations when a shared library is dynamically linked. The
 187 string is built from string macros so that it works in UTF-8 mode on EBCDIC
 188 platforms. */
 189
 190 typedef struct verbitem {
 191   int   len;
 192   int   op;
 193 } verbitem;
 194
 195 static const char verbnames[] =
 196   STRING_ACCEPT0
 197   STRING_COMMIT0
 198   STRING_F0
 199   STRING_FAIL0
 200   STRING_PRUNE0
 201   STRING_SKIP0
 202   STRING_THEN;
 203
 204 static const verbitem verbs[] = {
 205   { 6, OP_ACCEPT },
 206   { 6, OP_COMMIT },
 207   { 1, OP_FAIL },
 208   { 4, OP_FAIL },
 209   { 5, OP_PRUNE },
 210   { 4, OP_SKIP  },
 211   { 4, OP_THEN  }
 212 };
 213
 214 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
 215
 216
 217 /* Tables of names of POSIX character classes and their lengths. The names are
 218 now all in a single string, to reduce the number of relocations when a shared
 219 library is dynamically loaded. The list of lengths is terminated by a zero
 220 length entry. The first three must be alpha, lower, upper, as this is assumed
 221 for handling case independence. */
 222
 223 static const char posix_names[] =
 224   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
 225   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
 226   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
 227   STRING_word0  STRING_xdigit;
 228
 229 static const uschar posix_name_lengths[] = {
 230   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 231
 232 /* Table of class bit maps for each POSIX class. Each class is formed from a
 233 base map, with an optional addition or removal of another map. Then, for some
 234 classes, there is some additional tweaking: for [:blank:] the vertical space
 235 characters are removed, and for [:alpha:] and [:alnum:] the underscore
 236 character is removed. The triples in the table consist of the base map offset,
 237 second map offset or -1 if no second map, and a non-negative value for map
 238 addition or a negative value for map subtraction (if there are two maps). The
 239 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
 240 remove vertical space characters, 2 => remove underscore. */
 241
 242 static const int posix_class_maps[] = {
 243   cbit_word,  cbit_digit, -2,             /* alpha */
 244   cbit_lower, -1,          0,             /* lower */
 245   cbit_upper, -1,          0,             /* upper */
 246   cbit_word,  -1,          2,             /* alnum - word without underscore */
 247   cbit_print, cbit_cntrl,  0,             /* ascii */
 248   cbit_space, -1,          1,             /* blank - a GNU extension */
 249   cbit_cntrl, -1,          0,             /* cntrl */
 250   cbit_digit, -1,          0,             /* digit */
 251   cbit_graph, -1,          0,             /* graph */
 252   cbit_print, -1,          0,             /* print */
 253   cbit_punct, -1,          0,             /* punct */
 254   cbit_space, -1,          0,             /* space */
 255   cbit_word,  -1,          0,             /* word - a Perl extension */
 256   cbit_xdigit,-1,          0              /* xdigit */
 257 };
 258
 259
 260 #define STRING(a)  # a
 261 #define XSTRING(s) STRING(s)
 262
 263 /* The texts of compile-time error messages. These are "char *" because they
 264 are passed to the outside world. Do not ever re-use any error number, because
 265 they are documented. Always add a new error instead. Messages marked DEAD below
 266 are no longer used. This used to be a table of strings, but in order to reduce
 267 the number of relocations needed when a shared library is loaded dynamically,
 268 it is now one long string. We cannot use a table of offsets, because the
 269 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
 270 simply count through to the one we want - this isn't a performance issue
 271 because these strings are used only when there is a compilation error.
 272
 273 Each substring ends with \0 to insert a null character. This includes the final
 274 substring, so that the whole string ends with \0\0, which can be detected when
 275 counting through. */
 276
 277 static const char error_texts[] =
 278   "no error\0"
 279   "\\ at end of pattern\0"
 280   "\\c at end of pattern\0"
 281   "unrecognized character follows \\\0"
 282   "numbers out of order in {} quantifier\0"
 283   /* 5 */
 284   "number too big in {} quantifier\0"
 285   "missing terminating ] for character class\0"
 286   "invalid escape sequence in character class\0"
 287   "range out of order in character class\0"
 288   "nothing to repeat\0"
 289   /* 10 */
 290   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
 291   "internal error: unexpected repeat\0"
 292   "unrecognized character after (? or (?-\0"
 293   "POSIX named classes are supported only within a class\0"
 294   "missing )\0"
 295   /* 15 */
 296   "reference to non-existent subpattern\0"
 297   "erroffset passed as NULL\0"
 298   "unknown option bit(s) set\0"
 299   "missing ) after comment\0"
 300   "parentheses nested too deeply\0"  /** DEAD **/
 301   /* 20 */
 302   "regular expression is too large\0"
 303   "failed to get memory\0"
 304   "unmatched parentheses\0"
 305   "internal error: code overflow\0"
 306   "unrecognized character after (?<\0"
 307   /* 25 */
 308   "lookbehind assertion is not fixed length\0"
 309   "malformed number or name after (?(\0"
 310   "conditional group contains more than two branches\0"
 311   "assertion expected after (?(\0"
 312   "(?R or (?[+-]digits must be followed by )\0"
 313   /* 30 */
 314   "unknown POSIX class name\0"
 315   "POSIX collating elements are not supported\0"
 316   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
 317   "spare error\0"  /** DEAD **/
 318   "character value in \\x{...} sequence is too large\0"
 319   /* 35 */
 320   "invalid condition (?(0)\0"
 321   "\\C not allowed in lookbehind assertion\0"
 322   "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
 323   "number after (?C is > 255\0"
 324   "closing ) for (?C expected\0"
 325   /* 40 */
 326   "recursive call could loop indefinitely\0"
 327   "unrecognized character after (?P\0"
 328   "syntax error in subpattern name (missing terminator)\0"
 329   "two named subpatterns have the same name\0"
 330   "invalid UTF-8 string\0"
 331   /* 45 */
 332   "support for \\P, \\p, and \\X has not been compiled\0"
 333   "malformed \\P or \\p sequence\0"
 334   "unknown property name after \\P or \\p\0"
 335   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
 336   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
 337   /* 50 */
 338   "repeated subpattern is too long\0"    /** DEAD **/
 339   "octal value is greater than \\377 (not in UTF-8 mode)\0"
 340   "internal error: overran compiling workspace\0"
 341   "internal error: previously-checked referenced subpattern not found\0"
 342   "DEFINE group contains more than one branch\0"
 343   /* 55 */
 344   "repeating a DEFINE group is not allowed\0"
 345   "inconsistent NEWLINE options\0"
 346   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
 347   "a numbered reference must not be zero\0"
 348   "(*VERB) with an argument is not supported\0"
 349   /* 60 */
 350   "(*VERB) not recognized\0"
 351   "number is too big\0"
 352   "subpattern name expected\0"
 353   "digit expected after (?+\0"
 354   "] is an invalid data character in JavaScript compatibility mode\0"
 355   /* 65 */
 356   "different names for subpatterns of the same number are not allowed\0";
 357
 358
/* Forward declaration of compile_regex(), to allow mutual recursion: the
function is referred to before the point at which it is defined. */

static BOOL
  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
    int *, int *, branch_chain *, compile_data *, int *);
 364
 365
 366
 367 /*************************************************
 368 *            Find an error text                  *
 369 *************************************************/
 370
 371 /* The error texts are now all in one long string, to save on relocations. As
 372 some of the text is of unknown length, we can't use a table of offsets.
 373 Instead, just count through the strings. This is not a performance issue
 374 because it happens only when there has been a compilation error.
 375
 376 Argument:   the error number
 377 Returns:    pointer to the error string
 378 */
 379
 380 static const char *
 381 find_error_text(int n)
 382 {
 383 const char *s = error_texts;
 384 for (; n > 0; n--)
 385   {
 386   while (*s++ != 0) {};
 387   if (*s == 0) return "Error text not found (please report)";
 388   }
 389 return s;
 390 }
 391
 392
 393 /*************************************************
 394 *            Handle escapes                      *
 395 *************************************************/
 396
 397 /* This function is called when a \ has been encountered. It either returns a
 398 positive value for a simple escape such as \n, or a negative value which
 399 encodes one of the more complicated things such as \d. A backreference to group
 400 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
 401 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
 402 ptr is pointing at the \. On exit, it is on the final character of the escape
 403 sequence.
 404
 405 Arguments:
 406   ptrptr         points to the pattern position pointer
 407   errorcodeptr   points to the errorcode variable
 408   bracount       number of previous extracting brackets
 409   options        the options bits
 410   isclass        TRUE if inside a character class
 411
 412 Returns:         zero or positive => a data character
 413                  negative => a special escape sequence
 414                  on error, errorcodeptr is set
 415 */
 416
 417 static int
 418 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
 419   int options, BOOL isclass)
 420 {
 421 BOOL utf8 = (options & PCRE_UTF8) != 0;
 422 const uschar *ptr = *ptrptr + 1;
 423 int c, i;
 424
 425 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 426 ptr--;                            /* Set pointer back to the last byte */
 427
 428 /* If backslash is at the end of the pattern, it's an error. */
 429
 430 if (c == 0) *errorcodeptr = ERR1;
 431
 432 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
 433 in a table. A non-zero result is something that can be returned immediately.
 434 Otherwise further processing may be required. */
 435
 436 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 437 else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
 438 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
 439
 440 #else           /* EBCDIC coding */
 441 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
 442 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 443 #endif
 444
 445 /* Escapes that need further processing, or are illegal. */
 446
 447 else
 448   {
 449   const uschar *oldptr;
 450   BOOL braced, negated;
 451
 452   switch (c)
 453     {
 454     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 455     error. */
 456
 457     case CHAR_l:
 458     case CHAR_L:
 459     case CHAR_N:
 460     case CHAR_u:
 461     case CHAR_U:
 462     *errorcodeptr = ERR37;
 463     break;
 464
 465     /* \g must be followed by one of a number of specific things:
 466
 467     (1) A number, either plain or braced. If positive, it is an absolute
 468     backreference. If negative, it is a relative backreference. This is a Perl
 469     5.10 feature.
 470
 471     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
 472     is part of Perl's movement towards a unified syntax for back references. As
 473     this is synonymous with \k{name}, we fudge it up by pretending it really
 474     was \k.
 475
 476     (3) For Oniguruma compatibility we also support \g followed by a name or a
 477     number either in angle brackets or in single quotes. However, these are
 478     (possibly recursive) subroutine calls, _not_ backreferences. Just return
 479     the -ESC_g code (cf \k). */
 480
 481     case CHAR_g:
 482     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
 483       {
 484       c = -ESC_g;
 485       break;
 486       }
 487
 488     /* Handle the Perl-compatible cases */
 489
 490     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 491       {
 492       const uschar *p;
 493       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
 494         if (*p != CHAR_MINUS && g_ascii_isdigit(*p) == 0) break;
 495       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
 496         {
 497         c = -ESC_k;
 498         break;
 499         }
 500       braced = TRUE;
 501       ptr++;
 502       }
 503     else braced = FALSE;
 504
 505     if (ptr[1] == CHAR_MINUS)
 506       {
 507       negated = TRUE;
 508       ptr++;
 509       }
 510     else negated = FALSE;
 511
 512     c = 0;
 513     while (g_ascii_isdigit(ptr[1]) != 0)
 514       c = c * 10 + *(++ptr) - CHAR_0;
 515
 516     if (c < 0)   /* Integer overflow */
 517       {
 518       *errorcodeptr = ERR61;
 519       break;
 520       }
 521
 522     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
 523       {
 524       *errorcodeptr = ERR57;
 525       break;
 526       }
 527
 528     if (c == 0)
 529       {
 530       *errorcodeptr = ERR58;
 531       break;
 532       }
 533
 534     if (negated)
 535       {
 536       if (c > bracount)
 537         {
 538         *errorcodeptr = ERR15;
 539         break;
 540         }
 541       c = bracount - (c - 1);
 542       }
 543
 544     c = -(ESC_REF + c);
 545     break;
 546
 547     /* The handling of escape sequences consisting of a string of digits
 548     starting with one that is not zero is not straightforward. By experiment,
 549     the way Perl works seems to be as follows:
 550
 551     Outside a character class, the digits are read as a decimal number. If the
 552     number is less than 10, or if there are that many previous extracting
 553     left brackets, then it is a back reference. Otherwise, up to three octal
 554     digits are read to form an escaped byte. Thus \123 is likely to be octal
 555     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 556     value is greater than 377, the least significant 8 bits are taken. Inside a
 557     character class, \ followed by a digit is always an octal number. */
 558
 559     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
 560     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
 561
 562     if (!isclass)
 563       {
 564       oldptr = ptr;
 565       c -= CHAR_0;
 566       while (g_ascii_isdigit(ptr[1]) != 0)
 567         c = c * 10 + *(++ptr) - CHAR_0;
 568       if (c < 0)    /* Integer overflow */
 569         {
 570         *errorcodeptr = ERR61;
 571         break;
 572         }
 573       if (c < 10 || c <= bracount)
 574         {
 575         c = -(ESC_REF + c);
 576         break;
 577         }
 578       ptr = oldptr;      /* Put the pointer back and fall through */
 579       }
 580
 581     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 582     generates a binary zero byte and treats the digit as a following literal.
 583     Thus we have to pull back the pointer by one. */
 584
 585     if ((c = *ptr) >= CHAR_8)
 586       {
 587       ptr--;
 588       c = 0;
 589       break;
 590       }
 591
 592     /* \0 always starts an octal number, but we may drop through to here with a
 593     larger first octal digit. The original code used just to take the least
 594     significant 8 bits of octal numbers (I think this is what early Perls used
 595     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
 596     than 3 octal digits. */
 597
 598     case CHAR_0:
 599     c -= CHAR_0;
 600     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
 601         c = c * 8 + *(++ptr) - CHAR_0;
 602     if (!utf8 && c > 255) *errorcodeptr = ERR51;
 603     break;
 604
 605     /* \x is complicated. \x{ddd} is a character number which can be greater
 606     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
 607     treated as a data character. */
 608
 609     case CHAR_x:
 610     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 611       {
 612       const uschar *pt = ptr + 2;
 613       int count = 0;
 614
 615       c = 0;
 616       while (g_ascii_isxdigit(*pt) != 0)
 617         {
 618         register int cc = *pt++;
 619         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
 620         count++;
 621
 622 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 623         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 624         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 625 #else           /* EBCDIC coding */
 626         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 627         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 628 #endif
 629         }
 630
 631       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
 632         {
 633         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
 634         ptr = pt;
 635         break;
 636         }
 637
 638       /* If the sequence of hex digits does not end with '}', then we don't
 639       recognize this construct; fall through to the normal \x handling. */
 640       }
 641
 642     /* Read just a single-byte hex-defined char */
 643
 644     c = 0;
 645     while (i++ < 2 && g_ascii_isxdigit(ptr[1]) != 0)
 646       {
 647       int cc;                                  /* Some compilers don't like */
 648       cc = *(++ptr);                           /* ++ in initializers */
 649 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 650       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
 651       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 652 #else           /* EBCDIC coding */
 653       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
 654       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 655 #endif
 656       }
 657     break;
 658
 659     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
 660     This coding is ASCII-specific, but then the whole concept of \cx is
 661     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 662
 663     case CHAR_c:
 664     c = *(++ptr);
 665     if (c == 0)
 666       {
 667       *errorcodeptr = ERR2;
 668       break;
 669       }
 670
 671 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 672     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
 673     c ^= 0x40;
 674 #else           /* EBCDIC coding */
 675     if (c >= CHAR_a && c <= CHAR_z) c += 64;
 676     c ^= 0xC0;
 677 #endif
 678     break;
 679
 680     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 681     other alphanumeric following \ is an error if PCRE_EXTRA was set;
 682     otherwise, for Perl compatibility, it is a literal. This code looks a bit
 683     odd, but there used to be some cases other than the default, and there may
 684     be again in future, so I haven't "optimized" it. */
 685
 686     default:
 687     if ((options & PCRE_EXTRA) != 0) switch(c)
 688       {
 689       default:
 690       *errorcodeptr = ERR3;
 691       break;
 692       }
 693     break;
 694     }
 695   }
 696
 697 *ptrptr = ptr;
 698 return c;
 699 }
 700
 701
 702
 703 #ifdef SUPPORT_UCP
 704 /*************************************************
 705 *               Handle \P and \p                 *
 706 *************************************************/
 707
 708 /* This function is called after \P or \p has been encountered, provided that
 709 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 710 pointing at the P or p. On exit, it is pointing at the final character of the
 711 escape sequence.
 712
 713 Argument:
 714   ptrptr         points to the pattern position pointer
 715   negptr         points to a boolean that is set TRUE for negation else FALSE
 716   dptr           points to an int that is set to the detailed property value
 717   errorcodeptr   points to the error code variable
 718
 719 Returns:         type value from ucp_type_table, or -1 for an invalid type
 720 */
 721
 722 static int
 723 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
 724 {
 725 int c, i, bot, top;
 726 const uschar *ptr = *ptrptr;
 727 char name[32];
 728
 729 c = *(++ptr);
 730 if (c == 0) goto ERROR_RETURN;
 731
 732 *negptr = FALSE;
 733
 734 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 735 negation. */
 736
 737 if (c == CHAR_LEFT_CURLY_BRACKET)
 738   {
 739   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
 740     {
 741     *negptr = TRUE;
 742     ptr++;
 743     }
 744   for (i = 0; i < (int)sizeof(name) - 1; i++)
 745     {
 746     c = *(++ptr);
 747     if (c == 0) goto ERROR_RETURN;
 748     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
 749     name[i] = c;
 750     }
 751   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
 752   name[i] = 0;
 753   }
 754
 755 /* Otherwise there is just one following character */
 756
 757 else
 758   {
 759   name[0] = c;
 760   name[1] = 0;
 761   }
 762
 763 *ptrptr = ptr;
 764
 765 /* Search for a recognized property name using binary chop */
 766
 767 bot = 0;
 768 top = _pcre_utt_size;
 769
 770 while (bot < top)
 771   {
 772   i = (bot + top) >> 1;
 773   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
 774   if (c == 0)
 775     {
 776     *dptr = _pcre_utt[i].value;
 777     return _pcre_utt[i].type;
 778     }
 779   if (c > 0) bot = i + 1; else top = i;
 780   }
 781
 782 *errorcodeptr = ERR47;
 783 *ptrptr = ptr;
 784 return -1;
 785
 786 ERROR_RETURN:
 787 *errorcodeptr = ERR46;
 788 *ptrptr = ptr;
 789 return -1;
 790 }
 791 #endif
 792
 793
 794
 795
 796 /*************************************************
 797 *            Check for counted repeat            *
 798 *************************************************/
 799
 800 /* This function is called when a '{' is encountered in a place where it might
 801 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 802 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 803 where the ddds are digits.
 804
 805 Arguments:
 806   p         pointer to the first char after '{'
 807
 808 Returns:    TRUE or FALSE
 809 */
 810
 811 static BOOL
 812 is_counted_repeat(const uschar *p)
 813 {
 814 if (g_ascii_isdigit(*p++) == 0) return FALSE;
 815 while (g_ascii_isdigit(*p) != 0) p++;
 816 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
 817
 818 if (*p++ != CHAR_COMMA) return FALSE;
 819 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
 820
 821 if (g_ascii_isdigit(*p++) == 0) return FALSE;
 822 while (g_ascii_isdigit(*p) != 0) p++;
 823
 824 return (*p == CHAR_RIGHT_CURLY_BRACKET);
 825 }
 826
 827
 828
 829 /*************************************************
 830 *         Read repeat counts                     *
 831 *************************************************/
 832
 833 /* Read an item of the form {n,m} and return the values. This is called only
 834 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 835 so the syntax is guaranteed to be correct, but we need to check the values.
 836
 837 Arguments:
 838   p              pointer to first char after '{'
 839   minp           pointer to int for min
 840   maxp           pointer to int for max
 841                  returned as -1 if no max
 842   errorcodeptr   points to error code variable
 843
 844 Returns:         pointer to '}' on success;
 845                  current ptr on error, with errorcodeptr set non-zero
 846 */
 847
 848 static const uschar *
 849 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
 850 {
 851 int min = 0;
 852 int max = -1;
 853
 854 /* Read the minimum value and do a paranoid check: a negative value indicates
 855 an integer overflow. */
 856
 857 while (g_ascii_isdigit(*p) != 0) min = min * 10 + *p++ - CHAR_0;
 858 if (min < 0 || min > 65535)
 859   {
 860   *errorcodeptr = ERR5;
 861   return p;
 862   }
 863
 864 /* Read the maximum value if there is one, and again do a paranoid on its size.
 865 Also, max must not be less than min. */
 866
 867 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
 868   {
 869   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
 870     {
 871     max = 0;
 872     while(g_ascii_isdigit(*p) != 0) max = max * 10 + *p++ - CHAR_0;
 873     if (max < 0 || max > 65535)
 874       {
 875       *errorcodeptr = ERR5;
 876       return p;
 877       }
 878     if (max < min)
 879       {
 880       *errorcodeptr = ERR4;
 881       return p;
 882       }
 883     }
 884   }
 885
 886 /* Fill in the required variables, and pass back the pointer to the terminating
 887 '}'. */
 888
 889 *minp = min;
 890 *maxp = max;
 891 return p;
 892 }
 893
 894
 895
 896 /*************************************************
 897 *  Subroutine for finding forward reference      *
 898 *************************************************/
 899
 900 /* This recursive function is called only from find_parens() below. The
 901 top-level call starts at the beginning of the pattern. All other calls must
 902 start at a parenthesis. It scans along a pattern's text looking for capturing
 903 subpatterns, and counting them. If it finds a named pattern that matches the
 904 name it is given, it returns its number. Alternatively, if the name is NULL, it
 905 returns when it reaches a given numbered subpattern. We know that if (?P< is
 906 encountered, the name will be terminated by '>' because that is checked in the
 907 first pass. Recursion is used to keep track of subpatterns that reset the
 908 capturing group numbers - the (?| feature.
 909
 910 Arguments:
 911   ptrptr       address of the current character pointer (updated)
 912   cd           compile background data
 913   name         name to seek, or NULL if seeking a numbered subpattern
 914   lorn         name length, or subpattern number if name is NULL
 915   xmode        TRUE if we are in /x mode
 916   count        pointer to the current capturing subpattern number (updated)
 917
 918 Returns:       the number of the named subpattern, or -1 if not found
 919 */
 920
 921 static int
 922 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
 923   BOOL xmode, int *count)
 924 {
 925 uschar *ptr = *ptrptr;
 926 int start_count = *count;
 927 int hwm_count = start_count;
 928 BOOL dup_parens = FALSE;
 929
 930 /* If the first character is a parenthesis, check on the type of group we are
 931 dealing with. The very first call may not start with a parenthesis. */
 932
 933 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
 934   {
 935   if (ptr[1] == CHAR_QUESTION_MARK &&
 936       ptr[2] == CHAR_VERTICAL_LINE)
 937     {
 938     ptr += 3;
 939     dup_parens = TRUE;
 940     }
 941
 942   /* Handle a normal, unnamed capturing parenthesis */
 943
 944   else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
 945     {
 946     *count += 1;
 947     if (name == NULL && *count == lorn) return *count;
 948     ptr++;
 949     }
 950
 951   /* Handle a condition. If it is an assertion, just carry on so that it
 952   is processed as normal. If not, skip to the closing parenthesis of the
 953   condition (there can't be any nested parens. */
 954
 955   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
 956     {
 957     ptr += 2;
 958     if (ptr[1] != CHAR_QUESTION_MARK)
 959       {
 960       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
 961       if (*ptr != 0) ptr++;
 962       }
 963     }
 964
 965   /* We have either (? or (* and not a condition */
 966
 967   else
 968     {
 969     ptr += 2;
 970     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
 971
 972     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
 973
 974     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
 975         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
 976       {
 977       int term;
 978       const uschar *thisname;
 979       *count += 1;
 980       if (name == NULL && *count == lorn) return *count;
 981       term = *ptr++;
 982       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
 983       thisname = ptr;
 984       while (*ptr != term) ptr++;
 985       if (name != NULL && lorn == ptr - thisname &&
 986           strncmp((const char *)name, (const char *)thisname, lorn) == 0)
 987         return *count;
 988       term++;
 989       }
 990     }
 991   }
 992
 993 /* Past any initial parenthesis handling, scan for parentheses or vertical
 994 bars. */
 995
 996 for (; *ptr != 0; ptr++)
 997   {
 998   /* Skip over backslashed characters and also entire \Q...\E */
 999
1000   if (*ptr == CHAR_BACKSLASH)
1001     {
1002     if (*(++ptr) == 0) goto FAIL_EXIT;
1003     if (*ptr == CHAR_Q) for (;;)
1004       {
1005       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1006       if (*ptr == 0) goto FAIL_EXIT;
1007       if (*(++ptr) == CHAR_E) break;
1008       }
1009     continue;
1010     }
1011
1012   /* Skip over character classes; this logic must be similar to the way they
1013   are handled for real. If the first character is '^', skip it. Also, if the
1014   first few characters (either before or after ^) are \Q\E or \E we skip them
1015   too. This makes for compatibility with Perl. Note the use of STR macros to
1016   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1017
1018   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1019     {
1020     BOOL negate_class = FALSE;
1021     for (;;)
1022       {
1023       if (ptr[1] == CHAR_BACKSLASH)
1024         {
1025         if (ptr[2] == CHAR_E)
1026           ptr+= 2;
1027         else if (strncmp((const char *)ptr+2,
1028                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
1029           ptr += 4;
1030         else
1031           break;
1032         }
1033       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1034         {
1035         negate_class = TRUE;
1036         ptr++;
1037         }
1038       else break;
1039       }
1040
1041     /* If the next character is ']', it is a data character that must be
1042     skipped, except in JavaScript compatibility mode. */
1043
1044     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1045         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1046       ptr++;
1047
1048     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1049       {
1050       if (*ptr == 0) return -1;
1051       if (*ptr == CHAR_BACKSLASH)
1052         {
1053         if (*(++ptr) == 0) goto FAIL_EXIT;
1054         if (*ptr == CHAR_Q) for (;;)
1055           {
1056           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1057           if (*ptr == 0) goto FAIL_EXIT;
1058           if (*(++ptr) == CHAR_E) break;
1059           }
1060         continue;
1061         }
1062       }
1063     continue;
1064     }
1065
1066   /* Skip comments in /x mode */
1067
1068   if (xmode && *ptr == CHAR_NUMBER_SIGN)
1069     {
1070     while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1071     if (*ptr == 0) goto FAIL_EXIT;
1072     continue;
1073     }
1074
1075   /* Check for the special metacharacters */
1076
1077   if (*ptr == CHAR_LEFT_PARENTHESIS)
1078     {
1079     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1080     if (rc > 0) return rc;
1081     if (*ptr == 0) goto FAIL_EXIT;
1082     }
1083
1084   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1085     {
1086     if (dup_parens && *count < hwm_count) *count = hwm_count;
1087     *ptrptr = ptr;
1088     return -1;
1089     }
1090
1091   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1092     {
1093     if (*count > hwm_count) hwm_count = *count;
1094     *count = start_count;
1095     }
1096   }
1097
1098 FAIL_EXIT:
1099 *ptrptr = ptr;
1100 return -1;
1101 }
1102
1103
1104
1105
1106 /*************************************************
1107 *       Find forward referenced subpattern       *
1108 *************************************************/
1109
1110 /* This function scans along a pattern's text looking for capturing
1111 subpatterns, and counting them. If it finds a named pattern that matches the
1112 name it is given, it returns its number. Alternatively, if the name is NULL, it
1113 returns when it reaches a given numbered subpattern. This is used for forward
1114 references to subpatterns. We used to be able to start this scan from the
1115 current compiling point, using the current count value from cd->bracount, and
1116 do it all in a single loop, but the addition of the possibility of duplicate
1117 subpattern numbers means that we have to scan from the very start, in order to
1118 take account of such duplicates, and to use a recursive function to keep track
1119 of the different types of group.
1120
1121 Arguments:
1122   cd           compile background data
1123   name         name to seek, or NULL if seeking a numbered subpattern
1124   lorn         name length, or subpattern number if name is NULL
1125   xmode        TRUE if we are in /x mode
1126
1127 Returns:       the number of the found subpattern, or -1 if not found
1128 */
1129
1130 static int
1131 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1132 {
1133 uschar *ptr = (uschar *)cd->start_pattern;
1134 int count = 0;
1135 int rc;
1136
1137 /* If the pattern does not start with an opening parenthesis, the first call
1138 to find_parens_sub() will scan right to the end (if necessary). However, if it
1139 does start with a parenthesis, find_parens_sub() will return when it hits the
1140 matching closing parens. That is why we have to have a loop. */
1141
1142 for (;;)
1143   {
1144   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1145   if (rc > 0 || *ptr++ == 0) break;
1146   }
1147
1148 return rc;
1149 }
1150
1151
1152
1153
1154 /*************************************************
1155 *      Find first significant op code            *
1156 *************************************************/
1157
1158 /* This is called by several functions that scan a compiled expression looking
1159 for a fixed first character, or an anchoring op code etc. It skips over things
1160 that do not influence this. For some calls, a change of option is important.
1161 For some calls, it makes sense to skip negative forward and all backward
1162 assertions, and also the \b assertion; for others it does not.
1163
1164 Arguments:
1165   code         pointer to the start of the group
1166   options      pointer to external options
1167   optbit       the option bit whose changing is significant, or
1168                  zero if none are
1169   skipassert   TRUE if certain assertions are to be skipped
1170
1171 Returns:       pointer to the first significant opcode
1172 */
1173
1174 static const uschar*
1175 first_significant_code(const uschar *code, int *options, int optbit,
1176   BOOL skipassert)
1177 {
1178 for (;;)
1179   {
1180   switch ((int)*code)
1181     {
1182     case OP_OPT:
1183     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1184       *options = (int)code[1];
1185     code += 2;
1186     break;
1187
1188     case OP_ASSERT_NOT:
1189     case OP_ASSERTBACK:
1190     case OP_ASSERTBACK_NOT:
1191     if (!skipassert) return code;
1192     do code += GET(code, 1); while (*code == OP_ALT);
1193     code += _pcre_OP_lengths[*code];
1194     break;
1195
1196     case OP_WORD_BOUNDARY:
1197     case OP_NOT_WORD_BOUNDARY:
1198     if (!skipassert) return code;
1199     /* Fall through */
1200
1201     case OP_CALLOUT:
1202     case OP_CREF:
1203     case OP_NCREF:
1204     case OP_RREF:
1205     case OP_NRREF:
1206     case OP_DEF:
1207     code += _pcre_OP_lengths[*code];
1208     break;
1209
1210     default:
1211     return code;
1212     }
1213   }
1214 /* Control never reaches here */
1215 }
1216
1217
1218
1219
1220 /*************************************************
1221 *        Find the fixed length of a branch       *
1222 *************************************************/
1223
1224 /* Scan a branch and compute the fixed length of subject that will match it,
1225 if the length is fixed. This is needed for dealing with backward assertions.
1226 In UTF8 mode, the result is in characters rather than bytes. The branch is
1227 temporarily terminated with OP_END when this function is called.
1228
1229 This function is called when a backward assertion is encountered, so that if it
1230 fails, the error message can point to the correct place in the pattern.
1231 However, we cannot do this when the assertion contains subroutine calls,
1232 because they can be forward references. We solve this by remembering this case
1233 and doing the check at the end; a flag specifies which mode we are running in.
1234
1235 Arguments:
1236   code     points to the start of the pattern (the bracket)
1237   options  the compiling options
1238   atend    TRUE if called when the pattern is complete
1239   cd       the "compile data" structure
1240
1241 Returns:   the fixed length,
1242              or -1 if there is no fixed length,
1243              or -2 if \C was encountered
1244              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1245 */
1246
1247 static int
1248 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1249 {
1250 int length = -1;
1251
1252 register int branchlength = 0;
1253 register uschar *cc = code + 1 + LINK_SIZE;
1254
1255 /* Scan along the opcodes for this branch. If we get to the end of the
1256 branch, check the length against that of the other branches. */
1257
1258 for (;;)
1259   {
1260   int d;
1261   uschar *ce, *cs;
1262   register int op = *cc;
1263   switch (op)
1264     {
1265     case OP_CBRA:
1266     case OP_BRA:
1267     case OP_ONCE:
1268     case OP_COND:
1269     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1270     if (d < 0) return d;
1271     branchlength += d;
1272     do cc += GET(cc, 1); while (*cc == OP_ALT);
1273     cc += 1 + LINK_SIZE;
1274     break;
1275
1276     /* Reached end of a branch; if it's a ket it is the end of a nested
1277     call. If it's ALT it is an alternation in a nested call. If it is
1278     END it's the end of the outer call. All can be handled by the same code. */
1279
1280     case OP_ALT:
1281     case OP_KET:
1282     case OP_KETRMAX:
1283     case OP_KETRMIN:
1284     case OP_END:
1285     if (length < 0) length = branchlength;
1286       else if (length != branchlength) return -1;
1287     if (*cc != OP_ALT) return length;
1288     cc += 1 + LINK_SIZE;
1289     branchlength = 0;
1290     break;
1291
1292     /* A true recursion implies not fixed length, but a subroutine call may
1293     be OK. If the subroutine is a forward reference, we can't deal with
1294     it until the end of the pattern, so return -3. */
1295
1296     case OP_RECURSE:
1297     if (!atend) return -3;
1298     cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1299     do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1300     if (cc > cs && cc < ce) return -1;                /* Recursion */
1301     d = find_fixedlength(cs + 2, options, atend, cd);
1302     if (d < 0) return d;
1303     branchlength += d;
1304     cc += 1 + LINK_SIZE;
1305     break;
1306
1307     /* Skip over assertive subpatterns */
1308
1309     case OP_ASSERT:
1310     case OP_ASSERT_NOT:
1311     case OP_ASSERTBACK:
1312     case OP_ASSERTBACK_NOT:
1313     do cc += GET(cc, 1); while (*cc == OP_ALT);
1314     /* Fall through */
1315
1316     /* Skip over things that don't match chars */
1317
1318     case OP_REVERSE:
1319     case OP_CREF:
1320     case OP_NCREF:
1321     case OP_RREF:
1322     case OP_NRREF:
1323     case OP_DEF:
1324     case OP_OPT:
1325     case OP_CALLOUT:
1326     case OP_SOD:
1327     case OP_SOM:
1328     case OP_SET_SOM:
1329     case OP_EOD:
1330     case OP_EODN:
1331     case OP_CIRC:
1332     case OP_DOLL:
1333     case OP_NOT_WORD_BOUNDARY:
1334     case OP_WORD_BOUNDARY:
1335     cc += _pcre_OP_lengths[*cc];
1336     break;
1337
1338     /* Handle literal characters */
1339
1340     case OP_CHAR:
1341     case OP_CHARNC:
1342     case OP_NOT:
1343     branchlength++;
1344     cc += 2;
1345 #ifdef SUPPORT_UTF8
1346     if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1347       cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1348 #endif
1349     break;
1350
1351     /* Handle exact repetitions. The count is already in characters, but we
1352     need to skip over a multibyte character in UTF8 mode.  */
1353
1354     case OP_EXACT:
1355     branchlength += GET2(cc,1);
1356     cc += 4;
1357 #ifdef SUPPORT_UTF8
1358     if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1359       cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1360 #endif
1361     break;
1362
1363     case OP_TYPEEXACT:
1364     branchlength += GET2(cc,1);
1365     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1366     cc += 4;
1367     break;
1368
1369     /* Handle single-char matchers */
1370
1371     case OP_PROP:
1372     case OP_NOTPROP:
1373     cc += 2;
1374     /* Fall through */
1375
1376     case OP_NOT_DIGIT:
1377     case OP_DIGIT:
1378     case OP_NOT_WHITESPACE:
1379     case OP_WHITESPACE:
1380     case OP_NOT_WORDCHAR:
1381     case OP_WORDCHAR:
1382     case OP_ANY:
1383     case OP_ALLANY:
1384     branchlength++;
1385     cc++;
1386     break;
1387
1388     /* The single-byte matcher isn't allowed */
1389
1390     case OP_ANYBYTE:
1391     return -2;
1392
1393     /* Check a class for variable quantification */
1394
1395 #ifdef SUPPORT_UTF8
1396     case OP_XCLASS:
1397     cc += GET(cc, 1) - 33;
1398     /* Fall through */
1399 #endif
1400
1401     case OP_CLASS:
1402     case OP_NCLASS:
1403     cc += 33;
1404
1405     switch (*cc)
1406       {
1407       case OP_CRSTAR:
1408       case OP_CRMINSTAR:
1409       case OP_CRQUERY:
1410       case OP_CRMINQUERY:
1411       return -1;
1412
1413       case OP_CRRANGE:
1414       case OP_CRMINRANGE:
1415       if (GET2(cc,1) != GET2(cc,3)) return -1;
1416       branchlength += GET2(cc,1);
1417       cc += 5;
1418       break;
1419
1420       default:
1421       branchlength++;
1422       }
1423     break;
1424
1425     /* Anything else is variable length */
1426
1427     default:
1428     return -1;
1429     }
1430   }
1431 /* Control never gets here */
1432 }
1433
1434
1435
1436
1437 /*************************************************
1438 *    Scan compiled regex for specific bracket    *
1439 *************************************************/
1440
1441 /* This little function scans through a compiled pattern until it finds a
1442 capturing bracket with the given number, or, if the number is negative, an
1443 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1444 so that it can be called from pcre_study() when finding the minimum matching
1445 length.
1446
1447 Arguments:
1448   code        points to start of expression
1449   utf8        TRUE in UTF-8 mode
1450   number      the required bracket number or negative to find a lookbehind
1451
1452 Returns:      pointer to the opcode for the bracket, or NULL if not found
1453 */
1454
1455 const uschar *
1456 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1457 {
1458 for (;;)
1459   {
1460   register int c = *code;
1461   if (c == OP_END) return NULL;
1462
1463   /* XCLASS is used for classes that cannot be represented just by a bit
1464   map. This includes negated single high-valued characters. The length in
1465   the table is zero; the actual length is stored in the compiled code. */
1466
1467   if (c == OP_XCLASS) code += GET(code, 1);
1468
1469   /* Handle recursion */
1470
1471   else if (c == OP_REVERSE)
1472     {
1473     if (number < 0) return (uschar *)code;
1474     code += _pcre_OP_lengths[c];
1475     }
1476
1477   /* Handle capturing bracket */
1478
1479   else if (c == OP_CBRA)
1480     {
1481     int n = GET2(code, 1+LINK_SIZE);
1482     if (n == number) return (uschar *)code;
1483     code += _pcre_OP_lengths[c];
1484     }
1485
1486   /* Otherwise, we can get the item's length from the table, except that for
1487   repeated character types, we have to test for \p and \P, which have an extra
1488   two bytes of parameters. */
1489
1490   else
1491     {
1492     switch(c)
1493       {
1494       case OP_TYPESTAR:
1495       case OP_TYPEMINSTAR:
1496       case OP_TYPEPLUS:
1497       case OP_TYPEMINPLUS:
1498       case OP_TYPEQUERY:
1499       case OP_TYPEMINQUERY:
1500       case OP_TYPEPOSSTAR:
1501       case OP_TYPEPOSPLUS:
1502       case OP_TYPEPOSQUERY:
1503       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1504       break;
1505
1506       case OP_TYPEUPTO:
1507       case OP_TYPEMINUPTO:
1508       case OP_TYPEEXACT:
1509       case OP_TYPEPOSUPTO:
1510       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1511       break;
1512       }
1513
1514     /* Add in the fixed length from the table */
1515
1516     code += _pcre_OP_lengths[c];
1517
1518   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1519   a multi-byte character. The length in the table is a minimum, so we have to
1520   arrange to skip the extra bytes. */
1521
1522 #ifdef SUPPORT_UTF8
1523     if (utf8) switch(c)
1524       {
1525       case OP_CHAR:
1526       case OP_CHARNC:
1527       case OP_EXACT:
1528       case OP_UPTO:
1529       case OP_MINUPTO:
1530       case OP_POSUPTO:
1531       case OP_STAR:
1532       case OP_MINSTAR:
1533       case OP_POSSTAR:
1534       case OP_PLUS:
1535       case OP_MINPLUS:
1536       case OP_POSPLUS:
1537       case OP_QUERY:
1538       case OP_MINQUERY:
1539       case OP_POSQUERY:
1540       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1541       break;
1542       }
1543 #else
1544     (void)(utf8);  /* Keep compiler happy by referencing function argument */
1545 #endif
1546     }
1547   }
1548 }
1549
1550
1551
1552 /*************************************************
1553 *   Scan compiled regex for recursion reference  *
1554 *************************************************/
1555
1556 /* This little function scans through a compiled pattern until it finds an
1557 instance of OP_RECURSE.
1558
1559 Arguments:
1560   code        points to start of expression
1561   utf8        TRUE in UTF-8 mode
1562
1563 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1564 */
1565
1566 static const uschar *
1567 find_recurse(const uschar *code, BOOL utf8)
1568 {
1569 for (;;)
1570   {
1571   register int c = *code;
1572   if (c == OP_END) return NULL;
1573   if (c == OP_RECURSE) return code;
1574
1575   /* XCLASS is used for classes that cannot be represented just by a bit
1576   map. This includes negated single high-valued characters. The length in
1577   the table is zero; the actual length is stored in the compiled code. */
1578
1579   if (c == OP_XCLASS) code += GET(code, 1);
1580
1581   /* Otherwise, we can get the item's length from the table, except that for
1582   repeated character types, we have to test for \p and \P, which have an extra
1583   two bytes of parameters. */
1584
1585   else
1586     {
1587     switch(c)
1588       {
1589       case OP_TYPESTAR:
1590       case OP_TYPEMINSTAR:
1591       case OP_TYPEPLUS:
1592       case OP_TYPEMINPLUS:
1593       case OP_TYPEQUERY:
1594       case OP_TYPEMINQUERY:
1595       case OP_TYPEPOSSTAR:
1596       case OP_TYPEPOSPLUS:
1597       case OP_TYPEPOSQUERY:
1598       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1599       break;
1600
1601       case OP_TYPEPOSUPTO:
1602       case OP_TYPEUPTO:
1603       case OP_TYPEMINUPTO:
1604       case OP_TYPEEXACT:
1605       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1606       break;
1607       }
1608
1609     /* Add in the fixed length from the table */
1610
1611     code += _pcre_OP_lengths[c];
1612
1613     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1614     by a multi-byte character. The length in the table is a minimum, so we have
1615     to arrange to skip the extra bytes. */
1616
1617 #ifdef SUPPORT_UTF8
1618     if (utf8) switch(c)
1619       {
1620       case OP_CHAR:
1621       case OP_CHARNC:
1622       case OP_EXACT:
1623       case OP_UPTO:
1624       case OP_MINUPTO:
1625       case OP_POSUPTO:
1626       case OP_STAR:
1627       case OP_MINSTAR:
1628       case OP_POSSTAR:
1629       case OP_PLUS:
1630       case OP_MINPLUS:
1631       case OP_POSPLUS:
1632       case OP_QUERY:
1633       case OP_MINQUERY:
1634       case OP_POSQUERY:
1635       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1636       break;
1637       }
1638 #else
1639     (void)(utf8);  /* Keep compiler happy by referencing function argument */
1640 #endif
1641     }
1642   }
1643 }
1644
1645
1646
1647 /*************************************************
1648 *    Scan compiled branch for non-emptiness      *
1649 *************************************************/
1650
1651 /* This function scans through a branch of a compiled pattern to see whether it
1652 can match the empty string or not. It is called from could_be_empty()
1653 below and from compile_branch() when checking for an unlimited repeat of a
1654 group that can match nothing. Note that first_significant_code() skips over
1655 backward and negative forward assertions when its final argument is TRUE. If we
1656 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1657 bracket whose current branch will already have been scanned.
1658
1659 Arguments:
1660   code        points to start of search
1661   endcode     points to where to stop
1662   utf8        TRUE if in UTF8 mode
1663   cd          contains pointers to tables etc.
1664
1665 Returns:      TRUE if what is matched could be empty
1666 */
1667
1668 static BOOL
1669 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1670   compile_data *cd)
1671 {
1672 register int c;
1673 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1674      code < endcode;
1675      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1676   {
1677   const uschar *ccode;
1678
1679   c = *code;
1680
1681   /* Skip over forward assertions; the other assertions are skipped by
1682   first_significant_code() with a TRUE final argument. */
1683
1684   if (c == OP_ASSERT)
1685     {
1686     do code += GET(code, 1); while (*code == OP_ALT);
1687     c = *code;
1688     continue;
1689     }
1690
1691   /* Groups with zero repeats can of course be empty; skip them. */
1692
1693   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1694     {
1695     code += _pcre_OP_lengths[c];
1696     do code += GET(code, 1); while (*code == OP_ALT);
1697     c = *code;
1698     continue;
1699     }
1700
1701   /* For a recursion/subroutine call, if its end has been reached, which
1702   implies a subroutine call, we can scan it. */
1703
1704   if (c == OP_RECURSE)
1705     {
1706     BOOL empty_branch = FALSE;
1707     const uschar *scode = cd->start_code + GET(code, 1);
1708     if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
1709     do
1710       {
1711       if (could_be_empty_branch(scode, endcode, utf8, cd))
1712         {
1713         empty_branch = TRUE;
1714         break;
1715         }
1716       scode += GET(scode, 1);
1717       }
1718     while (*scode == OP_ALT);
1719     if (!empty_branch) return FALSE;  /* All branches are non-empty */
1720     continue;
1721     }
1722
1723   /* For other groups, scan the branches. */
1724
1725   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1726     {
1727     BOOL empty_branch;
1728     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1729
1730     /* If a conditional group has only one branch, there is a second, implied,
1731     empty branch, so just skip over the conditional, because it could be empty.
1732     Otherwise, scan the individual branches of the group. */
1733
1734     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1735       code += GET(code, 1);
1736     else
1737       {
1738       empty_branch = FALSE;
1739       do
1740         {
1741         if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1742           empty_branch = TRUE;
1743         code += GET(code, 1);
1744         }
1745       while (*code == OP_ALT);
1746       if (!empty_branch) return FALSE;   /* All branches are non-empty */
1747       }
1748
1749     c = *code;
1750     continue;
1751     }
1752
1753   /* Handle the other opcodes */
1754
1755   switch (c)
1756     {
1757     /* Check for quantifiers after a class. XCLASS is used for classes that
1758     cannot be represented just by a bit map. This includes negated single
1759     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1760     actual length is stored in the compiled code, so we must update "code"
1761     here. */
1762
1763 #ifdef SUPPORT_UTF8
1764     case OP_XCLASS:
1765     ccode = code += GET(code, 1);
1766     goto CHECK_CLASS_REPEAT;
1767 #endif
1768
1769     case OP_CLASS:
1770     case OP_NCLASS:
1771     ccode = code + 33;
1772
1773 #ifdef SUPPORT_UTF8
1774     CHECK_CLASS_REPEAT:
1775 #endif
1776
1777     switch (*ccode)
1778       {
1779       case OP_CRSTAR:            /* These could be empty; continue */
1780       case OP_CRMINSTAR:
1781       case OP_CRQUERY:
1782       case OP_CRMINQUERY:
1783       break;
1784
1785       default:                   /* Non-repeat => class must match */
1786       case OP_CRPLUS:            /* These repeats aren't empty */
1787       case OP_CRMINPLUS:
1788       return FALSE;
1789
1790       case OP_CRRANGE:
1791       case OP_CRMINRANGE:
1792       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1793       break;
1794       }
1795     break;
1796
1797     /* Opcodes that must match a character */
1798
1799     case OP_PROP:
1800     case OP_NOTPROP:
1801     case OP_EXTUNI:
1802     case OP_NOT_DIGIT:
1803     case OP_DIGIT:
1804     case OP_NOT_WHITESPACE:
1805     case OP_WHITESPACE:
1806     case OP_NOT_WORDCHAR:
1807     case OP_WORDCHAR:
1808     case OP_ANY:
1809     case OP_ALLANY:
1810     case OP_ANYBYTE:
1811     case OP_CHAR:
1812     case OP_CHARNC:
1813     case OP_NOT:
1814     case OP_PLUS:
1815     case OP_MINPLUS:
1816     case OP_POSPLUS:
1817     case OP_EXACT:
1818     case OP_NOTPLUS:
1819     case OP_NOTMINPLUS:
1820     case OP_NOTPOSPLUS:
1821     case OP_NOTEXACT:
1822     case OP_TYPEPLUS:
1823     case OP_TYPEMINPLUS:
1824     case OP_TYPEPOSPLUS:
1825     case OP_TYPEEXACT:
1826     return FALSE;
1827
1828     /* These are going to continue, as they may be empty, but we have to
1829     fudge the length for the \p and \P cases. */
1830
1831     case OP_TYPESTAR:
1832     case OP_TYPEMINSTAR:
1833     case OP_TYPEPOSSTAR:
1834     case OP_TYPEQUERY:
1835     case OP_TYPEMINQUERY:
1836     case OP_TYPEPOSQUERY:
1837     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1838     break;
1839
1840     /* Same for these */
1841
1842     case OP_TYPEUPTO:
1843     case OP_TYPEMINUPTO:
1844     case OP_TYPEPOSUPTO:
1845     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1846     break;
1847
1848     /* End of branch */
1849
1850     case OP_KET:
1851     case OP_KETRMAX:
1852     case OP_KETRMIN:
1853     case OP_ALT:
1854     return TRUE;
1855
1856     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1857     MINUPTO, and POSUPTO may be followed by a multibyte character */
1858
1859 #ifdef SUPPORT_UTF8
1860     case OP_STAR:
1861     case OP_MINSTAR:
1862     case OP_POSSTAR:
1863     case OP_QUERY:
1864     case OP_MINQUERY:
1865     case OP_POSQUERY:
1866     if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1867     break;
1868
1869     case OP_UPTO:
1870     case OP_MINUPTO:
1871     case OP_POSUPTO:
1872     if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1873     break;
1874 #endif
1875
1876     /* None of the remaining opcodes are required to match a character. */
1877
1878     default:
1879     break;
1880     }
1881   }
1882
1883 return TRUE;
1884 }
1885
1886
1887
1888 /*************************************************
1889 *    Scan compiled regex for non-emptiness       *
1890 *************************************************/
1891
1892 /* This function is called to check for left recursive calls. We want to check
1893 the current branch of the current pattern to see if it could match the empty
1894 string. If it could, we must look outwards for branches at other levels,
1895 stopping when we pass beyond the bracket which is the subject of the recursion.
1896
1897 Arguments:
1898   code        points to start of the recursion
1899   endcode     points to where to stop (current RECURSE item)
1900   bcptr       points to the chain of current (unclosed) branch starts
1901   utf8        TRUE if in UTF-8 mode
1902   cd          pointers to tables etc
1903
1904 Returns:      TRUE if what is matched could be empty
1905 */
1906
1907 static BOOL
1908 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1909   BOOL utf8, compile_data *cd)
1910 {
1911 while (bcptr != NULL && bcptr->current_branch >= code)
1912   {
1913   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
1914     return FALSE;
1915   bcptr = bcptr->outer;
1916   }
1917 return TRUE;
1918 }
1919
1920
1921
1922 /*************************************************
1923 *           Check for POSIX class syntax         *
1924 *************************************************/
1925
1926 /* This function is called when the sequence "[:" or "[." or "[=" is
1927 encountered in a character class. It checks whether this is followed by a
1928 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1929 reach an unescaped ']' without the special preceding character, return FALSE.
1930
1931 Originally, this function only recognized a sequence of letters between the
1932 terminators, but it seems that Perl recognizes any sequence of characters,
1933 though of course unknown POSIX names are subsequently rejected. Perl gives an
1934 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1935 didn't consider this to be a POSIX class. Likewise for [:1234:].
1936
1937 The problem in trying to be exactly like Perl is in the handling of escapes. We
1938 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1939 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1940 below handles the special case of \], but does not try to do any other escape
1941 processing. This makes it different from Perl for cases such as [:l\ower:]
1942 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1943 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1944 I think.
1945
1946 Arguments:
1947   ptr      pointer to the initial [
1948   endptr   where to return the end pointer
1949
1950 Returns:   TRUE or FALSE
1951 */
1952
1953 static BOOL
1954 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1955 {
1956 int terminator;          /* Don't combine these lines; the Solaris cc */
1957 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1958 for (++ptr; *ptr != 0; ptr++)
1959   {
1960   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
1961     {
1962     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
1963     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
1964       {
1965       *endptr = ptr;
1966       return TRUE;
1967       }
1968     }
1969   }
1970 return FALSE;
1971 }
1972
1973
1974
1975
1976 /*************************************************
1977 *          Check POSIX class name                *
1978 *************************************************/
1979
1980 /* This function is called to check the name given in a POSIX-style class entry
1981 such as [:alnum:].
1982
1983 Arguments:
1984   ptr        points to the first letter
1985   len        the length of the name
1986
1987 Returns:     a value representing the name, or -1 if unknown
1988 */
1989
1990 static int
1991 check_posix_name(const uschar *ptr, int len)
1992 {
1993 const char *pn = posix_names;
1994 register int yield = 0;
1995 while (posix_name_lengths[yield] != 0)
1996   {
1997   if (len == posix_name_lengths[yield] &&
1998     strncmp((const char *)ptr, pn, len) == 0) return yield;
1999   pn += posix_name_lengths[yield] + 1;
2000   yield++;
2001   }
2002 return -1;
2003 }
2004
2005
2006 /*************************************************
2007 *    Adjust OP_RECURSE items in repeated group   *
2008 *************************************************/
2009
2010 /* OP_RECURSE items contain an offset from the start of the regex to the group
2011 that is referenced. This means that groups can be replicated for fixed
2012 repetition simply by copying (because the recursion is allowed to refer to
2013 earlier groups that are outside the current group). However, when a group is
2014 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2015 inserted before it, after it has been compiled. This means that any OP_RECURSE
2016 items within it that refer to the group itself or any contained groups have to
2017 have their offsets adjusted. That one of the jobs of this function. Before it
2018 is called, the partially compiled regex must be temporarily terminated with
2019 OP_END.
2020
2021 This function has been extended with the possibility of forward references for
2022 recursions and subroutine calls. It must also check the list of such references
2023 for the group we are dealing with. If it finds that one of the recursions in
2024 the current group is on this list, it adjusts the offset in the list, not the
2025 value in the reference (which is a group number).
2026
2027 Arguments:
2028   group      points to the start of the group
2029   adjust     the amount by which the group is to be moved
2030   utf8       TRUE in UTF-8 mode
2031   cd         contains pointers to tables etc.
2032   save_hwm   the hwm forward reference pointer at the start of the group
2033
2034 Returns:     nothing
2035 */
2036
2037 static void
2038 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2039   uschar *save_hwm)
2040 {
2041 uschar *ptr = group;
2042
2043 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2044   {
2045   int offset;
2046   uschar *hc;
2047
2048   /* See if this recursion is on the forward reference list. If so, adjust the
2049   reference. */
2050
2051   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2052     {
2053     offset = GET(hc, 0);
2054     if (cd->start_code + offset == ptr + 1)
2055       {
2056       PUT(hc, 0, offset + adjust);
2057       break;
2058       }
2059     }
2060
2061   /* Otherwise, adjust the recursion offset if it's after the start of this
2062   group. */
2063
2064   if (hc >= cd->hwm)
2065     {
2066     offset = GET(ptr, 1);
2067     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2068     }
2069
2070   ptr += 1 + LINK_SIZE;
2071   }
2072 }
2073
2074
2075
2076 /*************************************************
2077 *        Insert an automatic callout point       *
2078 *************************************************/
2079
2080 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2081 callout points before each pattern item.
2082
2083 Arguments:
2084   code           current code pointer
2085   ptr            current pattern pointer
2086   cd             pointers to tables etc
2087
2088 Returns:         new code pointer
2089 */
2090
2091 static uschar *
2092 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2093 {
2094 *code++ = OP_CALLOUT;
2095 *code++ = 255;
2096 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
2097 PUT(code, LINK_SIZE, 0);                /* Default length */
2098 return code + 2*LINK_SIZE;
2099 }
2100
2101
2102
2103 /*************************************************
2104 *         Complete a callout item                *
2105 *************************************************/
2106
2107 /* A callout item contains the length of the next item in the pattern, which
2108 we can't fill in till after we have reached the relevant point. This is used
2109 for both automatic and manual callouts.
2110
2111 Arguments:
2112   previous_callout   points to previous callout item
2113   ptr                current pattern pointer
2114   cd                 pointers to tables etc
2115
2116 Returns:             nothing
2117 */
2118
2119 static void
2120 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2121 {
2122 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2123 PUT(previous_callout, 2 + LINK_SIZE, length);
2124 }
2125
2126
2127
2128 #ifdef SUPPORT_UCP
2129 /*************************************************
2130 *           Get othercase range                  *
2131 *************************************************/
2132
2133 /* This function is passed the start and end of a class range, in UTF-8 mode
2134 with UCP support. It searches up the characters, looking for internal ranges of
2135 characters in the "other" case. Each call returns the next one, updating the
2136 start address.
2137
2138 Arguments:
2139   cptr        points to starting character value; updated
2140   d           end value
2141   ocptr       where to put start of othercase range
2142   odptr       where to put end of othercase range
2143
2144 Yield:        TRUE when range returned; FALSE when no more
2145 */
2146
2147 static BOOL
2148 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2149   unsigned int *odptr)
2150 {
2151 unsigned int c, othercase, next;
2152
2153 for (c = *cptr; c <= d; c++)
2154   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2155
2156 if (c > d) return FALSE;
2157
2158 *ocptr = othercase;
2159 next = othercase + 1;
2160
2161 for (++c; c <= d; c++)
2162   {
2163   if (UCD_OTHERCASE(c) != next) break;
2164   next++;
2165   }
2166
2167 *odptr = next - 1;
2168 *cptr = c;
2169
2170 return TRUE;
2171 }
2172 #endif  /* SUPPORT_UCP */
2173
2174
2175
2176 /*************************************************
2177 *     Check if auto-possessifying is possible    *
2178 *************************************************/
2179
2180 /* This function is called for unlimited repeats of certain items, to see
2181 whether the next thing could possibly match the repeated item. If not, it makes
2182 sense to automatically possessify the repeated item.
2183
2184 Arguments:
2185   op_code       the repeated op code
2186   this          data for this item, depends on the opcode
2187   utf8          TRUE in UTF-8 mode
2188   utf8_char     used for utf8 character bytes, NULL if not relevant
2189   ptr           next character in pattern
2190   options       options bits
2191   cd            contains pointers to tables etc.
2192
2193 Returns:        TRUE if possessifying is wanted
2194 */
2195
2196 static BOOL
2197 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2198   const uschar *ptr, int options, compile_data *cd)
2199 {
2200 int next;
2201
2202 /* Skip whitespace and comments in extended mode */
2203
2204 if ((options & PCRE_EXTENDED) != 0)
2205   {
2206   for (;;)
2207     {
2208     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2209     if (*ptr == CHAR_NUMBER_SIGN)
2210       {
2211       while (*(++ptr) != 0)
2212         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2213       }
2214     else break;
2215     }
2216   }
2217
2218 /* If the next item is one that we can handle, get its value. A non-negative
2219 value is a character, a negative value is an escape value. */
2220
2221 if (*ptr == CHAR_BACKSLASH)
2222   {
2223   int temperrorcode = 0;
2224   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2225   if (temperrorcode != 0) return FALSE;
2226   ptr++;    /* Point after the escape sequence */
2227   }
2228
2229 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2230   {
2231 #ifdef SUPPORT_UTF8
2232   if (utf8) { GETCHARINC(next, ptr); } else
2233 #endif
2234   next = *ptr++;
2235   }
2236
2237 else return FALSE;
2238
2239 /* Skip whitespace and comments in extended mode */
2240
2241 if ((options & PCRE_EXTENDED) != 0)
2242   {
2243   for (;;)
2244     {
2245     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2246     if (*ptr == CHAR_NUMBER_SIGN)
2247       {
2248       while (*(++ptr) != 0)
2249         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2250       }
2251     else break;
2252     }
2253   }
2254
2255 /* If the next thing is itself optional, we have to give up. */
2256
2257 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2258   strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2259     return FALSE;
2260
2261 /* Now compare the next item with the previous opcode. If the previous is a
2262 positive single character match, "item" either contains the character or, if
2263 "item" is greater than 127 in utf8 mode, the character's bytes are in
2264 utf8_char. */
2265
2266
2267 /* Handle cases when the next item is a character. */
2268
2269 if (next >= 0) switch(op_code)
2270   {
2271   case OP_CHAR:
2272 #ifdef SUPPORT_UTF8
2273   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2274 #else
2275   (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
2276 #endif
2277   return item != next;
2278
2279   /* For CHARNC (caseless character) we must check the other case. If we have
2280   Unicode property support, we can use it to test the other case of
2281   high-valued characters. */
2282
2283   case OP_CHARNC:
2284 #ifdef SUPPORT_UTF8
2285   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2286 #endif
2287   if (item == next) return FALSE;
2288 #ifdef SUPPORT_UTF8
2289   if (utf8)
2290     {
2291     unsigned int othercase;
2292     if (next < 128) othercase = cd->fcc[next]; else
2293 #ifdef SUPPORT_UCP
2294     othercase = UCD_OTHERCASE((unsigned int)next);
2295 #else
2296     othercase = NOTACHAR;
2297 #endif
2298     return (unsigned int)item != othercase;
2299     }
2300   else
2301 #endif  /* SUPPORT_UTF8 */
2302   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2303
2304   /* For OP_NOT, "item" must be a single-byte character. */
2305
2306   case OP_NOT:
2307   if (item == next) return TRUE;
2308   if ((options & PCRE_CASELESS) == 0) return FALSE;
2309 #ifdef SUPPORT_UTF8
2310   if (utf8)
2311     {
2312     unsigned int othercase;
2313     if (next < 128) othercase = cd->fcc[next]; else
2314 #ifdef SUPPORT_UCP
2315     othercase = UCD_OTHERCASE(next);
2316 #else
2317     othercase = NOTACHAR;
2318 #endif
2319     return (unsigned int)item == othercase;
2320     }
2321   else
2322 #endif  /* SUPPORT_UTF8 */
2323   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2324
2325   case OP_DIGIT:
2326   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2327
2328   case OP_NOT_DIGIT:
2329   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2330
2331   case OP_WHITESPACE:
2332   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2333
2334   case OP_NOT_WHITESPACE:
2335   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2336
2337   case OP_WORDCHAR:
2338   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2339
2340   case OP_NOT_WORDCHAR:
2341   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2342
2343   case OP_HSPACE:
2344   case OP_NOT_HSPACE:
2345   switch(next)
2346     {
2347     case 0x09:
2348     case 0x20:
2349     case 0xa0:
2350     case 0x1680:
2351     case 0x180e:
2352     case 0x2000:
2353     case 0x2001:
2354     case 0x2002:
2355     case 0x2003:
2356     case 0x2004:
2357     case 0x2005:
2358     case 0x2006:
2359     case 0x2007:
2360     case 0x2008:
2361     case 0x2009:
2362     case 0x200A:
2363     case 0x202f:
2364     case 0x205f:
2365     case 0x3000:
2366     return op_code != OP_HSPACE;
2367     default:
2368     return op_code == OP_HSPACE;
2369     }
2370
2371   case OP_VSPACE:
2372   case OP_NOT_VSPACE:
2373   switch(next)
2374     {
2375     case 0x0a:
2376     case 0x0b:
2377     case 0x0c:
2378     case 0x0d:
2379     case 0x85:
2380     case 0x2028:
2381     case 0x2029:
2382     return op_code != OP_VSPACE;
2383     default:
2384     return op_code == OP_VSPACE;
2385     }
2386
2387   default:
2388   return FALSE;
2389   }
2390
2391
2392 /* Handle the case when the next item is \d, \s, etc. */
2393
2394 switch(op_code)
2395   {
2396   case OP_CHAR:
2397   case OP_CHARNC:
2398 #ifdef SUPPORT_UTF8
2399   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2400 #endif
2401   switch(-next)
2402     {
2403     case ESC_d:
2404     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2405
2406     case ESC_D:
2407     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2408
2409     case ESC_s:
2410     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2411
2412     case ESC_S:
2413     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2414
2415     case ESC_w:
2416     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2417
2418     case ESC_W:
2419     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2420
2421     case ESC_h:
2422     case ESC_H:
2423     switch(item)
2424       {
2425       case 0x09:
2426       case 0x20:
2427       case 0xa0:
2428       case 0x1680:
2429       case 0x180e:
2430       case 0x2000:
2431       case 0x2001:
2432       case 0x2002:
2433       case 0x2003:
2434       case 0x2004:
2435       case 0x2005:
2436       case 0x2006:
2437       case 0x2007:
2438       case 0x2008:
2439       case 0x2009:
2440       case 0x200A:
2441       case 0x202f:
2442       case 0x205f:
2443       case 0x3000:
2444       return -next != ESC_h;
2445       default:
2446       return -next == ESC_h;
2447       }
2448
2449     case ESC_v:
2450     case ESC_V:
2451     switch(item)
2452       {
2453       case 0x0a:
2454       case 0x0b:
2455       case 0x0c:
2456       case 0x0d:
2457       case 0x85:
2458       case 0x2028:
2459       case 0x2029:
2460       return -next != ESC_v;
2461       default:
2462       return -next == ESC_v;
2463       }
2464
2465     default:
2466     return FALSE;
2467     }
2468
2469   case OP_DIGIT:
2470   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2471          next == -ESC_h || next == -ESC_v;
2472
2473   case OP_NOT_DIGIT:
2474   return next == -ESC_d;
2475
2476   case OP_WHITESPACE:
2477   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2478
2479   case OP_NOT_WHITESPACE:
2480   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2481
2482   case OP_HSPACE:
2483   return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2484
2485   case OP_NOT_HSPACE:
2486   return next == -ESC_h;
2487
2488   /* Can't have \S in here because VT matches \S (Perl anomaly) */
2489   case OP_VSPACE:
2490   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2491
2492   case OP_NOT_VSPACE:
2493   return next == -ESC_v;
2494
2495   case OP_WORDCHAR:
2496   return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2497
2498   case OP_NOT_WORDCHAR:
2499   return next == -ESC_w || next == -ESC_d;
2500
2501   default:
2502   return FALSE;
2503   }
2504
2505 /* Control does not reach here */
2506 }
2507
2508
2509
2510 /*************************************************
2511 *           Compile one branch                   *
2512 *************************************************/
2513
2514 /* Scan the pattern, compiling it into a vector. If the options are
2515 changed during the branch, the pointer is used to change the external options
2516 bits. This function is used during the pre-compile phase when we are trying
2517 to find out the amount of memory needed, as well as during the real compile
2518 phase. The value of lengthptr distinguishes the two phases.
2519
2520 Arguments:
2521   optionsptr     pointer to the option bits
2522   codeptr        points to the pointer to the current code point
2523   ptrptr         points to the current pattern pointer
2524   errorcodeptr   points to error code variable
2525   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2526   reqbyteptr     set to the last literal character required, else < 0
2527   bcptr          points to current branch chain
2528   cd             contains pointers to tables etc.
2529   lengthptr      NULL during the real compile phase
2530                  points to length accumulator during pre-compile phase
2531
2532 Returns:         TRUE on success
2533                  FALSE, with *errorcodeptr set non-zero on error
2534 */
2535
2536 static BOOL
2537 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2538   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2539   compile_data *cd, int *lengthptr)
2540 {
2541 int repeat_type, op_type;
2542 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2543 int bravalue = 0;
2544 int greedy_default, greedy_non_default;
2545 int firstbyte, reqbyte;
2546 int zeroreqbyte, zerofirstbyte;
2547 int req_caseopt, reqvary, tempreqvary;
2548 int options = *optionsptr;
2549 int after_manual_callout = 0;
2550 int length_prevgroup = 0;
2551 register int c;
2552 register uschar *code = *codeptr;
2553 uschar *last_code = code;
2554 uschar *orig_code = code;
2555 uschar *tempcode;
2556 BOOL inescq = FALSE;
2557 BOOL groupsetfirstbyte = FALSE;
2558 const uschar *ptr = *ptrptr;
2559 const uschar *tempptr;
2560 uschar *previous = NULL;
2561 uschar *previous_callout = NULL;
2562 uschar *save_hwm = NULL;
2563 uschar classbits[32];
2564
2565 #ifdef SUPPORT_UTF8
2566 BOOL class_utf8;
2567 BOOL utf8 = (options & PCRE_UTF8) != 0;
2568 uschar *class_utf8data;
2569 uschar *class_utf8data_base;
2570 uschar utf8_char[6];
2571 #else
2572 BOOL utf8 = FALSE;
2573 uschar *utf8_char = NULL;
2574 #endif
2575
2576 #ifdef PCRE_DEBUG
2577 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2578 #endif
2579
2580 /* Set up the default and non-default settings for greediness */
2581
2582 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2583 greedy_non_default = greedy_default ^ 1;
2584
2585 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2586 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2587 matches a non-fixed char first char; reqbyte just remains unset if we never
2588 find one.
2589
2590 When we hit a repeat whose minimum is zero, we may have to adjust these values
2591 to take the zero repeat into account. This is implemented by setting them to
2592 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2593 item types that can be repeated set these backoff variables appropriately. */
2594
2595 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2596
2597 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2598 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2599 value > 255. It is added into the firstbyte or reqbyte variables to record the
2600 case status of the value. This is used only for ASCII characters. */
2601
2602 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2603
2604 /* Switch on next character until the end of the branch */
2605
2606 for (;; ptr++)
2607   {
2608   BOOL negate_class;
2609   BOOL should_flip_negation;
2610   BOOL possessive_quantifier;
2611   BOOL is_quantifier;
2612   BOOL is_recurse;
2613   BOOL reset_bracount;
2614   int class_charcount;
2615   int class_lastchar;
2616   int newoptions;
2617   int recno;
2618   int refsign;
2619   int skipbytes;
2620   int subreqbyte;
2621   int subfirstbyte;
2622   int terminator;
2623   int mclength;
2624   uschar mcbuffer[8];
2625
2626   /* Get next byte in the pattern */
2627
2628   c = *ptr;
2629
2630   /* If we are in the pre-compile phase, accumulate the length used for the
2631   previous cycle of this loop. */
2632
2633   if (lengthptr != NULL)
2634     {
2635 #ifdef PCRE_DEBUG
2636     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2637 #endif
2638     if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */
2639       {
2640       *errorcodeptr = ERR52;
2641       goto FAILED;
2642       }
2643
2644     /* There is at least one situation where code goes backwards: this is the
2645     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2646     the class is simply eliminated. However, it is created first, so we have to
2647     allow memory for it. Therefore, don't ever reduce the length at this point.
2648     */
2649
2650     if (code < last_code) code = last_code;
2651
2652     /* Paranoid check for integer overflow */
2653
2654     if (OFLOW_MAX - *lengthptr < code - last_code)
2655       {
2656       *errorcodeptr = ERR20;
2657       goto FAILED;
2658       }
2659
2660     *lengthptr += code - last_code;
2661     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2662
2663     /* If "previous" is set and it is not at the start of the work space, move
2664     it back to there, in order to avoid filling up the work space. Otherwise,
2665     if "previous" is NULL, reset the current code pointer to the start. */
2666
2667     if (previous != NULL)
2668       {
2669       if (previous > orig_code)
2670         {
2671         memmove(orig_code, previous, code - previous);
2672         code -= previous - orig_code;
2673         previous = orig_code;
2674         }
2675       }
2676     else code = orig_code;
2677
2678     /* Remember where this code item starts so we can pick up the length
2679     next time round. */
2680
2681     last_code = code;
2682     }
2683
2684   /* In the real compile phase, just check the workspace used by the forward
2685   reference list. */
2686
2687   else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
2688     {
2689     *errorcodeptr = ERR52;
2690     goto FAILED;
2691     }
2692
2693   /* If in \Q...\E, check for the end; if not, we have a literal */
2694
2695   if (inescq && c != 0)
2696     {
2697     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2698       {
2699       inescq = FALSE;
2700       ptr++;
2701       continue;
2702       }
2703     else
2704       {
2705       if (previous_callout != NULL)
2706         {
2707         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2708           complete_callout(previous_callout, ptr, cd);
2709         previous_callout = NULL;
2710         }
2711       if ((options & PCRE_AUTO_CALLOUT) != 0)
2712         {
2713         previous_callout = code;
2714         code = auto_callout(code, ptr, cd);
2715         }
2716       goto NORMAL_CHAR;
2717       }
2718     }
2719
2720   /* Fill in length of a previous callout, except when the next thing is
2721   a quantifier. */
2722
2723   is_quantifier =
2724     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2725     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2726
2727   if (!is_quantifier && previous_callout != NULL &&
2728        after_manual_callout-- <= 0)
2729     {
2730     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2731       complete_callout(previous_callout, ptr, cd);
2732     previous_callout = NULL;
2733     }
2734
2735   /* In extended mode, skip white space and comments */
2736
2737   if ((options & PCRE_EXTENDED) != 0)
2738     {
2739     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2740     if (c == CHAR_NUMBER_SIGN)
2741       {
2742       while (*(++ptr) != 0)
2743         {
2744         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2745         }
2746       if (*ptr != 0) continue;
2747
2748       /* Else fall through to handle end of string */
2749       c = 0;
2750       }
2751     }
2752
2753   /* No auto callout for quantifiers. */
2754
2755   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2756     {
2757     previous_callout = code;
2758     code = auto_callout(code, ptr, cd);
2759     }
2760
2761   switch(c)
2762     {
2763     /* ===================================================================*/
2764     case 0:                        /* The branch terminates at string end */
2765     case CHAR_VERTICAL_LINE:       /* or | or ) */
2766     case CHAR_RIGHT_PARENTHESIS:
2767     *firstbyteptr = firstbyte;
2768     *reqbyteptr = reqbyte;
2769     *codeptr = code;
2770     *ptrptr = ptr;
2771     if (lengthptr != NULL)
2772       {
2773       if (OFLOW_MAX - *lengthptr < code - last_code)
2774         {
2775         *errorcodeptr = ERR20;
2776         goto FAILED;
2777         }
2778       *lengthptr += code - last_code;   /* To include callout length */
2779       DPRINTF((">> end branch\n"));
2780       }
2781     return TRUE;
2782
2783
2784     /* ===================================================================*/
2785     /* Handle single-character metacharacters. In multiline mode, ^ disables
2786     the setting of any following char as a first character. */
2787
2788     case CHAR_CIRCUMFLEX_ACCENT:
2789     if ((options & PCRE_MULTILINE) != 0)
2790       {
2791       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2792       }
2793     previous = NULL;
2794     *code++ = OP_CIRC;
2795     break;
2796
2797     case CHAR_DOLLAR_SIGN:
2798     previous = NULL;
2799     *code++ = OP_DOLL;
2800     break;
2801
2802     /* There can never be a first char if '.' is first, whatever happens about
2803     repeats. The value of reqbyte doesn't change either. */
2804
2805     case CHAR_DOT:
2806     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2807     zerofirstbyte = firstbyte;
2808     zeroreqbyte = reqbyte;
2809     previous = code;
2810     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2811     break;
2812
2813
2814     /* ===================================================================*/
2815     /* Character classes. If the included characters are all < 256, we build a
2816     32-byte bitmap of the permitted characters, except in the special case
2817     where there is only one such character. For negated classes, we build the
2818     map as usual, then invert it at the end. However, we use a different opcode
2819     so that data characters > 255 can be handled correctly.
2820
2821     If the class contains characters outside the 0-255 range, a different
2822     opcode is compiled. It may optionally have a bit map for characters < 256,
2823     but those above are explicitly listed afterwards. A flag byte tells
2824     whether the bitmap is present, and whether this is a negated class or not.
2825
2826     In JavaScript compatibility mode, an isolated ']' causes an error. In
2827     default (Perl) mode, it is treated as a data character. */
2828
2829     case CHAR_RIGHT_SQUARE_BRACKET:
2830     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2831       {
2832       *errorcodeptr = ERR64;
2833       goto FAILED;
2834       }
2835     goto NORMAL_CHAR;
2836
2837     case CHAR_LEFT_SQUARE_BRACKET:
2838     previous = code;
2839
2840     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2841     they are encountered at the top level, so we'll do that too. */
2842
2843     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2844          ptr[1] == CHAR_EQUALS_SIGN) &&
2845         check_posix_syntax(ptr, &tempptr))
2846       {
2847       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2848       goto FAILED;
2849       }
2850
2851     /* If the first character is '^', set the negation flag and skip it. Also,
2852     if the first few characters (either before or after ^) are \Q\E or \E we
2853     skip them too. This makes for compatibility with Perl. */
2854
2855     negate_class = FALSE;
2856     for (;;)
2857       {
2858       c = *(++ptr);
2859       if (c == CHAR_BACKSLASH)
2860         {
2861         if (ptr[1] == CHAR_E)
2862           ptr++;
2863         else if (strncmp((const char *)ptr+1,
2864                           STR_Q STR_BACKSLASH STR_E, 3) == 0)
2865           ptr += 3;
2866         else
2867           break;
2868         }
2869       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2870         negate_class = TRUE;
2871       else break;
2872       }
2873
2874     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2875     an initial ']' is taken as a data character -- the code below handles
2876     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2877     [^] must match any character, so generate OP_ALLANY. */
2878
2879     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2880         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2881       {
2882       *code++ = negate_class? OP_ALLANY : OP_FAIL;
2883       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2884       zerofirstbyte = firstbyte;
2885       break;
2886       }
2887
2888     /* If a class contains a negative special such as \S, we need to flip the
2889     negation flag at the end, so that support for characters > 255 works
2890     correctly (they are all included in the class). */
2891
2892     should_flip_negation = FALSE;
2893
2894     /* Keep a count of chars with values < 256 so that we can optimize the case
2895     of just a single character (as long as it's < 256). However, for higher
2896     valued UTF-8 characters, we don't yet do any optimization. */
2897
2898     class_charcount = 0;
2899     class_lastchar = -1;
2900
2901     /* Initialize the 32-char bit map to all zeros. We build the map in a
2902     temporary bit of memory, in case the class contains only 1 character (less
2903     than 256), because in that case the compiled code doesn't use the bit map.
2904     */
2905
2906     memset(classbits, 0, 32 * sizeof(uschar));
2907
2908 #ifdef SUPPORT_UTF8
2909     class_utf8 = FALSE;                       /* No chars >= 256 */
2910     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2911     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2912 #endif
2913
2914     /* Process characters until ] is reached. By writing this as a "do" it
2915     means that an initial ] is taken as a data character. At the start of the
2916     loop, c contains the first byte of the character. */
2917
2918     if (c != 0) do
2919       {
2920       const uschar *oldptr;
2921
2922 #ifdef SUPPORT_UTF8
2923       if (utf8 && c > 127)
2924         {                           /* Braces are required because the */
2925         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2926         }
2927
2928       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2929       data and reset the pointer. This is so that very large classes that
2930       contain a zillion UTF-8 characters no longer overwrite the work space
2931       (which is on the stack). */
2932
2933       if (lengthptr != NULL)
2934         {
2935         *lengthptr += class_utf8data - class_utf8data_base;
2936         class_utf8data = class_utf8data_base;
2937         }
2938
2939 #endif
2940
2941       /* Inside \Q...\E everything is literal except \E */
2942
2943       if (inescq)
2944         {
2945         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
2946           {
2947           inescq = FALSE;                   /* Reset literal state */
2948           ptr++;                            /* Skip the 'E' */
2949           continue;                         /* Carry on with next */
2950           }
2951         goto CHECK_RANGE;                   /* Could be range if \E follows */
2952         }
2953
2954       /* Handle POSIX class names. Perl allows a negation extension of the
2955       form [:^name:]. A square bracket that doesn't match the syntax is
2956       treated as a literal. We also recognize the POSIX constructions
2957       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2958       5.6 and 5.8 do. */
2959
2960       if (c == CHAR_LEFT_SQUARE_BRACKET &&
2961           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2962            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
2963         {
2964         BOOL local_negate = FALSE;
2965         int posix_class, taboffset, tabopt;
2966         register const uschar *cbits = cd->cbits;
2967         uschar pbits[32];
2968
2969         if (ptr[1] != CHAR_COLON)
2970           {
2971           *errorcodeptr = ERR31;
2972           goto FAILED;
2973           }
2974
2975         ptr += 2;
2976         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2977           {
2978           local_negate = TRUE;
2979           should_flip_negation = TRUE;  /* Note negative special */
2980           ptr++;
2981           }
2982
2983         posix_class = check_posix_name(ptr, tempptr - ptr);
2984         if (posix_class < 0)
2985           {
2986           *errorcodeptr = ERR30;
2987           goto FAILED;
2988           }
2989
2990         /* If matching is caseless, upper and lower are converted to
2991         alpha. This relies on the fact that the class table starts with
2992         alpha, lower, upper as the first 3 entries. */
2993
2994         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2995           posix_class = 0;
2996
2997         /* We build the bit map for the POSIX class in a chunk of local store
2998         because we may be adding and subtracting from it, and we don't want to
2999         subtract bits that may be in the main map already. At the end we or the
3000         result into the bit map that is being built. */
3001
3002         posix_class *= 3;
3003
3004         /* Copy in the first table (always present) */
3005
3006         memcpy(pbits, cbits + posix_class_maps[posix_class],
3007           32 * sizeof(uschar));
3008
3009         /* If there is a second table, add or remove it as required. */
3010
3011         taboffset = posix_class_maps[posix_class + 1];
3012         tabopt = posix_class_maps[posix_class + 2];
3013
3014         if (taboffset >= 0)
3015           {
3016           if (tabopt >= 0)
3017             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3018           else
3019             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3020           }
3021
3022         /* Now see if we need to remove any special characters. An option
3023         value of 1 removes vertical space and 2 removes underscore. */
3024
3025         if (tabopt < 0) tabopt = -tabopt;
3026         if (tabopt == 1) pbits[1] &= ~0x3c;
3027           else if (tabopt == 2) pbits[11] &= 0x7f;
3028
3029         /* Add the POSIX table or its complement into the main table that is
3030         being built and we are done. */
3031
3032         if (local_negate)
3033           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3034         else
3035           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3036
3037         ptr = tempptr + 1;
3038         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
3039         continue;    /* End of POSIX syntax handling */
3040         }
3041
3042       /* Backslash may introduce a single character, or it may introduce one
3043       of the specials, which just set a flag. The sequence \b is a special
3044       case. Inside a class (and only there) it is treated as backspace.
3045       Elsewhere it marks a word boundary. Other escapes have preset maps ready
3046       to 'or' into the one we are building. We assume they have more than one
3047       character in them, so set class_charcount bigger than one. */
3048
3049       if (c == CHAR_BACKSLASH)
3050         {
3051         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3052         if (*errorcodeptr != 0) goto FAILED;
3053
3054         if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
3055         else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
3056         else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
3057         else if (-c == ESC_Q)            /* Handle start of quoted string */
3058           {
3059           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3060             {
3061             ptr += 2; /* avoid empty string */
3062             }
3063           else inescq = TRUE;
3064           continue;
3065           }
3066         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
3067
3068         if (c < 0)
3069           {
3070           register const uschar *cbits = cd->cbits;
3071           class_charcount += 2;     /* Greater than 1 is what matters */
3072
3073           /* Save time by not doing this in the pre-compile phase. */
3074
3075           if (lengthptr == NULL) switch (-c)
3076             {
3077             case ESC_d:
3078             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3079             continue;
3080
3081             case ESC_D:
3082             should_flip_negation = TRUE;
3083             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3084             continue;
3085
3086             case ESC_w:
3087             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3088             continue;
3089
3090             case ESC_W:
3091             should_flip_negation = TRUE;
3092             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3093             continue;
3094
3095             case ESC_s:
3096             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3097             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
3098             continue;
3099
3100             case ESC_S:
3101             should_flip_negation = TRUE;
3102             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3103             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3104             continue;
3105
3106             default:    /* Not recognized; fall through */
3107             break;      /* Need "default" setting to stop compiler warning. */
3108             }
3109
3110           /* In the pre-compile phase, just do the recognition. */
3111
3112           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3113                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3114
3115           /* We need to deal with \H, \h, \V, and \v in both phases because
3116           they use extra memory. */
3117
3118           if (-c == ESC_h)
3119             {
3120             SETBIT(classbits, 0x09); /* HT */
3121             SETBIT(classbits, 0x20); /* SPACE */
3122             SETBIT(classbits, 0xa0); /* NBSP */
3123 #ifdef SUPPORT_UTF8
3124             if (utf8)
3125               {
3126               class_utf8 = TRUE;
3127               *class_utf8data++ = XCL_SINGLE;
3128               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3129               *class_utf8data++ = XCL_SINGLE;
3130               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3131               *class_utf8data++ = XCL_RANGE;
3132               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3133               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3134               *class_utf8data++ = XCL_SINGLE;
3135               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3136               *class_utf8data++ = XCL_SINGLE;
3137               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3138               *class_utf8data++ = XCL_SINGLE;
3139               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3140               }
3141 #endif
3142             continue;
3143             }
3144
3145           if (-c == ESC_H)
3146             {
3147             for (c = 0; c < 32; c++)
3148               {
3149               int x = 0xff;
3150               switch (c)
3151                 {
3152                 case 0x09/8: x ^= 1 << (0x09%8); break;
3153                 case 0x20/8: x ^= 1 << (0x20%8); break;
3154                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3155                 default: break;
3156                 }
3157               classbits[c] |= x;
3158               }
3159
3160 #ifdef SUPPORT_UTF8
3161             if (utf8)
3162               {
3163               class_utf8 = TRUE;
3164               *class_utf8data++ = XCL_RANGE;
3165               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3166               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3167               *class_utf8data++ = XCL_RANGE;
3168               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3169               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3170               *class_utf8data++ = XCL_RANGE;
3171               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3172               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3173               *class_utf8data++ = XCL_RANGE;
3174               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3175               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3176               *class_utf8data++ = XCL_RANGE;
3177               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3178               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3179               *class_utf8data++ = XCL_RANGE;
3180               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3181               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3182               *class_utf8data++ = XCL_RANGE;
3183               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3184               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3185               }
3186 #endif
3187             continue;
3188             }
3189
3190           if (-c == ESC_v)
3191             {
3192             SETBIT(classbits, 0x0a); /* LF */
3193             SETBIT(classbits, 0x0b); /* VT */
3194             SETBIT(classbits, 0x0c); /* FF */
3195             SETBIT(classbits, 0x0d); /* CR */
3196             SETBIT(classbits, 0x85); /* NEL */
3197 #ifdef SUPPORT_UTF8
3198             if (utf8)
3199               {
3200               class_utf8 = TRUE;
3201               *class_utf8data++ = XCL_RANGE;
3202               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3203               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3204               }
3205 #endif
3206             continue;
3207             }
3208
3209           if (-c == ESC_V)
3210             {
3211             for (c = 0; c < 32; c++)
3212               {
3213               int x = 0xff;
3214               switch (c)
3215                 {
3216                 case 0x0a/8: x ^= 1 << (0x0a%8);
3217                              x ^= 1 << (0x0b%8);
3218                              x ^= 1 << (0x0c%8);
3219                              x ^= 1 << (0x0d%8);
3220                              break;
3221                 case 0x85/8: x ^= 1 << (0x85%8); break;
3222                 default: break;
3223                 }
3224               classbits[c] |= x;
3225               }
3226
3227 #ifdef SUPPORT_UTF8
3228             if (utf8)
3229               {
3230               class_utf8 = TRUE;
3231               *class_utf8data++ = XCL_RANGE;
3232               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3233               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3234               *class_utf8data++ = XCL_RANGE;
3235               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3236               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3237               }
3238 #endif
3239             continue;
3240             }
3241
3242           /* We need to deal with \P and \p in both phases. */
3243
3244 #ifdef SUPPORT_UCP
3245           if (-c == ESC_p || -c == ESC_P)
3246             {
3247             BOOL negated;
3248             int pdata;
3249             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3250             if (ptype < 0) goto FAILED;
3251             class_utf8 = TRUE;
3252             *class_utf8data++ = ((-c == ESC_p) != negated)?
3253               XCL_PROP : XCL_NOTPROP;
3254             *class_utf8data++ = ptype;
3255             *class_utf8data++ = pdata;
3256             class_charcount -= 2;   /* Not a < 256 character */
3257             continue;
3258             }
3259 #endif
3260           /* Unrecognized escapes are faulted if PCRE is running in its
3261           strict mode. By default, for compatibility with Perl, they are
3262           treated as literals. */
3263
3264           if ((options & PCRE_EXTRA) != 0)
3265             {
3266             *errorcodeptr = ERR7;
3267             goto FAILED;
3268             }
3269
3270           class_charcount -= 2;  /* Undo the default count from above */
3271           c = *ptr;              /* Get the final character and fall through */
3272           }
3273
3274         /* Fall through if we have a single character (c >= 0). This may be
3275         greater than 256 in UTF-8 mode. */
3276
3277         }   /* End of backslash handling */
3278
3279       /* A single character may be followed by '-' to form a range. However,
3280       Perl does not permit ']' to be the end of the range. A '-' character
3281       at the end is treated as a literal. Perl ignores orphaned \E sequences
3282       entirely. The code for handling \Q and \E is messy. */
3283
3284       CHECK_RANGE:
3285       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3286         {
3287         inescq = FALSE;
3288         ptr += 2;
3289         }
3290
3291       oldptr = ptr;
3292
3293       /* Remember \r or \n */
3294
3295       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3296
3297       /* Check for range */
3298
3299       if (!inescq && ptr[1] == CHAR_MINUS)
3300         {
3301         int d;
3302         ptr += 2;
3303         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3304
3305         /* If we hit \Q (not followed by \E) at this point, go into escaped
3306         mode. */
3307
3308         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3309           {
3310           ptr += 2;
3311           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3312             { ptr += 2; continue; }
3313           inescq = TRUE;
3314           break;
3315           }
3316
3317         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3318           {
3319           ptr = oldptr;
3320           goto LONE_SINGLE_CHARACTER;
3321           }
3322
3323 #ifdef SUPPORT_UTF8
3324         if (utf8)
3325           {                           /* Braces are required because the */
3326           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3327           }
3328         else
3329 #endif
3330         d = *ptr;  /* Not UTF-8 mode */
3331
3332         /* The second part of a range can be a single-character escape, but
3333         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3334         in such circumstances. */
3335
3336         if (!inescq && d == CHAR_BACKSLASH)
3337           {
3338           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3339           if (*errorcodeptr != 0) goto FAILED;
3340
3341           /* \b is backspace; \X is literal X; \R is literal R; any other
3342           special means the '-' was literal */
3343
3344           if (d < 0)
3345             {
3346             if (d == -ESC_b) d = CHAR_BS;
3347             else if (d == -ESC_X) d = CHAR_X;
3348             else if (d == -ESC_R) d = CHAR_R; else
3349               {
3350               ptr = oldptr;
3351               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3352               }
3353             }
3354           }
3355
3356         /* Check that the two values are in the correct order. Optimize
3357         one-character ranges */
3358
3359         if (d < c)
3360           {
3361           *errorcodeptr = ERR8;
3362           goto FAILED;
3363           }
3364
3365         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3366
3367         /* Remember \r or \n */
3368
3369         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3370
3371         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3372         matching, we have to use an XCLASS with extra data items. Caseless
3373         matching for characters > 127 is available only if UCP support is
3374         available. */
3375
3376 #ifdef SUPPORT_UTF8
3377         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3378           {
3379           class_utf8 = TRUE;
3380
3381           /* With UCP support, we can find the other case equivalents of
3382           the relevant characters. There may be several ranges. Optimize how
3383           they fit with the basic range. */
3384
3385 #ifdef SUPPORT_UCP
3386           if ((options & PCRE_CASELESS) != 0)
3387             {
3388             unsigned int occ, ocd;
3389             unsigned int cc = c;
3390             unsigned int origd = d;
3391             while (get_othercase_range(&cc, origd, &occ, &ocd))
3392               {
3393               if (occ >= (unsigned int)c &&
3394                   ocd <= (unsigned int)d)
3395                 continue;                          /* Skip embedded ranges */
3396
3397               if (occ < (unsigned int)c  &&
3398                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3399                 {                                  /* if there is overlap,   */
3400                 c = occ;                           /* noting that if occ < c */
3401                 continue;                          /* we can't have ocd > d  */
3402                 }                                  /* because a subrange is  */
3403               if (ocd > (unsigned int)d &&
3404                   occ <= (unsigned int)d + 1)      /* always shorter than    */
3405                 {                                  /* the basic range.       */
3406                 d = ocd;
3407                 continue;
3408                 }
3409
3410               if (occ == ocd)
3411                 {
3412                 *class_utf8data++ = XCL_SINGLE;
3413                 }
3414               else
3415                 {
3416                 *class_utf8data++ = XCL_RANGE;
3417                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3418                 }
3419               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3420               }
3421             }
3422 #endif  /* SUPPORT_UCP */
3423
3424           /* Now record the original range, possibly modified for UCP caseless
3425           overlapping ranges. */
3426
3427           *class_utf8data++ = XCL_RANGE;
3428           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3429           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3430
3431           /* With UCP support, we are done. Without UCP support, there is no
3432           caseless matching for UTF-8 characters > 127; we can use the bit map
3433           for the smaller ones. */
3434
3435 #ifdef SUPPORT_UCP
3436           continue;    /* With next character in the class */
3437 #else
3438           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3439
3440           /* Adjust upper limit and fall through to set up the map */
3441
3442           d = 127;
3443
3444 #endif  /* SUPPORT_UCP */
3445           }
3446 #endif  /* SUPPORT_UTF8 */
3447
3448         /* We use the bit map for all cases when not in UTF-8 mode; else
3449         ranges that lie entirely within 0-127 when there is UCP support; else
3450         for partial ranges without UCP support. */
3451
3452         class_charcount += d - c + 1;
3453         class_lastchar = d;
3454
3455         /* We can save a bit of time by skipping this in the pre-compile. */
3456
3457         if (lengthptr == NULL) for (; c <= d; c++)
3458           {
3459           classbits[c/8] |= (1 << (c&7));
3460           if ((options & PCRE_CASELESS) != 0)
3461             {
3462             int uc = cd->fcc[c];           /* flip case */
3463             classbits[uc/8] |= (1 << (uc&7));
3464             }
3465           }
3466
3467         continue;   /* Go get the next char in the class */
3468         }
3469
3470       /* Handle a lone single character - we can get here for a normal
3471       non-escape char, or after \ that introduces a single character or for an
3472       apparent range that isn't. */
3473
3474       LONE_SINGLE_CHARACTER:
3475
3476       /* Handle a character that cannot go in the bit map */
3477
3478 #ifdef SUPPORT_UTF8
3479       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3480         {
3481         class_utf8 = TRUE;
3482         *class_utf8data++ = XCL_SINGLE;
3483         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3484
3485 #ifdef SUPPORT_UCP
3486         if ((options & PCRE_CASELESS) != 0)
3487           {
3488           unsigned int othercase;
3489           if ((othercase = UCD_OTHERCASE(c)) != c)
3490             {
3491             *class_utf8data++ = XCL_SINGLE;
3492             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3493             }
3494           }
3495 #endif  /* SUPPORT_UCP */
3496
3497         }
3498       else
3499 #endif  /* SUPPORT_UTF8 */
3500
3501       /* Handle a single-byte character */
3502         {
3503         classbits[c/8] |= (1 << (c&7));
3504         if ((options & PCRE_CASELESS) != 0)
3505           {
3506           c = cd->fcc[c];   /* flip case */
3507           classbits[c/8] |= (1 << (c&7));
3508           }
3509         class_charcount++;
3510         class_lastchar = c;
3511         }
3512       }
3513
3514     /* Loop until ']' reached. This "while" is the end of the "do" above. */
3515
3516     while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3517
3518     if (c == 0)                          /* Missing terminating ']' */
3519       {
3520       *errorcodeptr = ERR6;
3521       goto FAILED;
3522       }
3523
3524
3525 /* This code has been disabled because it would mean that \s counts as
3526 an explicit \r or \n reference, and that's not really what is wanted. Now
3527 we set the flag only if there is a literal "\r" or "\n" in the class. */
3528
3529 #if 0
3530     /* Remember whether \r or \n are in this class */
3531
3532     if (negate_class)
3533       {
3534       if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3535       }
3536     else
3537       {
3538       if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3539       }
3540 #endif
3541
3542
3543     /* If class_charcount is 1, we saw precisely one character whose value is
3544     less than 256. As long as there were no characters >= 128 and there was no
3545     use of \p or \P, in other words, no use of any XCLASS features, we can
3546     optimize.
3547
3548     In UTF-8 mode, we can optimize the negative case only if there were no
3549     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3550     operate on single-bytes only. This is an historical hangover. Maybe one day
3551     we can tidy these opcodes to handle multi-byte characters.
3552
3553     The optimization throws away the bit map. We turn the item into a
3554     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3555     that OP_NOT does not support multibyte characters. In the positive case, it
3556     can cause firstbyte to be set. Otherwise, there can be no first char if
3557     this item is first, whatever repeat count may follow. In the case of
3558     reqbyte, save the previous value for reinstating. */
3559
3560 #ifdef SUPPORT_UTF8
3561     if (class_charcount == 1 && !class_utf8 &&
3562       (!utf8 || !negate_class || class_lastchar < 128))
3563 #else
3564     if (class_charcount == 1)
3565 #endif
3566       {
3567       zeroreqbyte = reqbyte;
3568
3569       /* The OP_NOT opcode works on one-byte characters only. */
3570
3571       if (negate_class)
3572         {
3573         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3574         zerofirstbyte = firstbyte;
3575         *code++ = OP_NOT;
3576         *code++ = class_lastchar;
3577         break;
3578         }
3579
3580       /* For a single, positive character, get the value into mcbuffer, and
3581       then we can handle this with the normal one-character code. */
3582
3583 #ifdef SUPPORT_UTF8
3584       if (utf8 && class_lastchar > 127)
3585         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3586       else
3587 #endif
3588         {
3589         mcbuffer[0] = class_lastchar;
3590         mclength = 1;
3591         }
3592       goto ONE_CHAR;
3593       }       /* End of 1-char optimization */
3594
3595     /* The general case - not the one-char optimization. If this is the first
3596     thing in the branch, there can be no first char setting, whatever the
3597     repeat count. Any reqbyte setting must remain unchanged after any kind of
3598     repeat. */
3599
3600     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3601     zerofirstbyte = firstbyte;
3602     zeroreqbyte = reqbyte;
3603
3604     /* If there are characters with values > 255, we have to compile an
3605     extended class, with its own opcode, unless there was a negated special
3606     such as \S in the class, because in that case all characters > 255 are in
3607     the class, so any that were explicitly given as well can be ignored. If
3608     (when there are explicit characters > 255 that must be listed) there are no
3609     characters < 256, we can omit the bitmap in the actual compiled code. */
3610
3611 #ifdef SUPPORT_UTF8
3612     if (class_utf8 && !should_flip_negation)
3613       {
3614       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3615       *code++ = OP_XCLASS;
3616       code += LINK_SIZE;
3617       *code = negate_class? XCL_NOT : 0;
3618
3619       /* If the map is required, move up the extra data to make room for it;
3620       otherwise just move the code pointer to the end of the extra data. */
3621
3622       if (class_charcount > 0)
3623         {
3624         *code++ |= XCL_MAP;
3625         memmove(code + 32, code, class_utf8data - code);
3626         memcpy(code, classbits, 32);
3627         code = class_utf8data + 32;
3628         }
3629       else code = class_utf8data;
3630
3631       /* Now fill in the complete length of the item */
3632
3633       PUT(previous, 1, code - previous);
3634       break;   /* End of class handling */
3635       }
3636 #endif
3637
3638     /* If there are no characters > 255, set the opcode to OP_CLASS or
3639     OP_NCLASS, depending on whether the whole class was negated and whether
3640     there were negative specials such as \S in the class. Then copy the 32-byte
3641     map into the code vector, negating it if necessary. */
3642
3643     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3644     if (negate_class)
3645       {
3646       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3647         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3648       }
3649     else
3650       {
3651       memcpy(code, classbits, 32);
3652       }
3653     code += 32;
3654     break;
3655
3656
3657     /* ===================================================================*/
3658     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3659     has been tested above. */
3660
3661     case CHAR_LEFT_CURLY_BRACKET:
3662     if (!is_quantifier) goto NORMAL_CHAR;
3663     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3664     if (*errorcodeptr != 0) goto FAILED;
3665     goto REPEAT;
3666
3667     case CHAR_ASTERISK:
3668     repeat_min = 0;
3669     repeat_max = -1;
3670     goto REPEAT;
3671
3672     case CHAR_PLUS:
3673     repeat_min = 1;
3674     repeat_max = -1;
3675     goto REPEAT;
3676
3677     case CHAR_QUESTION_MARK:
3678     repeat_min = 0;
3679     repeat_max = 1;
3680
3681     REPEAT:
3682     if (previous == NULL)
3683       {
3684       *errorcodeptr = ERR9;
3685       goto FAILED;
3686       }
3687
3688     if (repeat_min == 0)
3689       {
3690       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3691       reqbyte = zeroreqbyte;        /* Ditto */
3692       }
3693
3694     /* Remember whether this is a variable length repeat */
3695
3696     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3697
3698     op_type = 0;                    /* Default single-char op codes */
3699     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3700
3701     /* Save start of previous item, in case we have to move it up to make space
3702     for an inserted OP_ONCE for the additional '+' extension. */
3703
3704     tempcode = previous;
3705
3706     /* If the next character is '+', we have a possessive quantifier. This
3707     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3708     If the next character is '?' this is a minimizing repeat, by default,
3709     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3710     repeat type to the non-default. */
3711
3712     if (ptr[1] == CHAR_PLUS)
3713       {
3714       repeat_type = 0;                  /* Force greedy */
3715       possessive_quantifier = TRUE;
3716       ptr++;
3717       }
3718     else if (ptr[1] == CHAR_QUESTION_MARK)
3719       {
3720       repeat_type = greedy_non_default;
3721       ptr++;
3722       }
3723     else repeat_type = greedy_default;
3724
3725     /* If previous was a character match, abolish the item and generate a
3726     repeat item instead. If a char item has a minimum of more than one, ensure
3727     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3728     the first thing in a branch because the x will have gone into firstbyte
3729     instead.  */
3730
3731     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3732       {
3733       /* Deal with UTF-8 characters that take up more than one byte. It's
3734       easier to write this out separately than try to macrify it. Use c to
3735       hold the length of the character in bytes, plus 0x80 to flag that it's a
3736       length rather than a small character. */
3737
3738 #ifdef SUPPORT_UTF8
3739       if (utf8 && (code[-1] & 0x80) != 0)
3740         {
3741         uschar *lastchar = code - 1;
3742         while((*lastchar & 0xc0) == 0x80) lastchar--;
3743         c = code - lastchar;            /* Length of UTF-8 character */
3744         memcpy(utf8_char, lastchar, c); /* Save the char */
3745         c |= 0x80;                      /* Flag c as a length */
3746         }
3747       else
3748 #endif
3749
3750       /* Handle the case of a single byte - either with no UTF8 support, or
3751       with UTF-8 disabled, or for a UTF-8 character < 128. */
3752
3753         {
3754         c = code[-1];
3755         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3756         }
3757
3758       /* If the repetition is unlimited, it pays to see if the next thing on
3759       the line is something that cannot possibly match this character. If so,
3760       automatically possessifying this item gains some performance in the case
3761       where the match fails. */
3762
3763       if (!possessive_quantifier &&
3764           repeat_max < 0 &&
3765           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3766             options, cd))
3767         {
3768         repeat_type = 0;    /* Force greedy */
3769         possessive_quantifier = TRUE;
3770         }
3771
3772       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3773       }
3774
3775     /* If previous was a single negated character ([^a] or similar), we use
3776     one of the special opcodes, replacing it. The code is shared with single-
3777     character repeats by setting op_type to add a suitable offset into
3778     repeat_type. We can also test for auto-possessification. OP_NOT is
3779     currently used only for single-byte chars. */
3780
3781     else if (*previous == OP_NOT)
3782       {
3783       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3784       c = previous[1];
3785       if (!possessive_quantifier &&
3786           repeat_max < 0 &&
3787           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3788         {
3789         repeat_type = 0;    /* Force greedy */
3790         possessive_quantifier = TRUE;
3791         }
3792       goto OUTPUT_SINGLE_REPEAT;
3793       }
3794
3795     /* If previous was a character type match (\d or similar), abolish it and
3796     create a suitable repeat item. The code is shared with single-character
3797     repeats by setting op_type to add a suitable offset into repeat_type. Note
3798     that the Unicode property types will be present only when SUPPORT_UCP is
3799     defined, but we don't wrap the little bits of code here because it just
3800     makes it horribly messy. */
3801
3802     else if (*previous < OP_EODN)
3803       {
3804       uschar *oldcode;
3805       int prop_type, prop_value;
3806       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3807       c = *previous;
3808
3809       if (!possessive_quantifier &&
3810           repeat_max < 0 &&
3811           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3812         {
3813         repeat_type = 0;    /* Force greedy */
3814         possessive_quantifier = TRUE;
3815         }
3816
3817       OUTPUT_SINGLE_REPEAT:
3818       if (*previous == OP_PROP || *previous == OP_NOTPROP)
3819         {
3820         prop_type = previous[1];
3821         prop_value = previous[2];
3822         }
3823       else prop_type = prop_value = -1;
3824
3825       oldcode = code;
3826       code = previous;                  /* Usually overwrite previous item */
3827
3828       /* If the maximum is zero then the minimum must also be zero; Perl allows
3829       this case, so we do too - by simply omitting the item altogether. */
3830
3831       if (repeat_max == 0) goto END_REPEAT;
3832
3833       /*--------------------------------------------------------------------*/
3834       /* This code is obsolete from release 8.00; the restriction was finally
3835       removed: */
3836
3837       /* All real repeats make it impossible to handle partial matching (maybe
3838       one day we will be able to remove this restriction). */
3839
3840       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3841       /*--------------------------------------------------------------------*/
3842
3843       /* Combine the op_type with the repeat_type */
3844
3845       repeat_type += op_type;
3846
3847       /* A minimum of zero is handled either as the special case * or ?, or as
3848       an UPTO, with the maximum given. */
3849
3850       if (repeat_min == 0)
3851         {
3852         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3853           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3854         else
3855           {
3856           *code++ = OP_UPTO + repeat_type;
3857           PUT2INC(code, 0, repeat_max);
3858           }
3859         }
3860
3861       /* A repeat minimum of 1 is optimized into some special cases. If the
3862       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3863       left in place and, if the maximum is greater than 1, we use OP_UPTO with
3864       one less than the maximum. */
3865
3866       else if (repeat_min == 1)
3867         {
3868         if (repeat_max == -1)
3869           *code++ = OP_PLUS + repeat_type;
3870         else
3871           {
3872           code = oldcode;                 /* leave previous item in place */
3873           if (repeat_max == 1) goto END_REPEAT;
3874           *code++ = OP_UPTO + repeat_type;
3875           PUT2INC(code, 0, repeat_max - 1);
3876           }
3877         }
3878
3879       /* The case {n,n} is just an EXACT, while the general case {n,m} is
3880       handled as an EXACT followed by an UPTO. */
3881
3882       else
3883         {
3884         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3885         PUT2INC(code, 0, repeat_min);
3886
3887         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3888         we have to insert the character for the previous code. For a repeated
3889         Unicode property match, there are two extra bytes that define the
3890         required property. In UTF-8 mode, long characters have their length in
3891         c, with the 0x80 bit as a flag. */
3892
3893         if (repeat_max < 0)
3894           {
3895 #ifdef SUPPORT_UTF8
3896           if (utf8 && c >= 128)
3897             {
3898             memcpy(code, utf8_char, c & 7);
3899             code += c & 7;
3900             }
3901           else
3902 #endif
3903             {
3904             *code++ = c;
3905             if (prop_type >= 0)
3906               {
3907               *code++ = prop_type;
3908               *code++ = prop_value;
3909               }
3910             }
3911           *code++ = OP_STAR + repeat_type;
3912           }
3913
3914         /* Else insert an UPTO if the max is greater than the min, again
3915         preceded by the character, for the previously inserted code. If the
3916         UPTO is just for 1 instance, we can use QUERY instead. */
3917
3918         else if (repeat_max != repeat_min)
3919           {
3920 #ifdef SUPPORT_UTF8
3921           if (utf8 && c >= 128)
3922             {
3923             memcpy(code, utf8_char, c & 7);
3924             code += c & 7;
3925             }
3926           else
3927 #endif
3928           *code++ = c;
3929           if (prop_type >= 0)
3930             {
3931             *code++ = prop_type;
3932             *code++ = prop_value;
3933             }
3934           repeat_max -= repeat_min;
3935
3936           if (repeat_max == 1)
3937             {
3938             *code++ = OP_QUERY + repeat_type;
3939             }
3940           else
3941             {
3942             *code++ = OP_UPTO + repeat_type;
3943             PUT2INC(code, 0, repeat_max);
3944             }
3945           }
3946         }
3947
3948       /* The character or character type itself comes last in all cases. */
3949
3950 #ifdef SUPPORT_UTF8
3951       if (utf8 && c >= 128)
3952         {
3953         memcpy(code, utf8_char, c & 7);
3954         code += c & 7;
3955         }
3956       else
3957 #endif
3958       *code++ = c;
3959
3960       /* For a repeated Unicode property match, there are two extra bytes that
3961       define the required property. */
3962
3963 #ifdef SUPPORT_UCP
3964       if (prop_type >= 0)
3965         {
3966         *code++ = prop_type;
3967         *code++ = prop_value;
3968         }
3969 #endif
3970       }
3971
3972     /* If previous was a character class or a back reference, we put the repeat
3973     stuff after it, but just skip the item if the repeat was {0,0}. */
3974
3975     else if (*previous == OP_CLASS ||
3976              *previous == OP_NCLASS ||
3977 #ifdef SUPPORT_UTF8
3978              *previous == OP_XCLASS ||
3979 #endif
3980              *previous == OP_REF)
3981       {
3982       if (repeat_max == 0)
3983         {
3984         code = previous;
3985         goto END_REPEAT;
3986         }
3987
3988       /*--------------------------------------------------------------------*/
3989       /* This code is obsolete from release 8.00; the restriction was finally
3990       removed: */
3991
3992       /* All real repeats make it impossible to handle partial matching (maybe
3993       one day we will be able to remove this restriction). */
3994
3995       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3996       /*--------------------------------------------------------------------*/
3997
3998       if (repeat_min == 0 && repeat_max == -1)
3999         *code++ = OP_CRSTAR + repeat_type;
4000       else if (repeat_min == 1 && repeat_max == -1)
4001         *code++ = OP_CRPLUS + repeat_type;
4002       else if (repeat_min == 0 && repeat_max == 1)
4003         *code++ = OP_CRQUERY + repeat_type;
4004       else
4005         {
4006         *code++ = OP_CRRANGE + repeat_type;
4007         PUT2INC(code, 0, repeat_min);
4008         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
4009         PUT2INC(code, 0, repeat_max);
4010         }
4011       }
4012
4013     /* If previous was a bracket group, we may have to replicate it in certain
4014     cases. */
4015
4016     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
4017              *previous == OP_ONCE || *previous == OP_COND)
4018       {
4019       register int i;
4020       int ketoffset = 0;
4021       int len = code - previous;
4022       uschar *bralink = NULL;
4023
4024       /* Repeating a DEFINE group is pointless */
4025
4026       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4027         {
4028         *errorcodeptr = ERR55;
4029         goto FAILED;
4030         }
4031
4032       /* If the maximum repeat count is unlimited, find the end of the bracket
4033       by scanning through from the start, and compute the offset back to it
4034       from the current code pointer. There may be an OP_OPT setting following
4035       the final KET, so we can't find the end just by going back from the code
4036       pointer. */
4037
4038       if (repeat_max == -1)
4039         {
4040         register uschar *ket = previous;
4041         do ket += GET(ket, 1); while (*ket != OP_KET);
4042         ketoffset = code - ket;
4043         }
4044
4045       /* The case of a zero minimum is special because of the need to stick
4046       OP_BRAZERO in front of it, and because the group appears once in the
4047       data, whereas in other cases it appears the minimum number of times. For
4048       this reason, it is simplest to treat this case separately, as otherwise
4049       the code gets far too messy. There are several special subcases when the
4050       minimum is zero. */
4051
4052       if (repeat_min == 0)
4053         {
4054         /* If the maximum is also zero, we used to just omit the group from the
4055         output altogether, like this:
4056
4057         ** if (repeat_max == 0)
4058         **   {
4059         **   code = previous;
4060         **   goto END_REPEAT;
4061         **   }
4062
4063         However, that fails when a group is referenced as a subroutine from
4064         elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4065         so that it is skipped on execution. As we don't have a list of which
4066         groups are referenced, we cannot do this selectively.
4067
4068         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4069         and do no more at this point. However, we do need to adjust any
4070         OP_RECURSE calls inside the group that refer to the group itself or any
4071         internal or forward referenced group, because the offset is from the
4072         start of the whole regex. Temporarily terminate the pattern while doing
4073         this. */
4074
4075         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
4076           {
4077           *code = OP_END;
4078           adjust_recurse(previous, 1, utf8, cd, save_hwm);
4079           memmove(previous+1, previous, len);
4080           code++;
4081           if (repeat_max == 0)
4082             {
4083             *previous++ = OP_SKIPZERO;
4084             goto END_REPEAT;
4085             }
4086           *previous++ = OP_BRAZERO + repeat_type;
4087           }
4088
4089         /* If the maximum is greater than 1 and limited, we have to replicate
4090         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4091         The first one has to be handled carefully because it's the original
4092         copy, which has to be moved up. The remainder can be handled by code
4093         that is common with the non-zero minimum case below. We have to
4094           adjust the value of repeat_max, since one less copy is required. Once
4095         again, we may have to adjust any OP_RECURSE calls inside the group. */
4096
4097         else
4098           {
4099           int offset;
4100           *code = OP_END;
4101           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4102           memmove(previous + 2 + LINK_SIZE, previous, len);
4103           code += 2 + LINK_SIZE;
4104           *previous++ = OP_BRAZERO + repeat_type;
4105           *previous++ = OP_BRA;
4106
4107           /* We chain together the bracket offset fields that have to be
4108           filled in later when the ends of the brackets are reached. */
4109
4110           offset = (bralink == NULL)? 0 : previous - bralink;
4111           bralink = previous;
4112           PUTINC(previous, 0, offset);
4113           }
4114
4115         repeat_max--;
4116         }
4117
4118       /* If the minimum is greater than zero, replicate the group as many
4119       times as necessary, and adjust the maximum to the number of subsequent
4120       copies that we need. If we set a first char from the group, and didn't
4121       set a required char, copy the latter from the former. If there are any
4122       forward reference subroutine calls in the group, there will be entries on
4123       the workspace list; replicate these with an appropriate increment. */
4124
4125       else
4126         {
4127         if (repeat_min > 1)
4128           {
4129           /* In the pre-compile phase, we don't actually do the replication. We
4130           just adjust the length as if we had. Do some paranoid checks for
4131           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4132           integer type when available, otherwise double. */
4133
4134           if (lengthptr != NULL)
4135             {
4136             int delta = (repeat_min - 1)*length_prevgroup;
4137             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4138                   (INT64_OR_DOUBLE)length_prevgroup >
4139                     (INT64_OR_DOUBLE)INT_MAX ||
4140                 OFLOW_MAX - *lengthptr < delta)
4141               {
4142               *errorcodeptr = ERR20;
4143               goto FAILED;
4144               }
4145             *lengthptr += delta;
4146             }
4147
4148           /* This is compiling for real */
4149
4150           else
4151             {
4152             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4153             for (i = 1; i < repeat_min; i++)
4154               {
4155               uschar *hc;
4156               uschar *this_hwm = cd->hwm;
4157               memcpy(code, previous, len);
4158               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4159                 {
4160                 PUT(cd->hwm, 0, GET(hc, 0) + len);
4161                 cd->hwm += LINK_SIZE;
4162                 }
4163               save_hwm = this_hwm;
4164               code += len;
4165               }
4166             }
4167           }
4168
4169         if (repeat_max > 0) repeat_max -= repeat_min;
4170         }
4171
4172       /* This code is common to both the zero and non-zero minimum cases. If
4173       the maximum is limited, it replicates the group in a nested fashion,
4174       remembering the bracket starts on a stack. In the case of a zero minimum,
4175       the first one was set up above. In all cases the repeat_max now specifies
4176       the number of additional copies needed. Again, we must remember to
4177       replicate entries on the forward reference list. */
4178
4179       if (repeat_max >= 0)
4180         {
4181         /* In the pre-compile phase, we don't actually do the replication. We
4182         just adjust the length as if we had. For each repetition we must add 1
4183         to the length for BRAZERO and for all but the last repetition we must
4184         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4185         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4186         a 64-bit integer type when available, otherwise double. */
4187
4188         if (lengthptr != NULL && repeat_max > 0)
4189           {
4190           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4191                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4192           if ((INT64_OR_DOUBLE)repeat_max *
4193                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4194                   > (INT64_OR_DOUBLE)INT_MAX ||
4195               OFLOW_MAX - *lengthptr < delta)
4196             {
4197             *errorcodeptr = ERR20;
4198             goto FAILED;
4199             }
4200           *lengthptr += delta;
4201           }
4202
4203         /* This is compiling for real */
4204
4205         else for (i = repeat_max - 1; i >= 0; i--)
4206           {
4207           uschar *hc;
4208           uschar *this_hwm = cd->hwm;
4209
4210           *code++ = OP_BRAZERO + repeat_type;
4211
4212           /* All but the final copy start a new nesting, maintaining the
4213           chain of brackets outstanding. */
4214
4215           if (i != 0)
4216             {
4217             int offset;
4218             *code++ = OP_BRA;
4219             offset = (bralink == NULL)? 0 : code - bralink;
4220             bralink = code;
4221             PUTINC(code, 0, offset);
4222             }
4223
4224           memcpy(code, previous, len);
4225           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4226             {
4227             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4228             cd->hwm += LINK_SIZE;
4229             }
4230           save_hwm = this_hwm;
4231           code += len;
4232           }
4233
4234         /* Now chain through the pending brackets, and fill in their length
4235         fields (which are holding the chain links pro tem). */
4236
4237         while (bralink != NULL)
4238           {
4239           int oldlinkoffset;
4240           int offset = code - bralink + 1;
4241           uschar *bra = code - offset;
4242           oldlinkoffset = GET(bra, 1);
4243           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4244           *code++ = OP_KET;
4245           PUTINC(code, 0, offset);
4246           PUT(bra, 1, offset);
4247           }
4248         }
4249
4250       /* If the maximum is unlimited, set a repeater in the final copy. We
4251       can't just offset backwards from the current code point, because we
4252       don't know if there's been an options resetting after the ket. The
4253       correct offset was computed above.
4254
4255       Then, when we are doing the actual compile phase, check to see whether
4256       this group is a non-atomic one that could match an empty string. If so,
4257       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4258       that runtime checking can be done. [This check is also applied to
4259       atomic groups at runtime, but in a different way.] */
4260
4261       else
4262         {
4263         uschar *ketcode = code - ketoffset;
4264         uschar *bracode = ketcode - GET(ketcode, 1);
4265         *ketcode = OP_KETRMAX + repeat_type;
4266         if (lengthptr == NULL && *bracode != OP_ONCE)
4267           {
4268           uschar *scode = bracode;
4269           do
4270             {
4271             if (could_be_empty_branch(scode, ketcode, utf8, cd))
4272               {
4273               *bracode += OP_SBRA - OP_BRA;
4274               break;
4275               }
4276             scode += GET(scode, 1);
4277             }
4278           while (*scode == OP_ALT);
4279           }
4280         }
4281       }
4282
4283     /* If previous is OP_FAIL, it was generated by an empty class [] in
4284     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4285     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4286     error above. We can just ignore the repeat in JS case. */
4287
4288     else if (*previous == OP_FAIL) goto END_REPEAT;
4289
4290     /* Else there's some kind of shambles */
4291
4292     else
4293       {
4294       *errorcodeptr = ERR11;
4295       goto FAILED;
4296       }
4297
4298     /* If the character following a repeat is '+', or if certain optimization
4299     tests above succeeded, possessive_quantifier is TRUE. For some of the
4300     simpler opcodes, there is a special alternative opcode for this. For
4301     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4302     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4303     but the special opcodes can optimize it a bit. The repeated item starts at
4304     tempcode, not at previous, which might be the first part of a string whose
4305     (former) last char we repeated.
4306
4307     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4308     an 'upto' may follow. We skip over an 'exact' item, and then test the
4309     length of what remains before proceeding. */
4310
4311     if (possessive_quantifier)
4312       {
4313       int len;
4314
4315       if (*tempcode == OP_TYPEEXACT)
4316         tempcode += _pcre_OP_lengths[*tempcode] +
4317           ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4318
4319       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4320         {
4321         tempcode += _pcre_OP_lengths[*tempcode];
4322 #ifdef SUPPORT_UTF8
4323         if (utf8 && tempcode[-1] >= 0xc0)
4324           tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4325 #endif
4326         }
4327
4328       len = code - tempcode;
4329       if (len > 0) switch (*tempcode)
4330         {
4331         case OP_STAR:  *tempcode = OP_POSSTAR; break;
4332         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4333         case OP_QUERY: *tempcode = OP_POSQUERY; break;
4334         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4335
4336         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4337         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4338         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4339         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4340
4341         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4342         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4343         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4344         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4345
4346         /* Because we are moving code along, we must ensure that any
4347         pending recursive references are updated. */
4348
4349         default:
4350         *code = OP_END;
4351         adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4352         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4353         code += 1 + LINK_SIZE;
4354         len += 1 + LINK_SIZE;
4355         tempcode[0] = OP_ONCE;
4356         *code++ = OP_KET;
4357         PUTINC(code, 0, len);
4358         PUT(tempcode, 1, len);
4359         break;
4360         }
4361       }
4362
4363     /* In all cases we no longer have a previous item. We also set the
4364     "follows varying string" flag for subsequently encountered reqbytes if
4365     it isn't already set and we have just passed a varying length item. */
4366
4367     END_REPEAT:
4368     previous = NULL;
4369     cd->req_varyopt |= reqvary;
4370     break;
4371
4372
4373     /* ===================================================================*/
4374     /* Start of nested parenthesized sub-expression, or comment or lookahead or
4375     lookbehind or option setting or condition or all the other extended
4376     parenthesis forms.  */
4377
4378     case CHAR_LEFT_PARENTHESIS:
4379     newoptions = options;
4380     skipbytes = 0;
4381     bravalue = OP_CBRA;
4382     save_hwm = cd->hwm;
4383     reset_bracount = FALSE;
4384
4385     /* First deal with various "verbs" that can be introduced by '*'. */
4386
4387     if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4388       {
4389       int i, namelen;
4390       const char *vn = verbnames;
4391       const uschar *name = ++ptr;
4392       previous = NULL;
4393       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4394       if (*ptr == CHAR_COLON)
4395         {
4396         *errorcodeptr = ERR59;   /* Not supported */
4397         goto FAILED;
4398         }
4399       if (*ptr != CHAR_RIGHT_PARENTHESIS)
4400         {
4401         *errorcodeptr = ERR60;
4402         goto FAILED;
4403         }
4404       namelen = ptr - name;
4405       for (i = 0; i < verbcount; i++)
4406         {
4407         if (namelen == verbs[i].len &&
4408             strncmp((char *)name, vn, namelen) == 0)
4409           {
4410           /* Check for open captures before ACCEPT */
4411
4412           if (verbs[i].op == OP_ACCEPT)
4413             {
4414             open_capitem *oc;
4415             cd->had_accept = TRUE;
4416             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4417               {
4418               *code++ = OP_CLOSE;
4419               PUT2INC(code, 0, oc->number);
4420               }
4421             }
4422           *code++ = verbs[i].op;
4423           break;
4424           }
4425         vn += verbs[i].len + 1;
4426         }
4427       if (i < verbcount) continue;
4428       *errorcodeptr = ERR60;
4429       goto FAILED;
4430       }
4431
4432     /* Deal with the extended parentheses; all are introduced by '?', and the
4433     appearance of any of them means that this is not a capturing group. */
4434
4435     else if (*ptr == CHAR_QUESTION_MARK)
4436       {
4437       int i, set, unset, namelen;
4438       int *optset;
4439       const uschar *name;
4440       uschar *slot;
4441
4442       switch (*(++ptr))
4443         {
4444         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
4445         ptr++;
4446         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4447         if (*ptr == 0)
4448           {
4449           *errorcodeptr = ERR18;
4450           goto FAILED;
4451           }
4452         continue;
4453
4454
4455         /* ------------------------------------------------------------ */
4456         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
4457         reset_bracount = TRUE;
4458         /* Fall through */
4459
4460         /* ------------------------------------------------------------ */
4461         case CHAR_COLON:          /* Non-capturing bracket */
4462         bravalue = OP_BRA;
4463         ptr++;
4464         break;
4465
4466
4467         /* ------------------------------------------------------------ */
4468         case CHAR_LEFT_PARENTHESIS:
4469         bravalue = OP_COND;       /* Conditional group */
4470
4471         /* A condition can be an assertion, a number (referring to a numbered
4472         group), a name (referring to a named group), or 'R', referring to
4473         recursion. R<digits> and R&name are also permitted for recursion tests.
4474
4475         There are several syntaxes for testing a named group: (?(name)) is used
4476         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4477
4478         There are two unfortunate ambiguities, caused by history. (a) 'R' can
4479         be the recursive thing or the name 'R' (and similarly for 'R' followed
4480         by digits), and (b) a number could be a name that consists of digits.
4481         In both cases, we look for a name first; if not found, we try the other
4482         cases. */
4483
4484         /* For conditions that are assertions, check the syntax, and then exit
4485         the switch. This will take control down to where bracketed groups,
4486         including assertions, are processed. */
4487
4488         if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4489             ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4490           break;
4491
4492         /* Most other conditions use OP_CREF (a couple change to OP_RREF
4493         below), and all need to skip 3 bytes at the start of the group. */
4494
4495         code[1+LINK_SIZE] = OP_CREF;
4496         skipbytes = 3;
4497         refsign = -1;
4498
4499         /* Check for a test for recursion in a named group. */
4500
4501         if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4502           {
4503           terminator = -1;
4504           ptr += 2;
4505           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4506           }
4507
4508         /* Check for a test for a named group's having been set, using the Perl
4509         syntax (?(<name>) or (?('name') */
4510
4511         else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4512           {
4513           terminator = CHAR_GREATER_THAN_SIGN;
4514           ptr++;
4515           }
4516         else if (ptr[1] == CHAR_APOSTROPHE)
4517           {
4518           terminator = CHAR_APOSTROPHE;
4519           ptr++;
4520           }
4521         else
4522           {
4523           terminator = 0;
4524           if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4525           }
4526
4527         /* We now expect to read a name; anything else is an error */
4528
4529         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4530           {
4531           ptr += 1;  /* To get the right offset */
4532           *errorcodeptr = ERR28;
4533           goto FAILED;
4534           }
4535
4536         /* Read the name, but also get it as a number if it's all digits */
4537
4538         recno = 0;
4539         name = ++ptr;
4540         while ((cd->ctypes[*ptr] & ctype_word) != 0)
4541           {
4542           if (recno >= 0)
4543             recno = (g_ascii_isdigit(*ptr) != 0)?
4544               recno * 10 + *ptr - CHAR_0 : -1;
4545           ptr++;
4546           }
4547         namelen = ptr - name;
4548
4549         if ((terminator > 0 && *ptr++ != terminator) ||
4550             *ptr++ != CHAR_RIGHT_PARENTHESIS)
4551           {
4552           ptr--;      /* Error offset */
4553           *errorcodeptr = ERR26;
4554           goto FAILED;
4555           }
4556
4557         /* Do no further checking in the pre-compile phase. */
4558
4559         if (lengthptr != NULL) break;
4560
4561         /* In the real compile we do the work of looking for the actual
4562         reference. If the string started with "+" or "-" we require the rest to
4563         be digits, in which case recno will be set. */
4564
4565         if (refsign > 0)
4566           {
4567           if (recno <= 0)
4568             {
4569             *errorcodeptr = ERR58;
4570             goto FAILED;
4571             }
4572           recno = (refsign == CHAR_MINUS)?
4573             cd->bracount - recno + 1 : recno +cd->bracount;
4574           if (recno <= 0 || recno > cd->final_bracount)
4575             {
4576             *errorcodeptr = ERR15;
4577             goto FAILED;
4578             }
4579           PUT2(code, 2+LINK_SIZE, recno);
4580           break;
4581           }
4582
4583         /* Otherwise (did not start with "+" or "-"), start by looking for the
4584         name. If we find a name, add one to the opcode to change OP_CREF or
4585         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4586         except they record that the reference was originally to a name. The
4587         information is used to check duplicate names. */
4588
4589         slot = cd->name_table;
4590         for (i = 0; i < cd->names_found; i++)
4591           {
4592           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4593           slot += cd->name_entry_size;
4594           }
4595
4596         /* Found a previous named subpattern */
4597
4598         if (i < cd->names_found)
4599           {
4600           recno = GET2(slot, 0);
4601           PUT2(code, 2+LINK_SIZE, recno);
4602           code[1+LINK_SIZE]++;
4603           }
4604
4605         /* Search the pattern for a forward reference */
4606
4607         else if ((i = find_parens(cd, name, namelen,
4608                         (options & PCRE_EXTENDED) != 0)) > 0)
4609           {
4610           PUT2(code, 2+LINK_SIZE, i);
4611           code[1+LINK_SIZE]++;
4612           }
4613
4614         /* If terminator == 0 it means that the name followed directly after
4615         the opening parenthesis [e.g. (?(abc)...] and in this case there are
4616         some further alternatives to try. For the cases where terminator != 0
4617         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4618         now checked all the possibilities, so give an error. */
4619
4620         else if (terminator != 0)
4621           {
4622           *errorcodeptr = ERR15;
4623           goto FAILED;
4624           }
4625
4626         /* Check for (?(R) for recursion. Allow digits after R to specify a
4627         specific group number. */
4628
4629         else if (*name == CHAR_R)
4630           {
4631           recno = 0;
4632           for (i = 1; i < namelen; i++)
4633             {
4634             if (g_ascii_isdigit(name[i]) == 0)
4635               {
4636               *errorcodeptr = ERR15;
4637               goto FAILED;
4638               }
4639             recno = recno * 10 + name[i] - CHAR_0;
4640             }
4641           if (recno == 0) recno = RREF_ANY;
4642           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4643           PUT2(code, 2+LINK_SIZE, recno);
4644           }
4645
4646         /* Similarly, check for the (?(DEFINE) "condition", which is always
4647         false. */
4648
4649         else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4650           {
4651           code[1+LINK_SIZE] = OP_DEF;
4652           skipbytes = 1;
4653           }
4654
4655         /* Check for the "name" actually being a subpattern number. We are
4656         in the second pass here, so final_bracount is set. */
4657
4658         else if (recno > 0 && recno <= cd->final_bracount)
4659           {
4660           PUT2(code, 2+LINK_SIZE, recno);
4661           }
4662
4663         /* Either an unidentified subpattern, or a reference to (?(0) */
4664
4665         else
4666           {
4667           *errorcodeptr = (recno == 0)? ERR35: ERR15;
4668           goto FAILED;
4669           }
4670         break;
4671
4672
4673         /* ------------------------------------------------------------ */
4674         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
4675         bravalue = OP_ASSERT;
4676         ptr++;
4677         break;
4678
4679
4680         /* ------------------------------------------------------------ */
4681         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
4682         ptr++;
4683         if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
4684           {
4685           *code++ = OP_FAIL;
4686           previous = NULL;
4687           continue;
4688           }
4689         bravalue = OP_ASSERT_NOT;
4690         break;
4691
4692
4693         /* ------------------------------------------------------------ */
4694         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
4695         switch (ptr[1])
4696           {
4697           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
4698           bravalue = OP_ASSERTBACK;
4699           ptr += 2;
4700           break;
4701
4702           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
4703           bravalue = OP_ASSERTBACK_NOT;
4704           ptr += 2;
4705           break;
4706
4707           default:                /* Could be name define, else bad */
4708           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4709           ptr++;                  /* Correct offset for error */
4710           *errorcodeptr = ERR24;
4711           goto FAILED;
4712           }
4713         break;
4714
4715
4716         /* ------------------------------------------------------------ */
4717         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
4718         bravalue = OP_ONCE;
4719         ptr++;
4720         break;
4721
4722
4723         /* ------------------------------------------------------------ */
4724         case CHAR_C:                 /* Callout - may be followed by digits; */
4725         previous_callout = code;  /* Save for later completion */
4726         after_manual_callout = 1; /* Skip one item before completing */
4727         *code++ = OP_CALLOUT;
4728           {
4729           int n = 0;
4730           while (g_ascii_isdigit(*(++ptr)) != 0)
4731             n = n * 10 + *ptr - CHAR_0;
4732           if (*ptr != CHAR_RIGHT_PARENTHESIS)
4733             {
4734             *errorcodeptr = ERR39;
4735             goto FAILED;
4736             }
4737           if (n > 255)
4738             {
4739             *errorcodeptr = ERR38;
4740             goto FAILED;
4741             }
4742           *code++ = n;
4743           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4744           PUT(code, LINK_SIZE, 0);                    /* Default length */
4745           code += 2 * LINK_SIZE;
4746           }
4747         previous = NULL;
4748         continue;
4749
4750
4751         /* ------------------------------------------------------------ */
4752         case CHAR_P:              /* Python-style named subpattern handling */
4753         if (*(++ptr) == CHAR_EQUALS_SIGN ||
4754             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
4755           {
4756           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4757           terminator = CHAR_RIGHT_PARENTHESIS;
4758           goto NAMED_REF_OR_RECURSE;
4759           }
4760         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
4761           {
4762           *errorcodeptr = ERR41;
4763           goto FAILED;
4764           }
4765         /* Fall through to handle (?P< as (?< is handled */
4766
4767
4768         /* ------------------------------------------------------------ */
4769         DEFINE_NAME:    /* Come here from (?< handling */
4770         case CHAR_APOSTROPHE:
4771           {
4772           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4773             CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4774           name = ++ptr;
4775
4776           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4777           namelen = ptr - name;
4778
4779           /* In the pre-compile phase, just do a syntax check. */
4780
4781           if (lengthptr != NULL)
4782             {
4783             if (*ptr != terminator)
4784               {
4785               *errorcodeptr = ERR42;
4786               goto FAILED;
4787               }
4788             if (cd->names_found >= MAX_NAME_COUNT)
4789               {
4790               *errorcodeptr = ERR49;
4791               goto FAILED;
4792               }
4793             if (namelen + 3 > cd->name_entry_size)
4794               {
4795               cd->name_entry_size = namelen + 3;
4796               if (namelen > MAX_NAME_SIZE)
4797                 {
4798                 *errorcodeptr = ERR48;
4799                 goto FAILED;
4800                 }
4801               }
4802             }
4803
4804           /* In the real compile, create the entry in the table, maintaining
4805           alphabetical order. Duplicate names for different numbers are
4806           permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4807           number are always OK. (An existing number can be re-used if (?|
4808           appears in the pattern.) In either event, a duplicate name results in
4809           a duplicate entry in the table, even if the number is the same. This
4810           is because the number of names, and hence the table size, is computed
4811           in the pre-compile, and it affects various numbers and pointers which
4812           would all have to be modified, and the compiled code moved down, if
4813           duplicates with the same number were omitted from the table. This
4814           doesn't seem worth the hassle. However, *different* names for the
4815           same number are not permitted. */
4816
4817           else
4818             {
4819             BOOL dupname = FALSE;
4820             slot = cd->name_table;
4821
4822             for (i = 0; i < cd->names_found; i++)
4823               {
4824               int crc = memcmp(name, slot+2, namelen);
4825               if (crc == 0)
4826                 {
4827                 if (slot[2+namelen] == 0)
4828                   {
4829                   if (GET2(slot, 0) != cd->bracount + 1 &&
4830                       (options & PCRE_DUPNAMES) == 0)
4831                     {
4832                     *errorcodeptr = ERR43;
4833                     goto FAILED;
4834                     }
4835                   else dupname = TRUE;
4836                   }
4837                 else crc = -1;      /* Current name is a substring */
4838                 }
4839
4840               /* Make space in the table and break the loop for an earlier
4841               name. For a duplicate or later name, carry on. We do this for
4842               duplicates so that in the simple case (when (?| is not used) they
4843               are in order of their numbers. */
4844
4845               if (crc < 0)
4846                 {
4847                 memmove(slot + cd->name_entry_size, slot,
4848                   (cd->names_found - i) * cd->name_entry_size);
4849                 break;
4850                 }
4851
4852               /* Continue the loop for a later or duplicate name */
4853
4854               slot += cd->name_entry_size;
4855               }
4856
4857             /* For non-duplicate names, check for a duplicate number before
4858             adding the new name. */
4859
4860             if (!dupname)
4861               {
4862               uschar *cslot = cd->name_table;
4863               for (i = 0; i < cd->names_found; i++)
4864                 {
4865                 if (cslot != slot)
4866                   {
4867                   if (GET2(cslot, 0) == cd->bracount + 1)
4868                     {
4869                     *errorcodeptr = ERR65;
4870                     goto FAILED;
4871                     }
4872                   }
4873                 else i--;
4874                 cslot += cd->name_entry_size;
4875                 }
4876               }
4877
4878             PUT2(slot, 0, cd->bracount + 1);
4879             memcpy(slot + 2, name, namelen);
4880             slot[2+namelen] = 0;
4881             }
4882           }
4883
4884         /* In both pre-compile and compile, count the number of names we've
4885         encountered. */
4886
4887         cd->names_found++;
4888         ptr++;                    /* Move past > or ' */
4889         goto NUMBERED_GROUP;
4890
4891
4892         /* ------------------------------------------------------------ */
4893         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
4894         terminator = CHAR_RIGHT_PARENTHESIS;
4895         is_recurse = TRUE;
4896         /* Fall through */
4897
4898         /* We come here from the Python syntax above that handles both
4899         references (?P=name) and recursion (?P>name), as well as falling
4900         through from the Perl recursion syntax (?&name). We also come here from
4901         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4902         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4903
4904         NAMED_REF_OR_RECURSE:
4905         name = ++ptr;
4906         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4907         namelen = ptr - name;
4908
4909         /* In the pre-compile phase, do a syntax check and set a dummy
4910         reference number. */
4911
4912         if (lengthptr != NULL)
4913           {
4914           if (namelen == 0)
4915             {
4916             *errorcodeptr = ERR62;
4917             goto FAILED;
4918             }
4919           if (*ptr != terminator)
4920             {
4921             *errorcodeptr = ERR42;
4922             goto FAILED;
4923             }
4924           if (namelen > MAX_NAME_SIZE)
4925             {
4926             *errorcodeptr = ERR48;
4927             goto FAILED;
4928             }
4929           recno = 0;
4930           }
4931
4932         /* In the real compile, seek the name in the table. We check the name
4933         first, and then check that we have reached the end of the name in the
4934         table. That way, if the name is longer than any in the table,
4935         the comparison will fail without reading beyond the table entry. */
4936
4937         else
4938           {
4939           slot = cd->name_table;
4940           for (i = 0; i < cd->names_found; i++)
4941             {
4942             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4943                 slot[2+namelen] == 0)
4944               break;
4945             slot += cd->name_entry_size;
4946             }
4947
4948           if (i < cd->names_found)         /* Back reference */
4949             {
4950             recno = GET2(slot, 0);
4951             }
4952           else if ((recno =                /* Forward back reference */
4953                     find_parens(cd, name, namelen,
4954                       (options & PCRE_EXTENDED) != 0)) <= 0)
4955             {
4956             *errorcodeptr = ERR15;
4957             goto FAILED;
4958             }
4959           }
4960
4961         /* In both phases, we can now go to the code that handles numerical
4962         recursion or backreferences. */
4963
4964         if (is_recurse) goto HANDLE_RECURSION;
4965           else goto HANDLE_REFERENCE;
4966
4967
4968         /* ------------------------------------------------------------ */
4969         case CHAR_R:              /* Recursion */
4970         ptr++;                    /* Same as (?0)      */
4971         /* Fall through */
4972
4973
4974         /* ------------------------------------------------------------ */
4975         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
4976         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4977         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4978           {
4979           const uschar *called;
4980           terminator = CHAR_RIGHT_PARENTHESIS;
4981
4982           /* Come here from the \g<...> and \g'...' code (Oniguruma
4983           compatibility). However, the syntax has been checked to ensure that
4984           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4985           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4986           ever be taken. */
4987
4988           HANDLE_NUMERICAL_RECURSION:
4989
4990           if ((refsign = *ptr) == CHAR_PLUS)
4991             {
4992             ptr++;
4993             if (g_ascii_isdigit(*ptr) == 0)
4994               {
4995               *errorcodeptr = ERR63;
4996               goto FAILED;
4997               }
4998             }
4999           else if (refsign == CHAR_MINUS)
5000             {
5001             if (g_ascii_isdigit(ptr[1]) == 0)
5002               goto OTHER_CHAR_AFTER_QUERY;
5003             ptr++;
5004             }
5005
5006           recno = 0;
5007           while(g_ascii_isdigit(*ptr) != 0)
5008             recno = recno * 10 + *ptr++ - CHAR_0;
5009
5010           if (*ptr != terminator)
5011             {
5012             *errorcodeptr = ERR29;
5013             goto FAILED;
5014             }
5015
5016           if (refsign == CHAR_MINUS)
5017             {
5018             if (recno == 0)
5019               {
5020               *errorcodeptr = ERR58;
5021               goto FAILED;
5022               }
5023             recno = cd->bracount - recno + 1;
5024             if (recno <= 0)
5025               {
5026               *errorcodeptr = ERR15;
5027               goto FAILED;
5028               }
5029             }
5030           else if (refsign == CHAR_PLUS)
5031             {
5032             if (recno == 0)
5033               {
5034               *errorcodeptr = ERR58;
5035               goto FAILED;
5036               }
5037             recno += cd->bracount;
5038             }
5039
5040           /* Come here from code above that handles a named recursion */
5041
5042           HANDLE_RECURSION:
5043
5044           previous = code;
5045           called = cd->start_code;
5046
5047           /* When we are actually compiling, find the bracket that is being
5048           referenced. Temporarily end the regex in case it doesn't exist before
5049           this point. If we end up with a forward reference, first check that
5050           the bracket does occur later so we can give the error (and position)
5051           now. Then remember this forward reference in the workspace so it can
5052           be filled in at the end. */
5053
5054           if (lengthptr == NULL)
5055             {
5056             *code = OP_END;
5057             if (recno != 0)
5058               called = _pcre_find_bracket(cd->start_code, utf8, recno);
5059
5060             /* Forward reference */
5061
5062             if (called == NULL)
5063               {
5064               if (find_parens(cd, NULL, recno,
5065                     (options & PCRE_EXTENDED) != 0) < 0)
5066                 {
5067                 *errorcodeptr = ERR15;
5068                 goto FAILED;
5069                 }
5070
5071               /* Fudge the value of "called" so that when it is inserted as an
5072               offset below, what is actually inserted is the reference number
5073               of the group. */
5074
5075               called = cd->start_code + recno;
5076               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5077               }
5078
5079             /* If not a forward reference, and the subpattern is still open,
5080             this is a recursive call. We check to see if this is a left
5081             recursion that could loop for ever, and diagnose that case. */
5082
5083             else if (GET(called, 1) == 0 &&
5084                      could_be_empty(called, code, bcptr, utf8, cd))
5085               {
5086               *errorcodeptr = ERR40;
5087               goto FAILED;
5088               }
5089             }
5090
5091           /* Insert the recursion/subroutine item, automatically wrapped inside
5092           "once" brackets. Set up a "previous group" length so that a
5093           subsequent quantifier will work. */
5094
5095           *code = OP_ONCE;
5096           PUT(code, 1, 2 + 2*LINK_SIZE);
5097           code += 1 + LINK_SIZE;
5098
5099           *code = OP_RECURSE;
5100           PUT(code, 1, called - cd->start_code);
5101           code += 1 + LINK_SIZE;
5102
5103           *code = OP_KET;
5104           PUT(code, 1, 2 + 2*LINK_SIZE);
5105           code += 1 + LINK_SIZE;
5106
5107           length_prevgroup = 3 + 3*LINK_SIZE;
5108           }
5109
5110         /* Can't determine a first byte now */
5111
5112         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5113         continue;
5114
5115
5116         /* ------------------------------------------------------------ */
5117         default:              /* Other characters: check option setting */
5118         OTHER_CHAR_AFTER_QUERY:
5119         set = unset = 0;
5120         optset = &set;
5121
5122         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5123           {
5124           switch (*ptr++)
5125             {
5126             case CHAR_MINUS: optset = &unset; break;
5127
5128             case CHAR_J:    /* Record that it changed in the external options */
5129             *optset |= PCRE_DUPNAMES;
5130             cd->external_flags |= PCRE_JCHANGED;
5131             break;
5132
5133             case CHAR_i: *optset |= PCRE_CASELESS; break;
5134             case CHAR_m: *optset |= PCRE_MULTILINE; break;
5135             case CHAR_s: *optset |= PCRE_DOTALL; break;
5136             case CHAR_x: *optset |= PCRE_EXTENDED; break;
5137             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5138             case CHAR_X: *optset |= PCRE_EXTRA; break;
5139
5140             default:  *errorcodeptr = ERR12;
5141                       ptr--;    /* Correct the offset */
5142                       goto FAILED;
5143             }
5144           }
5145
5146         /* Set up the changed option bits, but don't change anything yet. */
5147
5148         newoptions = (options | set) & (~unset);
5149
5150         /* If the options ended with ')' this is not the start of a nested
5151         group with option changes, so the options change at this level. If this
5152         item is right at the start of the pattern, the options can be
5153         abstracted and made external in the pre-compile phase, and ignored in
5154         the compile phase. This can be helpful when matching -- for instance in
5155         caseless checking of required bytes.
5156
5157         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5158         definitely *not* at the start of the pattern because something has been
5159         compiled. In the pre-compile phase, however, the code pointer can have
5160         that value after the start, because it gets reset as code is discarded
5161         during the pre-compile. However, this can happen only at top level - if
5162         we are within parentheses, the starting BRA will still be present. At
5163         any parenthesis level, the length value can be used to test if anything
5164         has been compiled at that level. Thus, a test for both these conditions
5165         is necessary to ensure we correctly detect the start of the pattern in
5166         both phases.
5167
5168         If we are not at the pattern start, compile code to change the ims
5169         options if this setting actually changes any of them, and reset the
5170         greedy defaults and the case value for firstbyte and reqbyte. */
5171
5172         if (*ptr == CHAR_RIGHT_PARENTHESIS)
5173           {
5174           if (code == cd->start_code + 1 + LINK_SIZE &&
5175                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5176             {
5177             cd->external_options = newoptions;
5178             }
5179           else
5180             {
5181             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5182               {
5183               *code++ = OP_OPT;
5184               *code++ = newoptions & PCRE_IMS;
5185               }
5186             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5187             greedy_non_default = greedy_default ^ 1;
5188             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5189             }
5190
5191           /* Change options at this level, and pass them back for use
5192           in subsequent branches. When not at the start of the pattern, this
5193           information is also necessary so that a resetting item can be
5194           compiled at the end of a group (if we are in a group). */
5195
5196           *optionsptr = options = newoptions;
5197           previous = NULL;       /* This item can't be repeated */
5198           continue;              /* It is complete */
5199           }
5200
5201         /* If the options ended with ':' we are heading into a nested group
5202         with possible change of options. Such groups are non-capturing and are
5203         not assertions of any kind. All we need to do is skip over the ':';
5204         the newoptions value is handled below. */
5205
5206         bravalue = OP_BRA;
5207         ptr++;
5208         }     /* End of switch for character following (? */
5209       }       /* End of (? handling */
5210
5211     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5212     all unadorned brackets become non-capturing and behave like (?:...)
5213     brackets. */
5214
5215     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5216       {
5217       bravalue = OP_BRA;
5218       }
5219
5220     /* Else we have a capturing group. */
5221
5222     else
5223       {
5224       NUMBERED_GROUP:
5225       cd->bracount += 1;
5226       PUT2(code, 1+LINK_SIZE, cd->bracount);
5227       skipbytes = 2;
5228       }
5229
5230     /* Process nested bracketed regex. Assertions may not be repeated, but
5231     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5232     non-register variable in order to be able to pass its address because some
5233     compilers complain otherwise. Pass in a new setting for the ims options if
5234     they have changed. */
5235
5236     previous = (bravalue >= OP_ONCE)? code : NULL;
5237     *code = bravalue;
5238     tempcode = code;
5239     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
5240     length_prevgroup = 0;              /* Initialize for pre-compile phase */
5241
5242     if (!compile_regex(
5243          newoptions,                   /* The complete new option state */
5244          options & PCRE_IMS,           /* The previous ims option state */
5245          &tempcode,                    /* Where to put code (updated) */
5246          &ptr,                         /* Input pointer (updated) */
5247          errorcodeptr,                 /* Where to put an error message */
5248          (bravalue == OP_ASSERTBACK ||
5249           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5250          reset_bracount,               /* True if (?| group */
5251          skipbytes,                    /* Skip over bracket number */
5252          &subfirstbyte,                /* For possible first char */
5253          &subreqbyte,                  /* For possible last char */
5254          bcptr,                        /* Current branch chain */
5255          cd,                           /* Tables block */
5256          (lengthptr == NULL)? NULL :   /* Actual compile phase */
5257            &length_prevgroup           /* Pre-compile phase */
5258          ))
5259       goto FAILED;
5260
5261     /* At the end of compiling, code is still pointing to the start of the
5262     group, while tempcode has been updated to point past the end of the group
5263     and any option resetting that may follow it. The pattern pointer (ptr)
5264     is on the bracket. */
5265
5266     /* If this is a conditional bracket, check that there are no more than
5267     two branches in the group, or just one if it's a DEFINE group. We do this
5268     in the real compile phase, not in the pre-pass, where the whole group may
5269     not be available. */
5270
5271     if (bravalue == OP_COND && lengthptr == NULL)
5272       {
5273       uschar *tc = code;
5274       int condcount = 0;
5275
5276       do {
5277          condcount++;
5278          tc += GET(tc,1);
5279          }
5280       while (*tc != OP_KET);
5281
5282       /* A DEFINE group is never obeyed inline (the "condition" is always
5283       false). It must have only one branch. */
5284
5285       if (code[LINK_SIZE+1] == OP_DEF)
5286         {
5287         if (condcount > 1)
5288           {
5289           *errorcodeptr = ERR54;
5290           goto FAILED;
5291           }
5292         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
5293         }
5294
5295       /* A "normal" conditional group. If there is just one branch, we must not
5296       make use of its firstbyte or reqbyte, because this is equivalent to an
5297       empty second branch. */
5298
5299       else
5300         {
5301         if (condcount > 2)
5302           {
5303           *errorcodeptr = ERR27;
5304           goto FAILED;
5305           }
5306         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5307         }
5308       }
5309
5310     /* Error if hit end of pattern */
5311
5312     if (*ptr != CHAR_RIGHT_PARENTHESIS)
5313       {
5314       *errorcodeptr = ERR14;
5315       goto FAILED;
5316       }
5317
5318     /* In the pre-compile phase, update the length by the length of the group,
5319     less the brackets at either end. Then reduce the compiled code to just a
5320     set of non-capturing brackets so that it doesn't use much memory if it is
5321     duplicated by a quantifier.*/
5322
5323     if (lengthptr != NULL)
5324       {
5325       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5326         {
5327         *errorcodeptr = ERR20;
5328         goto FAILED;
5329         }
5330       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5331       *code++ = OP_BRA;
5332       PUTINC(code, 0, 1 + LINK_SIZE);
5333       *code++ = OP_KET;
5334       PUTINC(code, 0, 1 + LINK_SIZE);
5335       break;    /* No need to waste time with special character handling */
5336       }
5337
5338     /* Otherwise update the main code pointer to the end of the group. */
5339
5340     code = tempcode;
5341
5342     /* For a DEFINE group, required and first character settings are not
5343     relevant. */
5344
5345     if (bravalue == OP_DEF) break;
5346
5347     /* Handle updating of the required and first characters for other types of
5348     group. Update for normal brackets of all kinds, and conditions with two
5349     branches (see code above). If the bracket is followed by a quantifier with
5350     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5351     zerofirstbyte outside the main loop so that they can be accessed for the
5352     back off. */
5353
5354     zeroreqbyte = reqbyte;
5355     zerofirstbyte = firstbyte;
5356     groupsetfirstbyte = FALSE;
5357
5358     if (bravalue >= OP_ONCE)
5359       {
5360       /* If we have not yet set a firstbyte in this branch, take it from the
5361       subpattern, remembering that it was set here so that a repeat of more
5362       than one can replicate it as reqbyte if necessary. If the subpattern has
5363       no firstbyte, set "none" for the whole branch. In both cases, a zero
5364       repeat forces firstbyte to "none". */
5365
5366       if (firstbyte == REQ_UNSET)
5367         {
5368         if (subfirstbyte >= 0)
5369           {
5370           firstbyte = subfirstbyte;
5371           groupsetfirstbyte = TRUE;
5372           }
5373         else firstbyte = REQ_NONE;
5374         zerofirstbyte = REQ_NONE;
5375         }
5376
5377       /* If firstbyte was previously set, convert the subpattern's firstbyte
5378       into reqbyte if there wasn't one, using the vary flag that was in
5379       existence beforehand. */
5380
5381       else if (subfirstbyte >= 0 && subreqbyte < 0)
5382         subreqbyte = subfirstbyte | tempreqvary;
5383
5384       /* If the subpattern set a required byte (or set a first byte that isn't
5385       really the first byte - see above), set it. */
5386
5387       if (subreqbyte >= 0) reqbyte = subreqbyte;
5388       }
5389
5390     /* For a forward assertion, we take the reqbyte, if set. This can be
5391     helpful if the pattern that follows the assertion doesn't set a different
5392     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5393     for an assertion, however because it leads to incorrect effect for patterns
5394     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5395     of a firstbyte. This is overcome by a scan at the end if there's no
5396     firstbyte, looking for an asserted first char. */
5397
5398     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5399     break;     /* End of processing '(' */
5400
5401
5402     /* ===================================================================*/
5403     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5404     are arranged to be the negation of the corresponding OP_values. For the
5405     back references, the values are ESC_REF plus the reference number. Only
5406     back references and those types that consume a character may be repeated.
5407     We can test for values between ESC_b and ESC_Z for the latter; this may
5408     have to change if any new ones are ever created. */
5409
5410     case CHAR_BACKSLASH:
5411     tempptr = ptr;
5412     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5413     if (*errorcodeptr != 0) goto FAILED;
5414
5415     if (c < 0)
5416       {
5417       if (-c == ESC_Q)            /* Handle start of quoted string */
5418         {
5419         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5420           ptr += 2;               /* avoid empty string */
5421             else inescq = TRUE;
5422         continue;
5423         }
5424
5425       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5426
5427       /* For metasequences that actually match a character, we disable the
5428       setting of a first character if it hasn't already been set. */
5429
5430       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5431         firstbyte = REQ_NONE;
5432
5433       /* Set values to reset to if this is followed by a zero repeat. */
5434
5435       zerofirstbyte = firstbyte;
5436       zeroreqbyte = reqbyte;
5437
5438       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5439       is a subroutine call by number (Oniguruma syntax). In fact, the value
5440       -ESC_g is returned only for these cases. So we don't need to check for <
5441       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5442       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5443       that is a synonym for a named back reference). */
5444
5445       if (-c == ESC_g)
5446         {
5447         const uschar *p;
5448         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5449         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5450           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5451
5452         /* These two statements stop the compiler from warning about possibly
5453         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5454         fact, because we actually check for a number below, the paths that
5455         would actually be in error are never taken. */
5456
5457         skipbytes = 0;
5458         reset_bracount = FALSE;
5459
5460         /* Test for a name */
5461
5462         if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5463           {
5464           BOOL isnumber = TRUE;
5465           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5466             {
5467             if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5468             if ((cd->ctypes[*p] & ctype_word) == 0) break;
5469             }
5470           if (*p != terminator)
5471             {
5472             *errorcodeptr = ERR57;
5473             break;
5474             }
5475           if (isnumber)
5476             {
5477             ptr++;
5478             goto HANDLE_NUMERICAL_RECURSION;
5479             }
5480           is_recurse = TRUE;
5481           goto NAMED_REF_OR_RECURSE;
5482           }
5483
5484         /* Test a signed number in angle brackets or quotes. */
5485
5486         p = ptr + 2;
5487         while (g_ascii_isdigit(*p) != 0) p++;
5488         if (*p != terminator)
5489           {
5490           *errorcodeptr = ERR57;
5491           break;
5492           }
5493         ptr++;
5494         goto HANDLE_NUMERICAL_RECURSION;
5495         }
5496
5497       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5498       We also support \k{name} (.NET syntax) */
5499
5500       if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5501           ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5502         {
5503         is_recurse = FALSE;
5504         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5505           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5506           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5507         goto NAMED_REF_OR_RECURSE;
5508         }
5509
5510       /* Back references are handled specially; must disable firstbyte if
5511       not set to cope with cases like (?=(\w+))\1: which would otherwise set
5512       ':' later. */
5513
5514       if (-c >= ESC_REF)
5515         {
5516         open_capitem *oc;
5517         recno = -c - ESC_REF;
5518
5519         HANDLE_REFERENCE:    /* Come here from named backref handling */
5520         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5521         previous = code;
5522         *code++ = OP_REF;
5523         PUT2INC(code, 0, recno);
5524         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5525         if (recno > cd->top_backref) cd->top_backref = recno;
5526
5527         /* Check to see if this back reference is recursive, that is, it
5528         is inside the group that it references. A flag is set so that the
5529         group can be made atomic. */
5530
5531         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5532           {
5533           if (oc->number == recno)
5534             {
5535             oc->flag = TRUE;
5536             break;
5537             }
5538           }
5539         }
5540
5541       /* So are Unicode property matches, if supported. */
5542
5543 #ifdef SUPPORT_UCP
5544       else if (-c == ESC_P || -c == ESC_p)
5545         {
5546         BOOL negated;
5547         int pdata;
5548         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5549         if (ptype < 0) goto FAILED;
5550         previous = code;
5551         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5552         *code++ = ptype;
5553         *code++ = pdata;
5554         }
5555 #else
5556
5557       /* If Unicode properties are not supported, \X, \P, and \p are not
5558       allowed. */
5559
5560       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5561         {
5562         *errorcodeptr = ERR45;
5563         goto FAILED;
5564         }
5565 #endif
5566
5567       /* For the rest (including \X when Unicode properties are supported), we
5568       can obtain the OP value by negating the escape value. */
5569
5570       else
5571         {
5572         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5573         *code++ = -c;
5574         }
5575       continue;
5576       }
5577
5578     /* We have a data character whose value is in c. In UTF-8 mode it may have
5579     a value > 127. We set its representation in the length/buffer, and then
5580     handle it as a data character. */
5581
5582 #ifdef SUPPORT_UTF8
5583     if (utf8 && c > 127)
5584       mclength = _pcre_ord2utf8(c, mcbuffer);
5585     else
5586 #endif
5587
5588      {
5589      mcbuffer[0] = c;
5590      mclength = 1;
5591      }
5592     goto ONE_CHAR;
5593
5594
5595     /* ===================================================================*/
5596     /* Handle a literal character. It is guaranteed not to be whitespace or #
5597     when the extended flag is set. If we are in UTF-8 mode, it may be a
5598     multi-byte literal character. */
5599
5600     default:
5601     NORMAL_CHAR:
5602     mclength = 1;
5603     mcbuffer[0] = c;
5604
5605 #ifdef SUPPORT_UTF8
5606     if (utf8 && c >= 0xc0)
5607       {
5608       while ((ptr[1] & 0xc0) == 0x80)
5609         mcbuffer[mclength++] = *(++ptr);
5610       }
5611 #endif
5612
5613     /* At this point we have the character's bytes in mcbuffer, and the length
5614     in mclength. When not in UTF-8 mode, the length is always 1. */
5615
5616     ONE_CHAR:
5617     previous = code;
5618     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5619     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5620
5621     /* Remember if \r or \n were seen */
5622
5623     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5624       cd->external_flags |= PCRE_HASCRORLF;
5625
5626     /* Set the first and required bytes appropriately. If no previous first
5627     byte, set it from this character, but revert to none on a zero repeat.
5628     Otherwise, leave the firstbyte value alone, and don't change it on a zero
5629     repeat. */
5630
5631     if (firstbyte == REQ_UNSET)
5632       {
5633       zerofirstbyte = REQ_NONE;
5634       zeroreqbyte = reqbyte;
5635
5636       /* If the character is more than one byte long, we can set firstbyte
5637       only if it is not to be matched caselessly. */
5638
5639       if (mclength == 1 || req_caseopt == 0)
5640         {
5641         firstbyte = mcbuffer[0] | req_caseopt;
5642         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5643         }
5644       else firstbyte = reqbyte = REQ_NONE;
5645       }
5646
5647     /* firstbyte was previously set; we can set reqbyte only if the length is
5648     1 or the matching is caseful. */
5649
5650     else
5651       {
5652       zerofirstbyte = firstbyte;
5653       zeroreqbyte = reqbyte;
5654       if (mclength == 1 || req_caseopt == 0)
5655         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5656       }
5657
5658     break;            /* End of literal character handling */
5659     }
5660   }                   /* end of big loop */
5661
5662
5663 /* Control never reaches here by falling through, only by a goto for all the
5664 error states. Pass back the position in the pattern so that it can be displayed
5665 to the user for diagnosing the error. */
5666
5667 FAILED:
5668 *ptrptr = ptr;
5669 return FALSE;
5670 }
5671
5672
5673
5674
5675 /*************************************************
5676 *     Compile sequence of alternatives           *
5677 *************************************************/
5678
5679 /* On entry, ptr is pointing past the bracket character, but on return it
5680 points to the closing bracket, or vertical bar, or end of string. The code
5681 variable is pointing at the byte into which the BRA operator has been stored.
5682 If the ims options are changed at the start (for a (?ims: group) or during any
5683 branch, we need to insert an OP_OPT item at the start of every following branch
5684 to ensure they get set correctly at run time, and also pass the new options
5685 into every subsequent branch compile.
5686
5687 This function is used during the pre-compile phase when we are trying to find
5688 out the amount of memory needed, as well as during the real compile phase. The
5689 value of lengthptr distinguishes the two phases.
5690
5691 Arguments:
5692   options        option bits, including any changes for this subpattern
5693   oldims         previous settings of ims option bits
5694   codeptr        -> the address of the current code pointer
5695   ptrptr         -> the address of the current pattern pointer
5696   errorcodeptr   -> pointer to error code variable
5697   lookbehind     TRUE if this is a lookbehind assertion
5698   reset_bracount TRUE to reset the count for each branch
5699   skipbytes      skip this many bytes at start (for brackets and OP_COND)
5700   firstbyteptr   place to put the first required character, or a negative number
5701   reqbyteptr     place to put the last required character, or a negative number
5702   bcptr          pointer to the chain of currently open branches
5703   cd             points to the data block with tables pointers etc.
5704   lengthptr      NULL during the real compile phase
5705                  points to length accumulator during pre-compile phase
5706
5707 Returns:         TRUE on success
5708 */
5709
5710 static BOOL
5711 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5712   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5713   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5714   int *lengthptr)
5715 {
5716 const uschar *ptr = *ptrptr;
5717 uschar *code = *codeptr;
5718 uschar *last_branch = code;
5719 uschar *start_bracket = code;
5720 uschar *reverse_count = NULL;
5721 open_capitem capitem;
5722 int capnumber = 0;
5723 int firstbyte, reqbyte;
5724 int branchfirstbyte, branchreqbyte;
5725 int length;
5726 int orig_bracount;
5727 int max_bracount;
5728 int old_external_options = cd->external_options;
5729 branch_chain bc;
5730
5731 bc.outer = bcptr;
5732 bc.current_branch = code;
5733
5734 firstbyte = reqbyte = REQ_UNSET;
5735
5736 /* Accumulate the length for use in the pre-compile phase. Start with the
5737 length of the BRA and KET and any extra bytes that are required at the
5738 beginning. We accumulate in a local variable to save frequent testing of
5739 lenthptr for NULL. We cannot do this by looking at the value of code at the
5740 start and end of each alternative, because compiled items are discarded during
5741 the pre-compile phase so that the work space is not exceeded. */
5742
5743 length = 2 + 2*LINK_SIZE + skipbytes;
5744
5745 /* WARNING: If the above line is changed for any reason, you must also change
5746 the code that abstracts option settings at the start of the pattern and makes
5747 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5748 pre-compile phase to find out whether anything has yet been compiled or not. */
5749
5750 /* If this is a capturing subpattern, add to the chain of open capturing items
5751 so that we can detect them if (*ACCEPT) is encountered. This is also used to
5752 detect groups that contain recursive back references to themselves. */
5753
5754 if (*code == OP_CBRA)
5755   {
5756   capnumber = GET2(code, 1 + LINK_SIZE);
5757   capitem.number = capnumber;
5758   capitem.next = cd->open_caps;
5759   capitem.flag = FALSE;
5760   cd->open_caps = &capitem;
5761   }
5762
5763 /* Offset is set zero to mark that this bracket is still open */
5764
5765 PUT(code, 1, 0);
5766 code += 1 + LINK_SIZE + skipbytes;
5767
5768 /* Loop for each alternative branch */
5769
5770 orig_bracount = max_bracount = cd->bracount;
5771 for (;;)
5772   {
5773   /* For a (?| group, reset the capturing bracket count so that each branch
5774   uses the same numbers. */
5775
5776   if (reset_bracount) cd->bracount = orig_bracount;
5777
5778   /* Handle a change of ims options at the start of the branch */
5779
5780   if ((options & PCRE_IMS) != oldims)
5781     {
5782     *code++ = OP_OPT;
5783     *code++ = options & PCRE_IMS;
5784     length += 2;
5785     }
5786
5787   /* Set up dummy OP_REVERSE if lookbehind assertion */
5788
5789   if (lookbehind)
5790     {
5791     *code++ = OP_REVERSE;
5792     reverse_count = code;
5793     PUTINC(code, 0, 0);
5794     length += 1 + LINK_SIZE;
5795     }
5796
5797   /* Now compile the branch; in the pre-compile phase its length gets added
5798   into the length. */
5799
5800   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5801         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5802     {
5803     *ptrptr = ptr;
5804     return FALSE;
5805     }
5806
5807   /* If the external options have changed during this branch, it means that we
5808   are at the top level, and a leading option setting has been encountered. We
5809   need to re-set the original option values to take account of this so that,
5810   during the pre-compile phase, we know to allow for a re-set at the start of
5811   subsequent branches. */
5812
5813   if (old_external_options != cd->external_options)
5814     oldims = cd->external_options & PCRE_IMS;
5815
5816   /* Keep the highest bracket count in case (?| was used and some branch
5817   has fewer than the rest. */
5818
5819   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5820
5821   /* In the real compile phase, there is some post-processing to be done. */
5822
5823   if (lengthptr == NULL)
5824     {
5825     /* If this is the first branch, the firstbyte and reqbyte values for the
5826     branch become the values for the regex. */
5827
5828     if (*last_branch != OP_ALT)
5829       {
5830       firstbyte = branchfirstbyte;
5831       reqbyte = branchreqbyte;
5832       }
5833
5834     /* If this is not the first branch, the first char and reqbyte have to
5835     match the values from all the previous branches, except that if the
5836     previous value for reqbyte didn't have REQ_VARY set, it can still match,
5837     and we set REQ_VARY for the regex. */
5838
5839     else
5840       {
5841       /* If we previously had a firstbyte, but it doesn't match the new branch,
5842       we have to abandon the firstbyte for the regex, but if there was
5843       previously no reqbyte, it takes on the value of the old firstbyte. */
5844
5845       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5846         {
5847         if (reqbyte < 0) reqbyte = firstbyte;
5848         firstbyte = REQ_NONE;
5849         }
5850
5851       /* If we (now or from before) have no firstbyte, a firstbyte from the
5852       branch becomes a reqbyte if there isn't a branch reqbyte. */
5853
5854       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5855           branchreqbyte = branchfirstbyte;
5856
5857       /* Now ensure that the reqbytes match */
5858
5859       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5860         reqbyte = REQ_NONE;
5861       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5862       }
5863
5864     /* If lookbehind, check that this branch matches a fixed-length string, and
5865     put the length into the OP_REVERSE item. Temporarily mark the end of the
5866     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5867     because there may be forward references that we can't check here. Set a
5868     flag to cause another lookbehind check at the end. Why not do it all at the
5869     end? Because common, erroneous checks are picked up here and the offset of
5870     the problem can be shown. */
5871
5872     if (lookbehind)
5873       {
5874       int fixed_length;
5875       *code = OP_END;
5876       fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5877       DPRINTF(("fixed length = %d\n", fixed_length));
5878       if (fixed_length == -3)
5879         {
5880         cd->check_lookbehind = TRUE;
5881         }
5882       else if (fixed_length < 0)
5883         {
5884         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5885         *ptrptr = ptr;
5886         return FALSE;
5887         }
5888       else { PUT(reverse_count, 0, fixed_length); }
5889       }
5890     }
5891
5892   /* Reached end of expression, either ')' or end of pattern. In the real
5893   compile phase, go back through the alternative branches and reverse the chain
5894   of offsets, with the field in the BRA item now becoming an offset to the
5895   first alternative. If there are no alternatives, it points to the end of the
5896   group. The length in the terminating ket is always the length of the whole
5897   bracketed item. If any of the ims options were changed inside the group,
5898   compile a resetting op-code following, except at the very end of the pattern.
5899   Return leaving the pointer at the terminating char. */
5900
5901   if (*ptr != CHAR_VERTICAL_LINE)
5902     {
5903     if (lengthptr == NULL)
5904       {
5905       int branch_length = code - last_branch;
5906       do
5907         {
5908         int prev_length = GET(last_branch, 1);
5909         PUT(last_branch, 1, branch_length);
5910         branch_length = prev_length;
5911         last_branch -= branch_length;
5912         }
5913       while (branch_length > 0);
5914       }
5915
5916     /* Fill in the ket */
5917
5918     *code = OP_KET;
5919     PUT(code, 1, code - start_bracket);
5920     code += 1 + LINK_SIZE;
5921
5922     /* If it was a capturing subpattern, check to see if it contained any
5923     recursive back references. If so, we must wrap it in atomic brackets.
5924     In any event, remove the block from the chain. */
5925
5926     if (capnumber > 0)
5927       {
5928       if (cd->open_caps->flag)
5929         {
5930         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
5931           code - start_bracket);
5932         *start_bracket = OP_ONCE;
5933         code += 1 + LINK_SIZE;
5934         PUT(start_bracket, 1, code - start_bracket);
5935         *code = OP_KET;
5936         PUT(code, 1, code - start_bracket);
5937         code += 1 + LINK_SIZE;
5938         length += 2 + 2*LINK_SIZE;
5939         }
5940       cd->open_caps = cd->open_caps->next;
5941       }
5942
5943     /* Reset options if needed. */
5944
5945     if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
5946       {
5947       *code++ = OP_OPT;
5948       *code++ = oldims;
5949       length += 2;
5950       }
5951
5952     /* Retain the highest bracket number, in case resetting was used. */
5953
5954     cd->bracount = max_bracount;
5955
5956     /* Set values to pass back */
5957
5958     *codeptr = code;
5959     *ptrptr = ptr;
5960     *firstbyteptr = firstbyte;
5961     *reqbyteptr = reqbyte;
5962     if (lengthptr != NULL)
5963       {
5964       if (OFLOW_MAX - *lengthptr < length)
5965         {
5966         *errorcodeptr = ERR20;
5967         return FALSE;
5968         }
5969       *lengthptr += length;
5970       }
5971     return TRUE;
5972     }
5973
5974   /* Another branch follows. In the pre-compile phase, we can move the code
5975   pointer back to where it was for the start of the first branch. (That is,
5976   pretend that each branch is the only one.)
5977
5978   In the real compile phase, insert an ALT node. Its length field points back
5979   to the previous branch while the bracket remains open. At the end the chain
5980   is reversed. It's done like this so that the start of the bracket has a
5981   zero offset until it is closed, making it possible to detect recursion. */
5982
5983   if (lengthptr != NULL)
5984     {
5985     code = *codeptr + 1 + LINK_SIZE + skipbytes;
5986     length += 1 + LINK_SIZE;
5987     }
5988   else
5989     {
5990     *code = OP_ALT;
5991     PUT(code, 1, code - last_branch);
5992     bc.current_branch = last_branch = code;
5993     code += 1 + LINK_SIZE;
5994     }
5995
5996   ptr++;
5997   }
5998 /* Control never reaches here */
5999 }
6000
6001
6002
6003
6004 /*************************************************
6005 *          Check for anchored expression         *
6006 *************************************************/
6007
6008 /* Try to find out if this is an anchored regular expression. Consider each
6009 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6010 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6011 it's anchored. However, if this is a multiline pattern, then only OP_SOD
6012 counts, since OP_CIRC can match in the middle.
6013
6014 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6015 This is the code for \G, which means "match at start of match position, taking
6016 into account the match offset".
6017
6018 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6019 because that will try the rest of the pattern at all possible matching points,
6020 so there is no point trying again.... er ....
6021
6022 .... except when the .* appears inside capturing parentheses, and there is a
6023 subsequent back reference to those parentheses. We haven't enough information
6024 to catch that case precisely.
6025
6026 At first, the best we could do was to detect when .* was in capturing brackets
6027 and the highest back reference was greater than or equal to that level.
6028 However, by keeping a bitmap of the first 31 back references, we can catch some
6029 of the more common cases more precisely.
6030
6031 Arguments:
6032   code           points to start of expression (the bracket)
6033   options        points to the options setting
6034   bracket_map    a bitmap of which brackets we are inside while testing; this
6035                   handles up to substring 31; after that we just have to take
6036                   the less precise approach
6037   backref_map    the back reference bitmap
6038
6039 Returns:     TRUE or FALSE
6040 */
6041
6042 static BOOL
6043 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6044   unsigned int backref_map)
6045 {
6046 do {
6047    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6048      options, PCRE_MULTILINE, FALSE);
6049    register int op = *scode;
6050
6051    /* Non-capturing brackets */
6052
6053    if (op == OP_BRA)
6054      {
6055      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6056      }
6057
6058    /* Capturing brackets */
6059
6060    else if (op == OP_CBRA)
6061      {
6062      int n = GET2(scode, 1+LINK_SIZE);
6063      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6064      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6065      }
6066
6067    /* Other brackets */
6068
6069    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6070      {
6071      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6072      }
6073
6074    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6075    it isn't in brackets that are or may be referenced. */
6076
6077    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6078              op == OP_TYPEPOSSTAR))
6079      {
6080      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6081        return FALSE;
6082      }
6083
6084    /* Check for explicit anchoring */
6085
6086    else if (op != OP_SOD && op != OP_SOM &&
6087            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6088      return FALSE;
6089    code += GET(code, 1);
6090    }
6091 while (*code == OP_ALT);   /* Loop for each alternative */
6092 return TRUE;
6093 }
6094
6095
6096
6097 /*************************************************
6098 *         Check for starting with ^ or .*        *
6099 *************************************************/
6100
6101 /* This is called to find out if every branch starts with ^ or .* so that
6102 "first char" processing can be done to speed things up in multiline
6103 matching and for non-DOTALL patterns that start with .* (which must start at
6104 the beginning or after \n). As in the case of is_anchored() (see above), we
6105 have to take account of back references to capturing brackets that contain .*
6106 because in that case we can't make the assumption.
6107
6108 Arguments:
6109   code           points to start of expression (the bracket)
6110   bracket_map    a bitmap of which brackets we are inside while testing; this
6111                   handles up to substring 31; after that we just have to take
6112                   the less precise approach
6113   backref_map    the back reference bitmap
6114
6115 Returns:         TRUE or FALSE
6116 */
6117
6118 static BOOL
6119 is_startline(const uschar *code, unsigned int bracket_map,
6120   unsigned int backref_map)
6121 {
6122 do {
6123    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6124      NULL, 0, FALSE);
6125    register int op = *scode;
6126
6127    /* If we are at the start of a conditional assertion group, *both* the
6128    conditional assertion *and* what follows the condition must satisfy the test
6129    for start of line. Other kinds of condition fail. Note that there may be an
6130    auto-callout at the start of a condition. */
6131
6132    if (op == OP_COND)
6133      {
6134      scode += 1 + LINK_SIZE;
6135      if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6136      switch (*scode)
6137        {
6138        case OP_CREF:
6139        case OP_NCREF:
6140        case OP_RREF:
6141        case OP_NRREF:
6142        case OP_DEF:
6143        return FALSE;
6144
6145        default:     /* Assertion */
6146        if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6147        do scode += GET(scode, 1); while (*scode == OP_ALT);
6148        scode += 1 + LINK_SIZE;
6149        break;
6150        }
6151      scode = first_significant_code(scode, NULL, 0, FALSE);
6152      op = *scode;
6153      }
6154
6155    /* Non-capturing brackets */
6156
6157    if (op == OP_BRA)
6158      {
6159      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6160      }
6161
6162    /* Capturing brackets */
6163
6164    else if (op == OP_CBRA)
6165      {
6166      int n = GET2(scode, 1+LINK_SIZE);
6167      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6168      if (!is_startline(scode, new_map, backref_map)) return FALSE;
6169      }
6170
6171    /* Other brackets */
6172
6173    else if (op == OP_ASSERT || op == OP_ONCE)
6174      {
6175      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6176      }
6177
6178    /* .* means "start at start or after \n" if it isn't in brackets that
6179    may be referenced. */
6180
6181    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6182      {
6183      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6184      }
6185
6186    /* Check for explicit circumflex */
6187
6188    else if (op != OP_CIRC) return FALSE;
6189
6190    /* Move on to the next alternative */
6191
6192    code += GET(code, 1);
6193    }
6194 while (*code == OP_ALT);  /* Loop for each alternative */
6195 return TRUE;
6196 }
6197
6198
6199
6200 /*************************************************
6201 *       Check for asserted fixed first char      *
6202 *************************************************/
6203
6204 /* During compilation, the "first char" settings from forward assertions are
6205 discarded, because they can cause conflicts with actual literals that follow.
6206 However, if we end up without a first char setting for an unanchored pattern,
6207 it is worth scanning the regex to see if there is an initial asserted first
6208 char. If all branches start with the same asserted char, or with a bracket all
6209 of whose alternatives start with the same asserted char (recurse ad lib), then
6210 we return that char, otherwise -1.
6211
6212 Arguments:
6213   code       points to start of expression (the bracket)
6214   options    pointer to the options (used to check casing changes)
6215   inassert   TRUE if in an assertion
6216
6217 Returns:     -1 or the fixed first char
6218 */
6219
6220 static int
6221 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6222 {
6223 register int c = -1;
6224 do {
6225    int d;
6226    const uschar *scode =
6227      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6228    register int op = *scode;
6229
6230    switch(op)
6231      {
6232      default:
6233      return -1;
6234
6235      case OP_BRA:
6236      case OP_CBRA:
6237      case OP_ASSERT:
6238      case OP_ONCE:
6239      case OP_COND:
6240      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6241        return -1;
6242      if (c < 0) c = d; else if (c != d) return -1;
6243      break;
6244
6245      case OP_EXACT:       /* Fall through */
6246      scode += 2;
6247
6248      case OP_CHAR:
6249      case OP_CHARNC:
6250      case OP_PLUS:
6251      case OP_MINPLUS:
6252      case OP_POSPLUS:
6253      if (!inassert) return -1;
6254      if (c < 0)
6255        {
6256        c = scode[1];
6257        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6258        }
6259      else if (c != scode[1]) return -1;
6260      break;
6261      }
6262
6263    code += GET(code, 1);
6264    }
6265 while (*code == OP_ALT);
6266 return c;
6267 }
6268
6269
6270
6271 /*************************************************
6272 *        Compile a Regular Expression            *
6273 *************************************************/
6274
6275 /* This function takes a string and returns a pointer to a block of store
6276 holding a compiled version of the expression. The original API for this
6277 function had no error code return variable; it is retained for backwards
6278 compatibility. The new function is given a new name.
6279
6280 Arguments:
6281   pattern       the regular expression
6282   options       various option bits
6283   errorcodeptr  pointer to error code variable (pcre_compile2() only)
6284                   can be NULL if you don't want a code value
6285   errorptr      pointer to pointer to error text
6286   erroroffset   ptr offset in pattern where error was detected
6287   tables        pointer to character tables or NULL
6288
6289 Returns:        pointer to compiled data block, or NULL on error,
6290                 with errorptr and erroroffset set
6291 */
6292
6293 #ifdef NOT_USED_IN_GLIB
6294
6295 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6296 pcre_compile(const char *pattern, int options, const char **errorptr,
6297   int *erroroffset, const unsigned char *tables)
6298 {
6299 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6300 }
6301
6302 #endif
6303
6304 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6305 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6306   const char **errorptr, int *erroroffset, const unsigned char *tables)
6307 {
6308 real_pcre *re;
6309 int length = 1;  /* For final END opcode */
6310 int firstbyte, reqbyte, newline;
6311 int errorcode = 0;
6312 int skipatstart = 0;
6313 BOOL utf8 = (options & PCRE_UTF8) != 0;
6314 size_t size;
6315 uschar *code;
6316 const uschar *codestart;
6317 const uschar *ptr;
6318 compile_data compile_block;
6319 compile_data *cd = &compile_block;
6320
6321 /* This space is used for "compiling" into during the first phase, when we are
6322 computing the amount of memory that is needed. Compiled items are thrown away
6323 as soon as possible, so that a fairly large buffer should be sufficient for
6324 this purpose. The same space is used in the second phase for remembering where
6325 to fill in forward references to subpatterns. */
6326
6327 uschar cworkspace[COMPILE_WORK_SIZE];
6328
6329 /* Set this early so that early errors get offset 0. */
6330
6331 ptr = (const uschar *)pattern;
6332
6333 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6334 can do is just return NULL, but we can set a code value if there is a code
6335 pointer. */
6336
6337 if (errorptr == NULL)
6338   {
6339   if (errorcodeptr != NULL) *errorcodeptr = 99;
6340   return NULL;
6341   }
6342
6343 *errorptr = NULL;
6344 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6345
6346 /* However, we can give a message for this error */
6347
6348 if (erroroffset == NULL)
6349   {
6350   errorcode = ERR16;
6351   goto PCRE_EARLY_ERROR_RETURN2;
6352   }
6353
6354 *erroroffset = 0;
6355
6356 /* Set up pointers to the individual character tables */
6357
6358 if (tables == NULL) tables = _pcre_default_tables;
6359 cd->lcc = tables + lcc_offset;
6360 cd->fcc = tables + fcc_offset;
6361 cd->cbits = tables + cbits_offset;
6362 cd->ctypes = tables + ctypes_offset;
6363
6364 /* Check that all undefined public option bits are zero */
6365
6366 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6367   {
6368   errorcode = ERR17;
6369   goto PCRE_EARLY_ERROR_RETURN;
6370   }
6371
6372 /* Check for global one-time settings at the start of the pattern, and remember
6373 the offset for later. */
6374
6375 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6376        ptr[skipatstart+1] == CHAR_ASTERISK)
6377   {
6378   int newnl = 0;
6379   int newbsr = 0;
6380
6381   if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6382     { skipatstart += 7; options |= PCRE_UTF8; continue; }
6383
6384   if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6385     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6386   else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
6387     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6388   else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
6389     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6390   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6391     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6392   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6393     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6394
6395   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6396     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6397   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6398     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6399
6400   if (newnl != 0)
6401     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6402   else if (newbsr != 0)
6403     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6404   else break;
6405   }
6406
6407 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6408
6409 #ifdef SUPPORT_UTF8
6410 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6411      (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6412   {
6413   errorcode = ERR44;
6414   goto PCRE_EARLY_ERROR_RETURN2;
6415   }
6416 #else
6417 if (utf8)
6418   {
6419   errorcode = ERR32;
6420   goto PCRE_EARLY_ERROR_RETURN;
6421   }
6422 #endif
6423
6424 /* Check validity of \R options. */
6425
6426 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6427   {
6428   case 0:
6429   case PCRE_BSR_ANYCRLF:
6430   case PCRE_BSR_UNICODE:
6431   break;
6432   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6433   }
6434
6435 /* Handle different types of newline. The three bits give seven cases. The
6436 current code allows for fixed one- or two-byte sequences, plus "any" and
6437 "anycrlf". */
6438
6439 switch (options & PCRE_NEWLINE_BITS)
6440   {
6441   case 0: newline = NEWLINE; break;   /* Build-time default */
6442   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6443   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6444   case PCRE_NEWLINE_CR+
6445        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6446   case PCRE_NEWLINE_ANY: newline = -1; break;
6447   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6448   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6449   }
6450
6451 if (newline == -2)
6452   {
6453   cd->nltype = NLTYPE_ANYCRLF;
6454   }
6455 else if (newline < 0)
6456   {
6457   cd->nltype = NLTYPE_ANY;
6458   }
6459 else
6460   {
6461   cd->nltype = NLTYPE_FIXED;
6462   if (newline > 255)
6463     {
6464     cd->nllen = 2;
6465     cd->nl[0] = (newline >> 8) & 255;
6466     cd->nl[1] = newline & 255;
6467     }
6468   else
6469     {
6470     cd->nllen = 1;
6471     cd->nl[0] = newline;
6472     }
6473   }
6474
6475 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6476 references to help in deciding whether (.*) can be treated as anchored or not.
6477 */
6478
6479 cd->top_backref = 0;
6480 cd->backref_map = 0;
6481
6482 /* Reflect pattern for debugging output */
6483
6484 DPRINTF(("------------------------------------------------------------------\n"));
6485 DPRINTF(("%s\n", pattern));
6486
6487 /* Pretend to compile the pattern while actually just accumulating the length
6488 of memory required. This behaviour is triggered by passing a non-NULL final
6489 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6490 to compile parts of the pattern into; the compiled code is discarded when it is
6491 no longer needed, so hopefully this workspace will never overflow, though there
6492 is a test for its doing so. */
6493
6494 cd->bracount = cd->final_bracount = 0;
6495 cd->names_found = 0;
6496 cd->name_entry_size = 0;
6497 cd->name_table = NULL;
6498 cd->start_workspace = cworkspace;
6499 cd->start_code = cworkspace;
6500 cd->hwm = cworkspace;
6501 cd->start_pattern = (const uschar *)pattern;
6502 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6503 cd->req_varyopt = 0;
6504 cd->external_options = options;
6505 cd->external_flags = 0;
6506 cd->open_caps = NULL;
6507
6508 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6509 don't need to look at the result of the function here. The initial options have
6510 been put into the cd block so that they can be changed if an option setting is
6511 found within the regex right at the beginning. Bringing initial option settings
6512 outside can help speed up starting point checks. */
6513
6514 ptr += skipatstart;
6515 code = cworkspace;
6516 *code = OP_BRA;
6517 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6518   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6519   &length);
6520 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6521
6522 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6523   cd->hwm - cworkspace));
6524
6525 if (length > MAX_PATTERN_SIZE)
6526   {
6527   errorcode = ERR20;
6528   goto PCRE_EARLY_ERROR_RETURN;
6529   }
6530
6531 /* Compute the size of data block needed and get it, either from malloc or
6532 externally provided function. Integer overflow should no longer be possible
6533 because nowadays we limit the maximum value of cd->names_found and
6534 cd->name_entry_size. */
6535
6536 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6537 re = (real_pcre *)(pcre_malloc)(size);
6538
6539 if (re == NULL)
6540   {
6541   errorcode = ERR21;
6542   goto PCRE_EARLY_ERROR_RETURN;
6543   }
6544
6545 /* Put in the magic number, and save the sizes, initial options, internal
6546 flags, and character table pointer. NULL is used for the default character
6547 tables. The nullpad field is at the end; it's there to help in the case when a
6548 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6549 pointers. */
6550
6551 re->magic_number = MAGIC_NUMBER;
6552 re->size = size;
6553 re->options = cd->external_options;
6554 re->flags = cd->external_flags;
6555 re->dummy1 = 0;
6556 re->first_byte = 0;
6557 re->req_byte = 0;
6558 re->name_table_offset = sizeof(real_pcre);
6559 re->name_entry_size = cd->name_entry_size;
6560 re->name_count = cd->names_found;
6561 re->ref_count = 0;
6562 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6563 re->nullpad = NULL;
6564
6565 /* The starting points of the name/number translation table and of the code are
6566 passed around in the compile data block. The start/end pattern and initial
6567 options are already set from the pre-compile phase, as is the name_entry_size
6568 field. Reset the bracket count and the names_found field. Also reset the hwm
6569 field; this time it's used for remembering forward references to subpatterns.
6570 */
6571
6572 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
6573 cd->bracount = 0;
6574 cd->names_found = 0;
6575 cd->name_table = (uschar *)re + re->name_table_offset;
6576 codestart = cd->name_table + re->name_entry_size * re->name_count;
6577 cd->start_code = codestart;
6578 cd->hwm = cworkspace;
6579 cd->req_varyopt = 0;
6580 cd->had_accept = FALSE;
6581 cd->check_lookbehind = FALSE;
6582 cd->open_caps = NULL;
6583
6584 /* Set up a starting, non-extracting bracket, then compile the expression. On
6585 error, errorcode will be set non-zero, so we don't need to look at the result
6586 of the function here. */
6587
6588 ptr = (const uschar *)pattern + skipatstart;
6589 code = (uschar *)codestart;
6590 *code = OP_BRA;
6591 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6592   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6593 re->top_bracket = cd->bracount;
6594 re->top_backref = cd->top_backref;
6595 re->flags = cd->external_flags;
6596
6597 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6598
6599 /* If not reached end of pattern on success, there's an excess bracket. */
6600
6601 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6602
6603 /* Fill in the terminating state and check for disastrous overflow, but
6604 if debugging, leave the test till after things are printed out. */
6605
6606 *code++ = OP_END;
6607
6608 #ifndef PCRE_DEBUG
6609 if (code - codestart > length) errorcode = ERR23;
6610 #endif
6611
6612 /* Fill in any forward references that are required. */
6613
6614 while (errorcode == 0 && cd->hwm > cworkspace)
6615   {
6616   int offset, recno;
6617   const uschar *groupptr;
6618   cd->hwm -= LINK_SIZE;
6619   offset = GET(cd->hwm, 0);
6620   recno = GET(codestart, offset);
6621   groupptr = _pcre_find_bracket(codestart, utf8, recno);
6622   if (groupptr == NULL) errorcode = ERR53;
6623     else PUT(((uschar *)codestart), offset, groupptr - codestart);
6624   }
6625
6626 /* Give an error if there's back reference to a non-existent capturing
6627 subpattern. */
6628
6629 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6630
6631 /* If there were any lookbehind assertions that contained OP_RECURSE
6632 (recursions or subroutine calls), a flag is set for them to be checked here,
6633 because they may contain forward references. Actual recursions can't be fixed
6634 length, but subroutine calls can. It is done like this so that those without
6635 OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
6636 exceptional ones forgo this. We scan the pattern to check that they are fixed
6637 length, and set their lengths. */
6638
6639 if (cd->check_lookbehind)
6640   {
6641   uschar *cc = (uschar *)codestart;
6642
6643   /* Loop, searching for OP_REVERSE items, and process those that do not have
6644   their length set. (Actually, it will also re-process any that have a length
6645   of zero, but that is a pathological case, and it does no harm.) When we find
6646   one, we temporarily terminate the branch it is in while we scan it. */
6647
6648   for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
6649        cc != NULL;
6650        cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
6651     {
6652     if (GET(cc, 1) == 0)
6653       {
6654       int fixed_length;
6655       uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
6656       int end_op = *be;
6657       *be = OP_END;
6658       fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
6659       *be = end_op;
6660       DPRINTF(("fixed length = %d\n", fixed_length));
6661       if (fixed_length < 0)
6662         {
6663         errorcode = (fixed_length == -2)? ERR36 : ERR25;
6664         break;
6665         }
6666       PUT(cc, 1, fixed_length);
6667       }
6668     cc += 1 + LINK_SIZE;
6669     }
6670   }
6671
6672 /* Failed to compile, or error while post-processing */
6673
6674 if (errorcode != 0)
6675   {
6676   (pcre_free)(re);
6677   PCRE_EARLY_ERROR_RETURN:
6678   *erroroffset = ptr - (const uschar *)pattern;
6679   PCRE_EARLY_ERROR_RETURN2:
6680   *errorptr = find_error_text(errorcode);
6681   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6682   return NULL;
6683   }
6684
6685 /* If the anchored option was not passed, set the flag if we can determine that
6686 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6687 as starting with .* when DOTALL is set).
6688
6689 Otherwise, if we know what the first byte has to be, save it, because that
6690 speeds up unanchored matches no end. If not, see if we can set the
6691 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6692 start with ^. and also when all branches start with .* for non-DOTALL matches.
6693 */
6694
6695 if ((re->options & PCRE_ANCHORED) == 0)
6696   {
6697   int temp_options = re->options;   /* May get changed during these scans */
6698   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6699     re->options |= PCRE_ANCHORED;
6700   else
6701     {
6702     if (firstbyte < 0)
6703       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6704     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
6705       {
6706       int ch = firstbyte & 255;
6707       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6708          cd->fcc[ch] == ch)? ch : firstbyte;
6709       re->flags |= PCRE_FIRSTSET;
6710       }
6711     else if (is_startline(codestart, 0, cd->backref_map))
6712       re->flags |= PCRE_STARTLINE;
6713     }
6714   }
6715
6716 /* For an anchored pattern, we use the "required byte" only if it follows a
6717 variable length item in the regex. Remove the caseless flag for non-caseable
6718 bytes. */
6719
6720 if (reqbyte >= 0 &&
6721      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6722   {
6723   int ch = reqbyte & 255;
6724   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6725     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6726   re->flags |= PCRE_REQCHSET;
6727   }
6728
6729 /* Print out the compiled data if debugging is enabled. This is never the
6730 case when building a production library. */
6731
6732 #ifdef PCRE_DEBUG
6733 printf("Length = %d top_bracket = %d top_backref = %d\n",
6734   length, re->top_bracket, re->top_backref);
6735
6736 printf("Options=%08x\n", re->options);
6737
6738 if ((re->flags & PCRE_FIRSTSET) != 0)
6739   {
6740   int ch = re->first_byte & 255;
6741   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6742     "" : " (caseless)";
6743   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6744     else printf("First char = \\x%02x%s\n", ch, caseless);
6745   }
6746
6747 if ((re->flags & PCRE_REQCHSET) != 0)
6748   {
6749   int ch = re->req_byte & 255;
6750   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6751     "" : " (caseless)";
6752   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6753     else printf("Req char = \\x%02x%s\n", ch, caseless);
6754   }
6755
6756 pcre_printint(re, stdout, TRUE);
6757
6758 /* This check is done here in the debugging case so that the code that
6759 was compiled can be seen. */
6760
6761 if (code - codestart > length)
6762   {
6763   (pcre_free)(re);
6764   *errorptr = find_error_text(ERR23);
6765   *erroroffset = ptr - (uschar *)pattern;
6766   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6767   return NULL;
6768   }
6769 #endif   /* PCRE_DEBUG */
6770
6771 return (pcre *)re;
6772 }
6773
6774 /* End of pcre_compile.c */