pcre_compile.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2010 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_compile(), along with
  42 supporting internal functions that are not used by other modules. */
  43
  44
  45 #ifdef HAVE_CONFIG_H
  46 #include "config.h"
  47 #endif
  48
  49 #define NLBLOCK cd             /* Block containing newline information */
  50 #define PSSTART start_pattern  /* Field containing processed string start */
  51 #define PSEND   end_pattern    /* Field containing processed string end */
  52
  53 #include "pcre_internal.h"
  54
  55
  56 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
  57 also used by pcretest. PCRE_DEBUG is not defined when building a production
  58 library. */
  59
  60 #ifdef PCRE_DEBUG
  61 #include "pcre_printint.src"
  62 #endif
  63
  64
  65 /* Macro for setting individual bits in class bitmaps. */
  66
  67 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
  68
  69 /* Maximum length value to check against when making sure that the integer that
  70 holds the compiled pattern length does not overflow. We make it a bit less than
  71 INT_MAX to allow for adding in group terminating bytes, so that we don't have
  72 to check them every time. */
  73
  74 #define OFLOW_MAX (INT_MAX - 20)
  75
  76
  77 /*************************************************
  78 *      Code parameters and static tables         *
  79 *************************************************/
  80
  81 /* This value specifies the size of stack workspace that is used during the
  82 first pre-compile phase that determines how much memory is required. The regex
  83 is partly compiled into this space, but the compiled parts are discarded as
  84 soon as they can be, so that hopefully there will never be an overrun. The code
  85 does, however, check for an overrun. The largest amount I've seen used is 218,
  86 so this number is very generous.
  87
  88 The same workspace is used during the second, actual compile phase for
  89 remembering forward references to groups so that they can be filled in at the
  90 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
  91 is 4 there is plenty of room. */
  92
  93 #define COMPILE_WORK_SIZE (4096)
  94
  95 /* The overrun tests check for a slightly smaller size so that they detect the
  96 overrun before it actually does run off the end of the data block. */
  97
  98 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
  99
 100
 101 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 102 are simple data values; negative values are for special things like \d and so
 103 on. Zero means further processing is needed (for things like \x), or the escape
 104 is invalid. */
 105
 106 #ifndef EBCDIC
 107
  108 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
  109 in UTF-8 mode. It is indexed by (c - CHAR_0), as in check_escape(). The comment
      at the end of each row names the characters to which its entries apply. */
  110
  111 static const short int escapes[] = {
  112      0,                       0,                          /* 0 1 */
  113      0,                       0,                          /* 2 3 */
  114      0,                       0,                          /* 4 5 */
  115      0,                       0,                          /* 6 7 */
  116      0,                       0,                          /* 8 9 */
  117      CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
  118      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
  119      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
  120      CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
  121      -ESC_B,                  -ESC_C,                     /* B C */
  122      -ESC_D,                  -ESC_E,                     /* D E */
  123      0,                       -ESC_G,                     /* F G */
  124      -ESC_H,                  0,                          /* H I */
  125      0,                       -ESC_K,                     /* J K */
  126      0,                       0,                          /* L M */
  127      -ESC_N,                  0,                          /* N O */
  128      -ESC_P,                  -ESC_Q,                     /* P Q */
  129      -ESC_R,                  -ESC_S,                     /* R S */
  130      0,                       0,                          /* T U */
  131      -ESC_V,                  -ESC_W,                     /* V W */
  132      -ESC_X,                  0,                          /* X Y */
  133      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
  134      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* \ ] */
  135      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
  136      CHAR_GRAVE_ACCENT,       7,                          /* ` a */
  137      -ESC_b,                  0,                          /* b c */
  138      -ESC_d,                  ESC_e,                      /* d e */
  139      ESC_f,                   0,                          /* f g */
  140      -ESC_h,                  0,                          /* h i */
  141      0,                       -ESC_k,                     /* j k */
  142      0,                       0,                          /* l m */
  143      ESC_n,                   0,                          /* n o */
  144      -ESC_p,                  0,                          /* p q */
  145      ESC_r,                   -ESC_s,                     /* r s */
  146      ESC_tee,                 0,                          /* t u */
  147      -ESC_v,                  -ESC_w,                     /* v w */
  148      0,                       0,                          /* x y */
  149      -ESC_z                                               /* z */
  150 };
 151
 152 #else
 153
  154 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. It
      is indexed by (c - 0x48), as in check_escape(). The hexadecimal number in the
      comment at the start of each row is the character code to which the row's
      first entry applies. */
  155
  156 static const short int escapes[] = {
  157 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
  158 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
  159 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
  160 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
  161 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
  162 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
  163 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
  164 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
  165 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
  166 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
  167 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
  168 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
  169 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
  170 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
  171 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
  172 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
  173 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
  174 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
  175 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
  176 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
  177 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
  178 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
  179 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
  180 };
 181 #endif
 182
 183
 184 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
 185 searched linearly. Put all the names into a single string, in order to reduce
 186 the number of relocations when a shared library is dynamically linked. The
 187 string is built from string macros so that it works in UTF-8 mode on EBCDIC
 188 platforms. */
 189
  190 typedef struct verbitem {    /* Describes one verb name held in verbnames */
  191   int   len;                 /* Length of verb name */
  192   int   op;                  /* Op when no arg, or -1 if arg mandatory */
  193   int   op_arg;              /* Op when arg present, or -1 if not allowed */
  194 } verbitem;
  195
  196 static const char verbnames[] =
  197   "\0"                       /* Empty name is a shorthand for MARK */
  198   STRING_MARK0
  199   STRING_ACCEPT0
  200   STRING_COMMIT0
  201   STRING_F0
  202   STRING_FAIL0
  203   STRING_PRUNE0
  204   STRING_SKIP0
  205   STRING_THEN;
  206
      /* The entries below are in the same order as the names in verbnames; the
      first field of each entry matches the length of the corresponding name. */
  207 static const verbitem verbs[] = {
  208   { 0, -1,        OP_MARK },        /* (empty name) */
  209   { 4, -1,        OP_MARK },        /* MARK */
  210   { 6, OP_ACCEPT, -1 },             /* ACCEPT */
  211   { 6, OP_COMMIT, -1 },             /* COMMIT */
  212   { 1, OP_FAIL,   -1 },             /* F */
  213   { 4, OP_FAIL,   -1 },             /* FAIL */
  214   { 5, OP_PRUNE,  OP_PRUNE_ARG },   /* PRUNE */
  215   { 4, OP_SKIP,   OP_SKIP_ARG  },   /* SKIP */
  216   { 4, OP_THEN,   OP_THEN_ARG  }    /* THEN */
  217 };
  218
  219 static const int verbcount = sizeof(verbs)/sizeof(verbitem);  /* Entries in verbs[] */
 220
 221
 222 /* Tables of names of POSIX character classes and their lengths. The names are
 223 now all in a single string, to reduce the number of relocations when a shared
 224 library is dynamically loaded. The list of lengths is terminated by a zero
 225 length entry. The first three must be alpha, lower, upper, as this is assumed
 226 for handling case independence. */
 227
  228 static const char posix_names[] =
  229   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  230   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  231   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  232   STRING_word0  STRING_xdigit;
  233
      /* One length per name above, in the same order: twelve names of length 5
      (alpha to space), then word (4) and xdigit (6). The final 0 ends the list. */
  234 static const uschar posix_name_lengths[] = {
  235   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 236
 237 /* Table of class bit maps for each POSIX class. Each class is formed from a
 238 base map, with an optional addition or removal of another map. Then, for some
 239 classes, there is some additional tweaking: for [:blank:] the vertical space
 240 characters are removed, and for [:alpha:] and [:alnum:] the underscore
 241 character is removed. The triples in the table consist of the base map offset,
 242 second map offset or -1 if no second map, and a non-negative value for map
 243 addition or a negative value for map subtraction (if there are two maps). The
 244 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
 245 remove vertical space characters, 2 => remove underscore. */
 246
 247 static const int posix_class_maps[] = {
 248   cbit_word,  cbit_digit, -2,             /* alpha */
 249   cbit_lower, -1,          0,             /* lower */
 250   cbit_upper, -1,          0,             /* upper */
 251   cbit_word,  -1,          2,             /* alnum - word without underscore */
 252   cbit_print, cbit_cntrl,  0,             /* ascii */
 253   cbit_space, -1,          1,             /* blank - a GNU extension */
 254   cbit_cntrl, -1,          0,             /* cntrl */
 255   cbit_digit, -1,          0,             /* digit */
 256   cbit_graph, -1,          0,             /* graph */
 257   cbit_print, -1,          0,             /* print */
 258   cbit_punct, -1,          0,             /* punct */
 259   cbit_space, -1,          0,             /* space */
 260   cbit_word,  -1,          0,             /* word - a Perl extension */
 261   cbit_xdigit,-1,          0              /* xdigit */
 262 };
 263
 264 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
 265 substitutes must be in the order of the names, defined above, and there are
 266 both positive and negative cases. NULL means no substitute. */
 267
 268 #ifdef SUPPORT_UCP
 269 static const uschar *substitutes[] = {
 270   (uschar *)"\\P{Nd}",    /* \D */
 271   (uschar *)"\\p{Nd}",    /* \d */
 272   (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */
 273   (uschar *)"\\p{Xsp}",   /* \s */
 274   (uschar *)"\\P{Xwd}",   /* \W */
 275   (uschar *)"\\p{Xwd}"    /* \w */
 276 };
 277
 278 static const uschar *posix_substitutes[] = {
 279   (uschar *)"\\p{L}",     /* alpha */
 280   (uschar *)"\\p{Ll}",    /* lower */
 281   (uschar *)"\\p{Lu}",    /* upper */
 282   (uschar *)"\\p{Xan}",   /* alnum */
 283   NULL,                   /* ascii */
 284   (uschar *)"\\h",        /* blank */
 285   NULL,                   /* cntrl */
 286   (uschar *)"\\p{Nd}",    /* digit */
 287   NULL,                   /* graph */
 288   NULL,                   /* print */
 289   NULL,                   /* punct */
 290   (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */
 291   (uschar *)"\\p{Xwd}",   /* word */
 292   NULL,                   /* xdigit */
 293   /* Negated cases */
 294   (uschar *)"\\P{L}",     /* ^alpha */
 295   (uschar *)"\\P{Ll}",    /* ^lower */
 296   (uschar *)"\\P{Lu}",    /* ^upper */
 297   (uschar *)"\\P{Xan}",   /* ^alnum */
 298   NULL,                   /* ^ascii */
 299   (uschar *)"\\H",        /* ^blank */
 300   NULL,                   /* ^cntrl */
 301   (uschar *)"\\P{Nd}",    /* ^digit */
 302   NULL,                   /* ^graph */
 303   NULL,                   /* ^print */
 304   NULL,                   /* ^punct */
 305   (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */
 306   (uschar *)"\\P{Xwd}",   /* ^word */
 307   NULL                    /* ^xdigit */
 308 };
 309 #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
 310 #endif
 311
 312 #define STRING(a)  # a
 313 #define XSTRING(s) STRING(s)
 314
 315 /* The texts of compile-time error messages. These are "char *" because they
 316 are passed to the outside world. Do not ever re-use any error number, because
 317 they are documented. Always add a new error instead. Messages marked DEAD below
 318 are no longer used. This used to be a table of strings, but in order to reduce
 319 the number of relocations needed when a shared library is loaded dynamically,
 320 it is now one long string. We cannot use a table of offsets, because the
 321 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
 322 simply count through to the one we want - this isn't a performance issue
 323 because these strings are used only when there is a compilation error.
 324
 325 Each substring ends with \0 to insert a null character. This includes the final
 326 substring, so that the whole string ends with \0\0, which can be detected when
 327 counting through. */
 328
 329 static const char error_texts[] =
 330   "no error\0"
 331   "\\ at end of pattern\0"
 332   "\\c at end of pattern\0"
 333   "unrecognized character follows \\\0"
 334   "numbers out of order in {} quantifier\0"
 335   /* 5 */
 336   "number too big in {} quantifier\0"
 337   "missing terminating ] for character class\0"
 338   "invalid escape sequence in character class\0"
 339   "range out of order in character class\0"
 340   "nothing to repeat\0"
 341   /* 10 */
 342   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
 343   "internal error: unexpected repeat\0"
 344   "unrecognized character after (? or (?-\0"
 345   "POSIX named classes are supported only within a class\0"
 346   "missing )\0"
 347   /* 15 */
 348   "reference to non-existent subpattern\0"
 349   "erroffset passed as NULL\0"
 350   "unknown option bit(s) set\0"
 351   "missing ) after comment\0"
 352   "parentheses nested too deeply\0"  /** DEAD **/
 353   /* 20 */
 354   "regular expression is too large\0"
 355   "failed to get memory\0"
 356   "unmatched parentheses\0"
 357   "internal error: code overflow\0"
 358   "unrecognized character after (?<\0"
 359   /* 25 */
 360   "lookbehind assertion is not fixed length\0"
 361   "malformed number or name after (?(\0"
 362   "conditional group contains more than two branches\0"
 363   "assertion expected after (?(\0"
 364   "(?R or (?[+-]digits must be followed by )\0"
 365   /* 30 */
 366   "unknown POSIX class name\0"
 367   "POSIX collating elements are not supported\0"
 368   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
 369   "spare error\0"  /** DEAD **/
 370   "character value in \\x{...} sequence is too large\0"
 371   /* 35 */
 372   "invalid condition (?(0)\0"
 373   "\\C not allowed in lookbehind assertion\0"
 374   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
 375   "number after (?C is > 255\0"
 376   "closing ) for (?C expected\0"
 377   /* 40 */
 378   "recursive call could loop indefinitely\0"
 379   "unrecognized character after (?P\0"
 380   "syntax error in subpattern name (missing terminator)\0"
 381   "two named subpatterns have the same name\0"
 382   "invalid UTF-8 string\0"
 383   /* 45 */
 384   "support for \\P, \\p, and \\X has not been compiled\0"
 385   "malformed \\P or \\p sequence\0"
 386   "unknown property name after \\P or \\p\0"
 387   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
 388   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
 389   /* 50 */
 390   "repeated subpattern is too long\0"    /** DEAD **/
 391   "octal value is greater than \\377 (not in UTF-8 mode)\0"
 392   "internal error: overran compiling workspace\0"
 393   "internal error: previously-checked referenced subpattern not found\0"
 394   "DEFINE group contains more than one branch\0"
 395   /* 55 */
 396   "repeating a DEFINE group is not allowed\0"
 397   "inconsistent NEWLINE options\0"
 398   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
 399   "a numbered reference must not be zero\0"
 400   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
 401   /* 60 */
 402   "(*VERB) not recognized\0"
 403   "number is too big\0"
 404   "subpattern name expected\0"
 405   "digit expected after (?+\0"
 406   "] is an invalid data character in JavaScript compatibility mode\0"
 407   /* 65 */
 408   "different names for subpatterns of the same number are not allowed\0"
 409   "(*MARK) must have an argument\0"
 410   "this version of PCRE is not compiled with PCRE_UCP support\0"
 411   "\\c must be followed by an ASCII character\0"
 412   ;
 413
 414 /* Table to identify digits and hex digits. This is used when compiling
 415 patterns. Note that the tables in chartables are dependent on the locale, and
 416 may mark arbitrary characters as digits - but the PCRE compiling code expects
 417 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
 418 a private table here. It costs 256 bytes, but it is a lot faster than doing
 419 character value tests (at least in some simple cases I timed), and in some
 420 applications one wants PCRE to compile efficiently as well as match
 421 efficiently.
 422
 423 For convenience, we use the same bit definitions as in chartables:
 424
 425   0x04   decimal digit
 426   0x08   hexadecimal digit
 427
 428 Then we can use ctype_digit and ctype_xdigit in the code. */
 429
 430 #ifndef EBCDIC
 431
  432 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
  433 UTF-8 mode. The entries for 0-9 are 0x0c (decimal and hexadecimal digit), those
      for A-F and a-f are 0x08 (hexadecimal digit only), and all others are zero. */
  434
  435 static const unsigned char digitab[] =
  436   {
  437   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  438   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  439   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  440   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  441   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  442   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  443   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  444   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  445   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  446   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  447   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  448   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  449   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  450   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  451   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  452   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  453   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  454   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  455   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  456   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  457   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  458   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  459   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  460   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  461   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  462   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  463   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  464   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  465   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  466   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  467   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  468   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 469
 470 #else
 471
  472 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
      The digit entries (0x0c) are at 0xF0-0xF9, and the hexadecimal letter entries
      (0x08) are at 0x81-0x86 and 0xC1-0xC6; all other entries are zero. The last
      column of each row comment is the row's starting code in hexadecimal. */
  473
  474 static const unsigned char digitab[] =
  475   {
  476   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  477   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  478   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  479   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  480   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  481   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  482   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  483   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  484   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  485   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  486   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  487   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  488   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  489   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  490   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  491   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  492   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  493   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  494   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  495   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  496   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  497   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  498   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  499   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  500   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  501   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  502   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  503   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  504   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  505   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  506   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  507   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
 508
  509 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
      /* Indexed by character code. check_escape() masks an entry with 0x0E and
      treats the character as not alphanumeric when the result is zero. The
      meanings of the individual bits are not defined in this file; presumably
      they follow the ctype bits of the main character tables (0x04 and 0x08
      being the digit bits described for digitab) - confirm in pcre_internal.h. */
  510   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  511   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  512   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  513   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  514   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  515   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  516   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  517   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  518   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  519   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  520   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  521   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  522   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  523   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  524   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  525   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  526   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  527   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  528   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  529   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  530   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  531   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  532   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  533   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  534   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  535   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  536   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  537   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  538   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  539   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  540   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  541   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
 542 #endif
 543
 544
  545 /* Forward declaration to allow mutual recursion */
 546
 547 static BOOL
 548   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
 549     int *, int *, branch_chain *, compile_data *, int *);
 550
 551
 552
 553 /*************************************************
 554 *            Find an error text                  *
 555 *************************************************/
 556
 557 /* The error texts are now all in one long string, to save on relocations. As
 558 some of the text is of unknown length, we can't use a table of offsets.
 559 Instead, just count through the strings. This is not a performance issue
 560 because it happens only when there has been a compilation error.
 561
 562 Argument:   the error number
 563 Returns:    pointer to the error string
 564 */
 565
 566 static const char *
 567 find_error_text(int n)
 568 {
 569 const char *s = error_texts;
 570 for (; n > 0; n--)
 571   {
 572   while (*s++ != 0) {};
 573   if (*s == 0) return "Error text not found (please report)";
 574   }
 575 return s;
 576 }
 577
 578
 579 /*************************************************
 580 *            Handle escapes                      *
 581 *************************************************/
 582
 583 /* This function is called when a \ has been encountered. It either returns a
 584 positive value for a simple escape such as \n, or a negative value which
 585 encodes one of the more complicated things such as \d. A backreference to group
 586 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
 587 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
 588 ptr is pointing at the \. On exit, it is on the final character of the escape
 589 sequence.
 590
 591 Arguments:
 592   ptrptr         points to the pattern position pointer
 593   errorcodeptr   points to the errorcode variable
 594   bracount       number of previous extracting brackets
 595   options        the options bits
 596   isclass        TRUE if inside a character class
 597
 598 Returns:         zero or positive => a data character
 599                  negative => a special escape sequence
 600                  on error, errorcodeptr is set
 601 */
 602
 603 static int
 604 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
 605   int options, BOOL isclass)
 606 {
 607 BOOL utf8 = (options & PCRE_UTF8) != 0;
 608 const uschar *ptr = *ptrptr + 1;
 609 int c, i;
 610
 611 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 612 ptr--;                            /* Set pointer back to the last byte */
 613
 614 /* If backslash is at the end of the pattern, it's an error. */
 615
 616 if (c == 0) *errorcodeptr = ERR1;
 617
 618 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
 619 in a table. A non-zero result is something that can be returned immediately.
 620 Otherwise further processing may be required. */
 621
 622 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 623 else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
 624 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
 625
 626 #else           /* EBCDIC coding */
 627 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
 628 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 629 #endif
 630
 631 /* Escapes that need further processing, or are illegal. */
 632
 633 else
 634   {
 635   const uschar *oldptr;
 636   BOOL braced, negated;
 637
 638   switch (c)
 639     {
 640     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 641     error. */
 642
 643     case CHAR_l:
 644     case CHAR_L:
 645     case CHAR_u:
 646     case CHAR_U:
 647     *errorcodeptr = ERR37;
 648     break;
 649
 650     /* \g must be followed by one of a number of specific things:
 651
 652     (1) A number, either plain or braced. If positive, it is an absolute
 653     backreference. If negative, it is a relative backreference. This is a Perl
 654     5.10 feature.
 655
 656     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
 657     is part of Perl's movement towards a unified syntax for back references. As
 658     this is synonymous with \k{name}, we fudge it up by pretending it really
 659     was \k.
 660
 661     (3) For Oniguruma compatibility we also support \g followed by a name or a
 662     number either in angle brackets or in single quotes. However, these are
 663     (possibly recursive) subroutine calls, _not_ backreferences. Just return
 664     the -ESC_g code (cf \k). */
 665
 666     case CHAR_g:
 667     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
 668       {
 669       c = -ESC_g;
 670       break;
 671       }
 672
 673     /* Handle the Perl-compatible cases */
 674
 675     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 676       {
 677       const uschar *p;
 678       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
 679         if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
 680       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
 681         {
 682         c = -ESC_k;
 683         break;
 684         }
 685       braced = TRUE;
 686       ptr++;
 687       }
 688     else braced = FALSE;
 689
 690     if (ptr[1] == CHAR_MINUS)
 691       {
 692       negated = TRUE;
 693       ptr++;
 694       }
 695     else negated = FALSE;
 696
 697     c = 0;
 698     while ((digitab[ptr[1]] & ctype_digit) != 0)
 699       c = c * 10 + *(++ptr) - CHAR_0;
 700
 701     if (c < 0)   /* Integer overflow */
 702       {
 703       *errorcodeptr = ERR61;
 704       break;
 705       }
 706
 707     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
 708       {
 709       *errorcodeptr = ERR57;
 710       break;
 711       }
 712
 713     if (c == 0)
 714       {
 715       *errorcodeptr = ERR58;
 716       break;
 717       }
 718
 719     if (negated)
 720       {
 721       if (c > bracount)
 722         {
 723         *errorcodeptr = ERR15;
 724         break;
 725         }
 726       c = bracount - (c - 1);
 727       }
 728
 729     c = -(ESC_REF + c);
 730     break;
 731
 732     /* The handling of escape sequences consisting of a string of digits
 733     starting with one that is not zero is not straightforward. By experiment,
 734     the way Perl works seems to be as follows:
 735
 736     Outside a character class, the digits are read as a decimal number. If the
 737     number is less than 10, or if there are that many previous extracting
 738     left brackets, then it is a back reference. Otherwise, up to three octal
 739     digits are read to form an escaped byte. Thus \123 is likely to be octal
 740     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 741     value is greater than 377, the least significant 8 bits are taken. Inside a
 742     character class, \ followed by a digit is always an octal number. */
 743
 744     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
 745     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
 746
 747     if (!isclass)
 748       {
 749       oldptr = ptr;
 750       c -= CHAR_0;
 751       while ((digitab[ptr[1]] & ctype_digit) != 0)
 752         c = c * 10 + *(++ptr) - CHAR_0;
 753       if (c < 0)    /* Integer overflow */
 754         {
 755         *errorcodeptr = ERR61;
 756         break;
 757         }
 758       if (c < 10 || c <= bracount)
 759         {
 760         c = -(ESC_REF + c);
 761         break;
 762         }
 763       ptr = oldptr;      /* Put the pointer back and fall through */
 764       }
 765
 766     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 767     generates a binary zero byte and treats the digit as a following literal.
 768     Thus we have to pull back the pointer by one. */
 769
 770     if ((c = *ptr) >= CHAR_8)
 771       {
 772       ptr--;
 773       c = 0;
 774       break;
 775       }
 776
 777     /* \0 always starts an octal number, but we may drop through to here with a
 778     larger first octal digit. The original code used just to take the least
 779     significant 8 bits of octal numbers (I think this is what early Perls used
 780     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
 781     than 3 octal digits. */
 782
 783     case CHAR_0:
 784     c -= CHAR_0;
 785     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
 786         c = c * 8 + *(++ptr) - CHAR_0;
 787     if (!utf8 && c > 255) *errorcodeptr = ERR51;
 788     break;
 789
 790     /* \x is complicated. \x{ddd} is a character number which can be greater
 791     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
 792     treated as a data character. */
 793
 794     case CHAR_x:
 795     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 796       {
 797       const uschar *pt = ptr + 2;
 798       int count = 0;
 799
 800       c = 0;
 801       while ((digitab[*pt] & ctype_xdigit) != 0)
 802         {
 803         register int cc = *pt++;
 804         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
 805         count++;
 806
 807 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 808         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 809         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 810 #else           /* EBCDIC coding */
 811         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 812         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 813 #endif
 814         }
 815
 816       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
 817         {
 818         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
 819         ptr = pt;
 820         break;
 821         }
 822
 823       /* If the sequence of hex digits does not end with '}', then we don't
 824       recognize this construct; fall through to the normal \x handling. */
 825       }
 826
 827     /* Read just a single-byte hex-defined char */
 828
 829     c = 0;
 830     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
 831       {
 832       int cc;                                  /* Some compilers don't like */
 833       cc = *(++ptr);                           /* ++ in initializers */
 834 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 835       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
 836       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 837 #else           /* EBCDIC coding */
 838       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
 839       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 840 #endif
 841       }
 842     break;
 843
 844     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
 845     An error is given if the byte following \c is not an ASCII character. This
 846     coding is ASCII-specific, but then the whole concept of \cx is
 847     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 848
 849     case CHAR_c:
 850     c = *(++ptr);
 851     if (c == 0)
 852       {
 853       *errorcodeptr = ERR2;
 854       break;
 855       }
 856 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
 857     if (c > 127)  /* Excludes all non-ASCII in either mode */
 858       {
 859       *errorcodeptr = ERR68;
 860       break;
 861       }
 862     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
 863     c ^= 0x40;
 864 #else             /* EBCDIC coding */
 865     if (c >= CHAR_a && c <= CHAR_z) c += 64;
 866     c ^= 0xC0;
 867 #endif
 868     break;
 869
 870     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 871     other alphanumeric following \ is an error if PCRE_EXTRA was set;
 872     otherwise, for Perl compatibility, it is a literal. This code looks a bit
 873     odd, but there used to be some cases other than the default, and there may
 874     be again in future, so I haven't "optimized" it. */
 875
 876     default:
 877     if ((options & PCRE_EXTRA) != 0) switch(c)
 878       {
 879       default:
 880       *errorcodeptr = ERR3;
 881       break;
 882       }
 883     break;
 884     }
 885   }
 886
 887 /* Perl supports \N{name} for character names, as well as plain \N for "not
 888 newline". PCRE does not support \N{name}. */
 889
 890 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 891   *errorcodeptr = ERR37;
 892
 893 /* If PCRE_UCP is set, we change the values for \d etc. */
 894
 895 if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
 896   c -= (ESC_DU - ESC_D);
 897
 898 /* Set the pointer to the final character before returning. */
 899
 900 *ptrptr = ptr;
 901 return c;
 902 }
 903
 904
 905
 906 #ifdef SUPPORT_UCP
 907 /*************************************************
 908 *               Handle \P and \p                 *
 909 *************************************************/
 910
 911 /* This function is called after \P or \p has been encountered, provided that
 912 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 913 pointing at the P or p. On exit, it is pointing at the final character of the
 914 escape sequence.
 915
 916 Argument:
 917   ptrptr         points to the pattern position pointer
 918   negptr         points to a boolean that is set TRUE for negation else FALSE
 919   dptr           points to an int that is set to the detailed property value
 920   errorcodeptr   points to the error code variable
 921
 922 Returns:         type value from ucp_type_table, or -1 for an invalid type
 923 */
 924
 925 static int
 926 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
 927 {
 928 int c, i, bot, top;
 929 const uschar *ptr = *ptrptr;
 930 char name[32];
 931
 932 c = *(++ptr);
 933 if (c == 0) goto ERROR_RETURN;
 934
 935 *negptr = FALSE;
 936
 937 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 938 negation. */
 939
 940 if (c == CHAR_LEFT_CURLY_BRACKET)
 941   {
 942   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
 943     {
 944     *negptr = TRUE;
 945     ptr++;
 946     }
 947   for (i = 0; i < (int)sizeof(name) - 1; i++)
 948     {
 949     c = *(++ptr);
 950     if (c == 0) goto ERROR_RETURN;
 951     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
 952     name[i] = c;
 953     }
 954   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
 955   name[i] = 0;
 956   }
 957
 958 /* Otherwise there is just one following character */
 959
 960 else
 961   {
 962   name[0] = c;
 963   name[1] = 0;
 964   }
 965
 966 *ptrptr = ptr;
 967
 968 /* Search for a recognized property name using binary chop */
 969
 970 bot = 0;
 971 top = _pcre_utt_size;
 972
 973 while (bot < top)
 974   {
 975   i = (bot + top) >> 1;
 976   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
 977   if (c == 0)
 978     {
 979     *dptr = _pcre_utt[i].value;
 980     return _pcre_utt[i].type;
 981     }
 982   if (c > 0) bot = i + 1; else top = i;
 983   }
 984
 985 *errorcodeptr = ERR47;
 986 *ptrptr = ptr;
 987 return -1;
 988
 989 ERROR_RETURN:
 990 *errorcodeptr = ERR46;
 991 *ptrptr = ptr;
 992 return -1;
 993 }
 994 #endif
 995
 996
 997
 998
 999 /*************************************************
1000 *            Check for counted repeat            *
1001 *************************************************/
1002
1003 /* This function is called when a '{' is encountered in a place where it might
1004 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1005 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1006 where the ddds are digits.
1007
1008 Arguments:
1009   p         pointer to the first char after '{'
1010
1011 Returns:    TRUE or FALSE
1012 */
1013
1014 static BOOL
1015 is_counted_repeat(const uschar *p)
1016 {
1017 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1018 while ((digitab[*p] & ctype_digit) != 0) p++;
1019 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1020
1021 if (*p++ != CHAR_COMMA) return FALSE;
1022 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1023
1024 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1025 while ((digitab[*p] & ctype_digit) != 0) p++;
1026
1027 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1028 }
1029
1030
1031
1032 /*************************************************
1033 *         Read repeat counts                     *
1034 *************************************************/
1035
1036 /* Read an item of the form {n,m} and return the values. This is called only
1037 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1038 so the syntax is guaranteed to be correct, but we need to check the values.
1039
1040 Arguments:
1041   p              pointer to first char after '{'
1042   minp           pointer to int for min
1043   maxp           pointer to int for max
1044                  returned as -1 if no max
1045   errorcodeptr   points to error code variable
1046
1047 Returns:         pointer to '}' on success;
1048                  current ptr on error, with errorcodeptr set non-zero
1049 */
1050
1051 static const uschar *
1052 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1053 {
1054 int min = 0;
1055 int max = -1;
1056
1057 /* Read the minimum value and do a paranoid check: a negative value indicates
1058 an integer overflow. */
1059
1060 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1061 if (min < 0 || min > 65535)
1062   {
1063   *errorcodeptr = ERR5;
1064   return p;
1065   }
1066
1067 /* Read the maximum value if there is one, and again do a paranoid on its size.
1068 Also, max must not be less than min. */
1069
1070 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1071   {
1072   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1073     {
1074     max = 0;
1075     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1076     if (max < 0 || max > 65535)
1077       {
1078       *errorcodeptr = ERR5;
1079       return p;
1080       }
1081     if (max < min)
1082       {
1083       *errorcodeptr = ERR4;
1084       return p;
1085       }
1086     }
1087   }
1088
1089 /* Fill in the required variables, and pass back the pointer to the terminating
1090 '}'. */
1091
1092 *minp = min;
1093 *maxp = max;
1094 return p;
1095 }
1096
1097
1098
1099 /*************************************************
1100 *  Subroutine for finding forward reference      *
1101 *************************************************/
1102
1103 /* This recursive function is called only from find_parens() below. The
1104 top-level call starts at the beginning of the pattern. All other calls must
1105 start at a parenthesis. It scans along a pattern's text looking for capturing
1106 subpatterns, and counting them. If it finds a named pattern that matches the
1107 name it is given, it returns its number. Alternatively, if the name is NULL, it
1108 returns when it reaches a given numbered subpattern. Recursion is used to keep
1109 track of subpatterns that reset the capturing group numbers - the (?| feature.
1110
1111 This function was originally called only from the second pass, in which we know
1112 that if (?< or (?' or (?P< is encountered, the name will be correctly
1113 terminated because that is checked in the first pass. There is now one call to
1114 this function in the first pass, to check for a recursive back reference by
1115 name (so that we can make the whole group atomic). In this case, we need check
1116 only up to the current position in the pattern, and that is still OK because
1117 and previous occurrences will have been checked. To make this work, the test
1118 for "end of pattern" is a check against cd->end_pattern in the main loop,
1119 instead of looking for a binary zero. This means that the special first-pass
1120 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1121 processing items within the loop are OK, because afterwards the main loop will
1122 terminate.)
1123
1124 Arguments:
1125   ptrptr       address of the current character pointer (updated)
1126   cd           compile background data
1127   name         name to seek, or NULL if seeking a numbered subpattern
1128   lorn         name length, or subpattern number if name is NULL
1129   xmode        TRUE if we are in /x mode
1130   utf8         TRUE if we are in UTF-8 mode
1131   count        pointer to the current capturing subpattern number (updated)
1132
1133 Returns:       the number of the named subpattern, or -1 if not found
1134 */
1135
1136 static int
1137 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1138   BOOL xmode, BOOL utf8, int *count)
1139 {
1140 uschar *ptr = *ptrptr;
1141 int start_count = *count;
1142 int hwm_count = start_count;
1143 BOOL dup_parens = FALSE;
1144
1145 /* If the first character is a parenthesis, check on the type of group we are
1146 dealing with. The very first call may not start with a parenthesis. */
1147
1148 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1149   {
1150   /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1151
1152   if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1153
1154   /* Handle a normal, unnamed capturing parenthesis. */
1155
1156   else if (ptr[1] != CHAR_QUESTION_MARK)
1157     {
1158     *count += 1;
1159     if (name == NULL && *count == lorn) return *count;
1160     ptr++;
1161     }
1162
1163   /* All cases now have (? at the start. Remember when we are in a group
1164   where the parenthesis numbers are duplicated. */
1165
1166   else if (ptr[2] == CHAR_VERTICAL_LINE)
1167     {
1168     ptr += 3;
1169     dup_parens = TRUE;
1170     }
1171
1172   /* Handle comments; all characters are allowed until a ket is reached. */
1173
1174   else if (ptr[2] == CHAR_NUMBER_SIGN)
1175     {
1176     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1177     goto FAIL_EXIT;
1178     }
1179
1180   /* Handle a condition. If it is an assertion, just carry on so that it
1181   is processed as normal. If not, skip to the closing parenthesis of the
1182   condition (there can't be any nested parens). */
1183
1184   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1185     {
1186     ptr += 2;
1187     if (ptr[1] != CHAR_QUESTION_MARK)
1188       {
1189       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1190       if (*ptr != 0) ptr++;
1191       }
1192     }
1193
1194   /* Start with (? but not a condition. */
1195
1196   else
1197     {
1198     ptr += 2;
1199     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1200
1201     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1202
1203     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1204         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1205       {
1206       int term;
1207       const uschar *thisname;
1208       *count += 1;
1209       if (name == NULL && *count == lorn) return *count;
1210       term = *ptr++;
1211       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1212       thisname = ptr;
1213       while (*ptr != term) ptr++;
1214       if (name != NULL && lorn == ptr - thisname &&
1215           strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1216         return *count;
1217       term++;
1218       }
1219     }
1220   }
1221
1222 /* Past any initial parenthesis handling, scan for parentheses or vertical
1223 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1224 first-pass call when this value is temporarily adjusted to stop at the current
1225 position. So DO NOT change this to a test for binary zero. */
1226
1227 for (; ptr < cd->end_pattern; ptr++)
1228   {
1229   /* Skip over backslashed characters and also entire \Q...\E */
1230
1231   if (*ptr == CHAR_BACKSLASH)
1232     {
1233     if (*(++ptr) == 0) goto FAIL_EXIT;
1234     if (*ptr == CHAR_Q) for (;;)
1235       {
1236       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1237       if (*ptr == 0) goto FAIL_EXIT;
1238       if (*(++ptr) == CHAR_E) break;
1239       }
1240     continue;
1241     }
1242
1243   /* Skip over character classes; this logic must be similar to the way they
1244   are handled for real. If the first character is '^', skip it. Also, if the
1245   first few characters (either before or after ^) are \Q\E or \E we skip them
1246   too. This makes for compatibility with Perl. Note the use of STR macros to
1247   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1248
1249   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1250     {
1251     BOOL negate_class = FALSE;
1252     for (;;)
1253       {
1254       if (ptr[1] == CHAR_BACKSLASH)
1255         {
1256         if (ptr[2] == CHAR_E)
1257           ptr+= 2;
1258         else if (strncmp((const char *)ptr+2,
1259                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
1260           ptr += 4;
1261         else
1262           break;
1263         }
1264       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1265         {
1266         negate_class = TRUE;
1267         ptr++;
1268         }
1269       else break;
1270       }
1271
1272     /* If the next character is ']', it is a data character that must be
1273     skipped, except in JavaScript compatibility mode. */
1274
1275     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1276         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1277       ptr++;
1278
1279     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1280       {
1281       if (*ptr == 0) return -1;
1282       if (*ptr == CHAR_BACKSLASH)
1283         {
1284         if (*(++ptr) == 0) goto FAIL_EXIT;
1285         if (*ptr == CHAR_Q) for (;;)
1286           {
1287           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1288           if (*ptr == 0) goto FAIL_EXIT;
1289           if (*(++ptr) == CHAR_E) break;
1290           }
1291         continue;
1292         }
1293       }
1294     continue;
1295     }
1296
1297   /* Skip comments in /x mode */
1298
1299   if (xmode && *ptr == CHAR_NUMBER_SIGN)
1300     {
1301     ptr++;
1302     while (*ptr != 0)
1303       {
1304       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1305       ptr++;
1306 #ifdef SUPPORT_UTF8
1307       if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1308 #endif
1309       }
1310     if (*ptr == 0) goto FAIL_EXIT;
1311     continue;
1312     }
1313
1314   /* Check for the special metacharacters */
1315
1316   if (*ptr == CHAR_LEFT_PARENTHESIS)
1317     {
1318     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1319     if (rc > 0) return rc;
1320     if (*ptr == 0) goto FAIL_EXIT;
1321     }
1322
1323   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1324     {
1325     if (dup_parens && *count < hwm_count) *count = hwm_count;
1326     goto FAIL_EXIT;
1327     }
1328
1329   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1330     {
1331     if (*count > hwm_count) hwm_count = *count;
1332     *count = start_count;
1333     }
1334   }
1335
1336 FAIL_EXIT:
1337 *ptrptr = ptr;
1338 return -1;
1339 }
1340
1341
1342
1343
1344 /*************************************************
1345 *       Find forward referenced subpattern       *
1346 *************************************************/
1347
1348 /* This function scans along a pattern's text looking for capturing
1349 subpatterns, and counting them. If it finds a named pattern that matches the
1350 name it is given, it returns its number. Alternatively, if the name is NULL, it
1351 returns when it reaches a given numbered subpattern. This is used for forward
1352 references to subpatterns. We used to be able to start this scan from the
1353 current compiling point, using the current count value from cd->bracount, and
1354 do it all in a single loop, but the addition of the possibility of duplicate
1355 subpattern numbers means that we have to scan from the very start, in order to
1356 take account of such duplicates, and to use a recursive function to keep track
1357 of the different types of group.
1358
1359 Arguments:
1360   cd           compile background data
1361   name         name to seek, or NULL if seeking a numbered subpattern
1362   lorn         name length, or subpattern number if name is NULL
1363   xmode        TRUE if we are in /x mode
1364   utf8         TRUE if we are in UTF-8 mode
1365
1366 Returns:       the number of the found subpattern, or -1 if not found
1367 */
1368
1369 static int
1370 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1371   BOOL utf8)
1372 {
1373 uschar *ptr = (uschar *)cd->start_pattern;
1374 int count = 0;
1375 int rc;
1376
1377 /* If the pattern does not start with an opening parenthesis, the first call
1378 to find_parens_sub() will scan right to the end (if necessary). However, if it
1379 does start with a parenthesis, find_parens_sub() will return when it hits the
1380 matching closing parens. That is why we have to have a loop. */
1381
1382 for (;;)
1383   {
1384   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1385   if (rc > 0 || *ptr++ == 0) break;
1386   }
1387
1388 return rc;
1389 }
1390
1391
1392
1393
1394 /*************************************************
1395 *      Find first significant op code            *
1396 *************************************************/
1397
1398 /* This is called by several functions that scan a compiled expression looking
1399 for a fixed first character, or an anchoring op code etc. It skips over things
1400 that do not influence this. For some calls, a change of option is important.
1401 For some calls, it makes sense to skip negative forward and all backward
1402 assertions, and also the \b assertion; for others it does not.
1403
1404 Arguments:
1405   code         pointer to the start of the group
1406   options      pointer to external options
1407   optbit       the option bit whose changing is significant, or
1408                  zero if none are
1409   skipassert   TRUE if certain assertions are to be skipped
1410
1411 Returns:       pointer to the first significant opcode
1412 */
1413
1414 static const uschar*
1415 first_significant_code(const uschar *code, int *options, int optbit,
1416   BOOL skipassert)
1417 {
1418 for (;;)
1419   {
1420   switch ((int)*code)
1421     {
1422     case OP_OPT:
1423     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1424       *options = (int)code[1];
1425     code += 2;
1426     break;
1427
1428     case OP_ASSERT_NOT:
1429     case OP_ASSERTBACK:
1430     case OP_ASSERTBACK_NOT:
1431     if (!skipassert) return code;
1432     do code += GET(code, 1); while (*code == OP_ALT);
1433     code += _pcre_OP_lengths[*code];
1434     break;
1435
1436     case OP_WORD_BOUNDARY:
1437     case OP_NOT_WORD_BOUNDARY:
1438     if (!skipassert) return code;
1439     /* Fall through */
1440
1441     case OP_CALLOUT:
1442     case OP_CREF:
1443     case OP_NCREF:
1444     case OP_RREF:
1445     case OP_NRREF:
1446     case OP_DEF:
1447     code += _pcre_OP_lengths[*code];
1448     break;
1449
1450     default:
1451     return code;
1452     }
1453   }
1454 /* Control never reaches here */
1455 }
1456
1457
1458
1459
1460 /*************************************************
1461 *        Find the fixed length of a branch       *
1462 *************************************************/
1463
1464 /* Scan a branch and compute the fixed length of subject that will match it,
1465 if the length is fixed. This is needed for dealing with backward assertions.
1466 In UTF8 mode, the result is in characters rather than bytes. The branch is
1467 temporarily terminated with OP_END when this function is called.
1468
1469 This function is called when a backward assertion is encountered, so that if it
1470 fails, the error message can point to the correct place in the pattern.
1471 However, we cannot do this when the assertion contains subroutine calls,
1472 because they can be forward references. We solve this by remembering this case
1473 and doing the check at the end; a flag specifies which mode we are running in.
1474
1475 Arguments:
1476   code     points to the start of the pattern (the bracket)
1477   options  the compiling options
1478   atend    TRUE if called when the pattern is complete
1479   cd       the "compile data" structure
1480
1481 Returns:   the fixed length,
1482              or -1 if there is no fixed length,
1483              or -2 if \C was encountered
1484              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1485 */
1486
1487 static int
1488 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1489 {
1490 int length = -1;
1491
1492 register int branchlength = 0;
1493 register uschar *cc = code + 1 + LINK_SIZE;
1494
1495 /* Scan along the opcodes for this branch. If we get to the end of the
1496 branch, check the length against that of the other branches. */
1497
1498 for (;;)
1499   {
1500   int d;
1501   uschar *ce, *cs;
1502   register int op = *cc;
1503   switch (op)
1504     {
1505     case OP_CBRA:
1506     case OP_BRA:
1507     case OP_ONCE:
1508     case OP_COND:
1509     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1510     if (d < 0) return d;
1511     branchlength += d;
1512     do cc += GET(cc, 1); while (*cc == OP_ALT);
1513     cc += 1 + LINK_SIZE;
1514     break;
1515
1516     /* Reached end of a branch; if it's a ket it is the end of a nested
1517     call. If it's ALT it is an alternation in a nested call. If it is
1518     END it's the end of the outer call. All can be handled by the same code. */
1519
1520     case OP_ALT:
1521     case OP_KET:
1522     case OP_KETRMAX:
1523     case OP_KETRMIN:
1524     case OP_END:
1525     if (length < 0) length = branchlength;
1526       else if (length != branchlength) return -1;
1527     if (*cc != OP_ALT) return length;
1528     cc += 1 + LINK_SIZE;
1529     branchlength = 0;
1530     break;
1531
1532     /* A true recursion implies not fixed length, but a subroutine call may
1533     be OK. If the subroutine is a forward reference, we can't deal with
1534     it until the end of the pattern, so return -3. */
1535
1536     case OP_RECURSE:
1537     if (!atend) return -3;
1538     cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1539     do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1540     if (cc > cs && cc < ce) return -1;                /* Recursion */
1541     d = find_fixedlength(cs + 2, options, atend, cd);
1542     if (d < 0) return d;
1543     branchlength += d;
1544     cc += 1 + LINK_SIZE;
1545     break;
1546
1547     /* Skip over assertive subpatterns */
1548
1549     case OP_ASSERT:
1550     case OP_ASSERT_NOT:
1551     case OP_ASSERTBACK:
1552     case OP_ASSERTBACK_NOT:
1553     do cc += GET(cc, 1); while (*cc == OP_ALT);
1554     /* Fall through */
1555
1556     /* Skip over things that don't match chars */
1557
1558     case OP_REVERSE:
1559     case OP_CREF:
1560     case OP_NCREF:
1561     case OP_RREF:
1562     case OP_NRREF:
1563     case OP_DEF:
1564     case OP_OPT:
1565     case OP_CALLOUT:
1566     case OP_SOD:
1567     case OP_SOM:
1568     case OP_SET_SOM:
1569     case OP_EOD:
1570     case OP_EODN:
1571     case OP_CIRC:
1572     case OP_DOLL:
1573     case OP_NOT_WORD_BOUNDARY:
1574     case OP_WORD_BOUNDARY:
1575     cc += _pcre_OP_lengths[*cc];
1576     break;
1577
1578     /* Handle literal characters */
1579
1580     case OP_CHAR:
1581     case OP_CHARNC:
1582     case OP_NOT:
1583     branchlength++;
1584     cc += 2;
1585 #ifdef SUPPORT_UTF8
1586     if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1587       cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1588 #endif
1589     break;
1590
1591     /* Handle exact repetitions. The count is already in characters, but we
1592     need to skip over a multibyte character in UTF8 mode.  */
1593
1594     case OP_EXACT:
1595     branchlength += GET2(cc,1);
1596     cc += 4;
1597 #ifdef SUPPORT_UTF8
1598     if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1599       cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1600 #endif
1601     break;
1602
1603     case OP_TYPEEXACT:
1604     branchlength += GET2(cc,1);
1605     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1606     cc += 4;
1607     break;
1608
1609     /* Handle single-char matchers */
1610
1611     case OP_PROP:
1612     case OP_NOTPROP:
1613     cc += 2;
1614     /* Fall through */
1615
1616     case OP_NOT_DIGIT:
1617     case OP_DIGIT:
1618     case OP_NOT_WHITESPACE:
1619     case OP_WHITESPACE:
1620     case OP_NOT_WORDCHAR:
1621     case OP_WORDCHAR:
1622     case OP_ANY:
1623     case OP_ALLANY:
1624     branchlength++;
1625     cc++;
1626     break;
1627
1628     /* The single-byte matcher isn't allowed */
1629
1630     case OP_ANYBYTE:
1631     return -2;
1632
1633     /* Check a class for variable quantification */
1634
1635 #ifdef SUPPORT_UTF8
1636     case OP_XCLASS:
1637     cc += GET(cc, 1) - 33;
1638     /* Fall through */
1639 #endif
1640
1641     case OP_CLASS:
1642     case OP_NCLASS:
1643     cc += 33;
1644
1645     switch (*cc)
1646       {
1647       case OP_CRSTAR:
1648       case OP_CRMINSTAR:
1649       case OP_CRQUERY:
1650       case OP_CRMINQUERY:
1651       return -1;
1652
1653       case OP_CRRANGE:
1654       case OP_CRMINRANGE:
1655       if (GET2(cc,1) != GET2(cc,3)) return -1;
1656       branchlength += GET2(cc,1);
1657       cc += 5;
1658       break;
1659
1660       default:
1661       branchlength++;
1662       }
1663     break;
1664
1665     /* Anything else is variable length */
1666
1667     default:
1668     return -1;
1669     }
1670   }
1671 /* Control never gets here */
1672 }
1673
1674
1675
1676
1677 /*************************************************
1678 *    Scan compiled regex for specific bracket    *
1679 *************************************************/
1680
1681 /* This little function scans through a compiled pattern until it finds a
1682 capturing bracket with the given number, or, if the number is negative, an
1683 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1684 so that it can be called from pcre_study() when finding the minimum matching
1685 length.
1686
1687 Arguments:
1688   code        points to start of expression
1689   utf8        TRUE in UTF-8 mode
1690   number      the required bracket number or negative to find a lookbehind
1691
1692 Returns:      pointer to the opcode for the bracket, or NULL if not found
1693 */
1694
1695 const uschar *
1696 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1697 {
1698 for (;;)
1699   {
1700   register int c = *code;
1701   if (c == OP_END) return NULL;
1702
1703   /* XCLASS is used for classes that cannot be represented just by a bit
1704   map. This includes negated single high-valued characters. The length in
1705   the table is zero; the actual length is stored in the compiled code. */
1706
1707   if (c == OP_XCLASS) code += GET(code, 1);
1708
1709   /* Handle recursion */
1710
1711   else if (c == OP_REVERSE)
1712     {
1713     if (number < 0) return (uschar *)code;
1714     code += _pcre_OP_lengths[c];
1715     }
1716
1717   /* Handle capturing bracket */
1718
1719   else if (c == OP_CBRA)
1720     {
1721     int n = GET2(code, 1+LINK_SIZE);
1722     if (n == number) return (uschar *)code;
1723     code += _pcre_OP_lengths[c];
1724     }
1725
1726   /* Otherwise, we can get the item's length from the table, except that for
1727   repeated character types, we have to test for \p and \P, which have an extra
1728   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1729   must add in its length. */
1730
1731   else
1732     {
1733     switch(c)
1734       {
1735       case OP_TYPESTAR:
1736       case OP_TYPEMINSTAR:
1737       case OP_TYPEPLUS:
1738       case OP_TYPEMINPLUS:
1739       case OP_TYPEQUERY:
1740       case OP_TYPEMINQUERY:
1741       case OP_TYPEPOSSTAR:
1742       case OP_TYPEPOSPLUS:
1743       case OP_TYPEPOSQUERY:
1744       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1745       break;
1746
1747       case OP_TYPEUPTO:
1748       case OP_TYPEMINUPTO:
1749       case OP_TYPEEXACT:
1750       case OP_TYPEPOSUPTO:
1751       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1752       break;
1753
1754       case OP_MARK:
1755       case OP_PRUNE_ARG:
1756       case OP_SKIP_ARG:
1757       code += code[1];
1758       break;
1759
1760       case OP_THEN_ARG:
1761       code += code[1+LINK_SIZE];
1762       break;
1763       }
1764
1765     /* Add in the fixed length from the table */
1766
1767     code += _pcre_OP_lengths[c];
1768
1769   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1770   a multi-byte character. The length in the table is a minimum, so we have to
1771   arrange to skip the extra bytes. */
1772
1773 #ifdef SUPPORT_UTF8
1774     if (utf8) switch(c)
1775       {
1776       case OP_CHAR:
1777       case OP_CHARNC:
1778       case OP_EXACT:
1779       case OP_UPTO:
1780       case OP_MINUPTO:
1781       case OP_POSUPTO:
1782       case OP_STAR:
1783       case OP_MINSTAR:
1784       case OP_POSSTAR:
1785       case OP_PLUS:
1786       case OP_MINPLUS:
1787       case OP_POSPLUS:
1788       case OP_QUERY:
1789       case OP_MINQUERY:
1790       case OP_POSQUERY:
1791       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1792       break;
1793       }
1794 #else
1795     (void)(utf8);  /* Keep compiler happy by referencing function argument */
1796 #endif
1797     }
1798   }
1799 }
1800
1801
1802
1803 /*************************************************
1804 *   Scan compiled regex for recursion reference  *
1805 *************************************************/
1806
1807 /* This little function scans through a compiled pattern until it finds an
1808 instance of OP_RECURSE.
1809
1810 Arguments:
1811   code        points to start of expression
1812   utf8        TRUE in UTF-8 mode
1813
1814 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1815 */
1816
1817 static const uschar *
1818 find_recurse(const uschar *code, BOOL utf8)
1819 {
1820 for (;;)
1821   {
1822   register int c = *code;
1823   if (c == OP_END) return NULL;
1824   if (c == OP_RECURSE) return code;
1825
1826   /* XCLASS is used for classes that cannot be represented just by a bit
1827   map. This includes negated single high-valued characters. The length in
1828   the table is zero; the actual length is stored in the compiled code. */
1829
1830   if (c == OP_XCLASS) code += GET(code, 1);
1831
1832   /* Otherwise, we can get the item's length from the table, except that for
1833   repeated character types, we have to test for \p and \P, which have an extra
1834   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1835   must add in its length. */
1836
1837   else
1838     {
1839     switch(c)
1840       {
1841       case OP_TYPESTAR:
1842       case OP_TYPEMINSTAR:
1843       case OP_TYPEPLUS:
1844       case OP_TYPEMINPLUS:
1845       case OP_TYPEQUERY:
1846       case OP_TYPEMINQUERY:
1847       case OP_TYPEPOSSTAR:
1848       case OP_TYPEPOSPLUS:
1849       case OP_TYPEPOSQUERY:
1850       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1851       break;
1852
1853       case OP_TYPEPOSUPTO:
1854       case OP_TYPEUPTO:
1855       case OP_TYPEMINUPTO:
1856       case OP_TYPEEXACT:
1857       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1858       break;
1859
1860       case OP_MARK:
1861       case OP_PRUNE_ARG:
1862       case OP_SKIP_ARG:
1863       code += code[1];
1864       break;
1865
1866       case OP_THEN_ARG:
1867       code += code[1+LINK_SIZE];
1868       break;
1869       }
1870
1871     /* Add in the fixed length from the table */
1872
1873     code += _pcre_OP_lengths[c];
1874
1875     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1876     by a multi-byte character. The length in the table is a minimum, so we have
1877     to arrange to skip the extra bytes. */
1878
1879 #ifdef SUPPORT_UTF8
1880     if (utf8) switch(c)
1881       {
1882       case OP_CHAR:
1883       case OP_CHARNC:
1884       case OP_EXACT:
1885       case OP_UPTO:
1886       case OP_MINUPTO:
1887       case OP_POSUPTO:
1888       case OP_STAR:
1889       case OP_MINSTAR:
1890       case OP_POSSTAR:
1891       case OP_PLUS:
1892       case OP_MINPLUS:
1893       case OP_POSPLUS:
1894       case OP_QUERY:
1895       case OP_MINQUERY:
1896       case OP_POSQUERY:
1897       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1898       break;
1899       }
1900 #else
1901     (void)(utf8);  /* Keep compiler happy by referencing function argument */
1902 #endif
1903     }
1904   }
1905 }
1906
1907
1908
1909 /*************************************************
1910 *    Scan compiled branch for non-emptiness      *
1911 *************************************************/
1912
1913 /* This function scans through a branch of a compiled pattern to see whether it
1914 can match the empty string or not. It is called from could_be_empty()
1915 below and from compile_branch() when checking for an unlimited repeat of a
1916 group that can match nothing. Note that first_significant_code() skips over
1917 backward and negative forward assertions when its final argument is TRUE. If we
1918 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1919 bracket whose current branch will already have been scanned.
1920
1921 Arguments:
1922   code        points to start of search
1923   endcode     points to where to stop
1924   utf8        TRUE if in UTF8 mode
1925   cd          contains pointers to tables etc.
1926
1927 Returns:      TRUE if what is matched could be empty
1928 */
1929
1930 static BOOL
1931 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1932   compile_data *cd)
1933 {
1934 register int c;
1935 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1936      code < endcode;
1937      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1938   {
1939   const uschar *ccode;
1940
1941   c = *code;
1942
1943   /* Skip over forward assertions; the other assertions are skipped by
1944   first_significant_code() with a TRUE final argument. */
1945
1946   if (c == OP_ASSERT)
1947     {
1948     do code += GET(code, 1); while (*code == OP_ALT);
1949     c = *code;
1950     continue;
1951     }
1952
1953   /* Groups with zero repeats can of course be empty; skip them. */
1954
1955   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1956     {
1957     code += _pcre_OP_lengths[c];
1958     do code += GET(code, 1); while (*code == OP_ALT);
1959     c = *code;
1960     continue;
1961     }
1962
1963   /* For a recursion/subroutine call, if its end has been reached, which
1964   implies a subroutine call, we can scan it. */
1965
1966   if (c == OP_RECURSE)
1967     {
1968     BOOL empty_branch = FALSE;
1969     const uschar *scode = cd->start_code + GET(code, 1);
1970     if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
1971     do
1972       {
1973       if (could_be_empty_branch(scode, endcode, utf8, cd))
1974         {
1975         empty_branch = TRUE;
1976         break;
1977         }
1978       scode += GET(scode, 1);
1979       }
1980     while (*scode == OP_ALT);
1981     if (!empty_branch) return FALSE;  /* All branches are non-empty */
1982     continue;
1983     }
1984
1985   /* For other groups, scan the branches. */
1986
1987   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1988     {
1989     BOOL empty_branch;
1990     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1991
1992     /* If a conditional group has only one branch, there is a second, implied,
1993     empty branch, so just skip over the conditional, because it could be empty.
1994     Otherwise, scan the individual branches of the group. */
1995
1996     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1997       code += GET(code, 1);
1998     else
1999       {
2000       empty_branch = FALSE;
2001       do
2002         {
2003         if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2004           empty_branch = TRUE;
2005         code += GET(code, 1);
2006         }
2007       while (*code == OP_ALT);
2008       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2009       }
2010
2011     c = *code;
2012     continue;
2013     }
2014
2015   /* Handle the other opcodes */
2016
2017   switch (c)
2018     {
2019     /* Check for quantifiers after a class. XCLASS is used for classes that
2020     cannot be represented just by a bit map. This includes negated single
2021     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2022     actual length is stored in the compiled code, so we must update "code"
2023     here. */
2024
2025 #ifdef SUPPORT_UTF8
2026     case OP_XCLASS:
2027     ccode = code += GET(code, 1);
2028     goto CHECK_CLASS_REPEAT;
2029 #endif
2030
2031     case OP_CLASS:
2032     case OP_NCLASS:
2033     ccode = code + 33;
2034
2035 #ifdef SUPPORT_UTF8
2036     CHECK_CLASS_REPEAT:
2037 #endif
2038
2039     switch (*ccode)
2040       {
2041       case OP_CRSTAR:            /* These could be empty; continue */
2042       case OP_CRMINSTAR:
2043       case OP_CRQUERY:
2044       case OP_CRMINQUERY:
2045       break;
2046
2047       default:                   /* Non-repeat => class must match */
2048       case OP_CRPLUS:            /* These repeats aren't empty */
2049       case OP_CRMINPLUS:
2050       return FALSE;
2051
2052       case OP_CRRANGE:
2053       case OP_CRMINRANGE:
2054       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2055       break;
2056       }
2057     break;
2058
2059     /* Opcodes that must match a character */
2060
2061     case OP_PROP:
2062     case OP_NOTPROP:
2063     case OP_EXTUNI:
2064     case OP_NOT_DIGIT:
2065     case OP_DIGIT:
2066     case OP_NOT_WHITESPACE:
2067     case OP_WHITESPACE:
2068     case OP_NOT_WORDCHAR:
2069     case OP_WORDCHAR:
2070     case OP_ANY:
2071     case OP_ALLANY:
2072     case OP_ANYBYTE:
2073     case OP_CHAR:
2074     case OP_CHARNC:
2075     case OP_NOT:
2076     case OP_PLUS:
2077     case OP_MINPLUS:
2078     case OP_POSPLUS:
2079     case OP_EXACT:
2080     case OP_NOTPLUS:
2081     case OP_NOTMINPLUS:
2082     case OP_NOTPOSPLUS:
2083     case OP_NOTEXACT:
2084     case OP_TYPEPLUS:
2085     case OP_TYPEMINPLUS:
2086     case OP_TYPEPOSPLUS:
2087     case OP_TYPEEXACT:
2088     return FALSE;
2089
2090     /* These are going to continue, as they may be empty, but we have to
2091     fudge the length for the \p and \P cases. */
2092
2093     case OP_TYPESTAR:
2094     case OP_TYPEMINSTAR:
2095     case OP_TYPEPOSSTAR:
2096     case OP_TYPEQUERY:
2097     case OP_TYPEMINQUERY:
2098     case OP_TYPEPOSQUERY:
2099     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2100     break;
2101
2102     /* Same for these */
2103
2104     case OP_TYPEUPTO:
2105     case OP_TYPEMINUPTO:
2106     case OP_TYPEPOSUPTO:
2107     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2108     break;
2109
2110     /* End of branch */
2111
2112     case OP_KET:
2113     case OP_KETRMAX:
2114     case OP_KETRMIN:
2115     case OP_ALT:
2116     return TRUE;
2117
2118     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2119     MINUPTO, and POSUPTO may be followed by a multibyte character */
2120
2121 #ifdef SUPPORT_UTF8
2122     case OP_STAR:
2123     case OP_MINSTAR:
2124     case OP_POSSTAR:
2125     case OP_QUERY:
2126     case OP_MINQUERY:
2127     case OP_POSQUERY:
2128     if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2129     break;
2130
2131     case OP_UPTO:
2132     case OP_MINUPTO:
2133     case OP_POSUPTO:
2134     if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2135     break;
2136 #endif
2137
2138     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2139     string. */
2140
2141     case OP_MARK:
2142     case OP_PRUNE_ARG:
2143     case OP_SKIP_ARG:
2144     code += code[1];
2145     break;
2146
2147     case OP_THEN_ARG:
2148     code += code[1+LINK_SIZE];
2149     break;
2150
2151     /* None of the remaining opcodes are required to match a character. */
2152
2153     default:
2154     break;
2155     }
2156   }
2157
2158 return TRUE;
2159 }
2160
2161
2162
2163 /*************************************************
2164 *    Scan compiled regex for non-emptiness       *
2165 *************************************************/
2166
2167 /* This function is called to check for left recursive calls. We want to check
2168 the current branch of the current pattern to see if it could match the empty
2169 string. If it could, we must look outwards for branches at other levels,
2170 stopping when we pass beyond the bracket which is the subject of the recursion.
2171
2172 Arguments:
2173   code        points to start of the recursion
2174   endcode     points to where to stop (current RECURSE item)
2175   bcptr       points to the chain of current (unclosed) branch starts
2176   utf8        TRUE if in UTF-8 mode
2177   cd          pointers to tables etc
2178
2179 Returns:      TRUE if what is matched could be empty
2180 */
2181
2182 static BOOL
2183 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2184   BOOL utf8, compile_data *cd)
2185 {
2186 while (bcptr != NULL && bcptr->current_branch >= code)
2187   {
2188   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2189     return FALSE;
2190   bcptr = bcptr->outer;
2191   }
2192 return TRUE;
2193 }
2194
2195
2196
2197 /*************************************************
2198 *           Check for POSIX class syntax         *
2199 *************************************************/
2200
2201 /* This function is called when the sequence "[:" or "[." or "[=" is
2202 encountered in a character class. It checks whether this is followed by a
2203 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2204 reach an unescaped ']' without the special preceding character, return FALSE.
2205
2206 Originally, this function only recognized a sequence of letters between the
2207 terminators, but it seems that Perl recognizes any sequence of characters,
2208 though of course unknown POSIX names are subsequently rejected. Perl gives an
2209 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2210 didn't consider this to be a POSIX class. Likewise for [:1234:].
2211
2212 The problem in trying to be exactly like Perl is in the handling of escapes. We
2213 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2214 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2215 below handles the special case of \], but does not try to do any other escape
2216 processing. This makes it different from Perl for cases such as [:l\ower:]
2217 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2218 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2219 I think.
2220
2221 Arguments:
2222   ptr      pointer to the initial [
2223   endptr   where to return the end pointer
2224
2225 Returns:   TRUE or FALSE
2226 */
2227
2228 static BOOL
2229 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2230 {
2231 int terminator;          /* Don't combine these lines; the Solaris cc */
2232 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2233 for (++ptr; *ptr != 0; ptr++)
2234   {
2235   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2236     {
2237     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2238     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2239       {
2240       *endptr = ptr;
2241       return TRUE;
2242       }
2243     }
2244   }
2245 return FALSE;
2246 }
2247
2248
2249
2250
2251 /*************************************************
2252 *          Check POSIX class name                *
2253 *************************************************/
2254
2255 /* This function is called to check the name given in a POSIX-style class entry
2256 such as [:alnum:].
2257
2258 Arguments:
2259   ptr        points to the first letter
2260   len        the length of the name
2261
2262 Returns:     a value representing the name, or -1 if unknown
2263 */
2264
2265 static int
2266 check_posix_name(const uschar *ptr, int len)
2267 {
2268 const char *pn = posix_names;
2269 register int yield = 0;
2270 while (posix_name_lengths[yield] != 0)
2271   {
2272   if (len == posix_name_lengths[yield] &&
2273     strncmp((const char *)ptr, pn, len) == 0) return yield;
2274   pn += posix_name_lengths[yield] + 1;
2275   yield++;
2276   }
2277 return -1;
2278 }
2279
2280
2281 /*************************************************
2282 *    Adjust OP_RECURSE items in repeated group   *
2283 *************************************************/
2284
2285 /* OP_RECURSE items contain an offset from the start of the regex to the group
2286 that is referenced. This means that groups can be replicated for fixed
2287 repetition simply by copying (because the recursion is allowed to refer to
2288 earlier groups that are outside the current group). However, when a group is
2289 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2290 inserted before it, after it has been compiled. This means that any OP_RECURSE
2291 items within it that refer to the group itself or any contained groups have to
2292 have their offsets adjusted. That one of the jobs of this function. Before it
2293 is called, the partially compiled regex must be temporarily terminated with
2294 OP_END.
2295
2296 This function has been extended with the possibility of forward references for
2297 recursions and subroutine calls. It must also check the list of such references
2298 for the group we are dealing with. If it finds that one of the recursions in
2299 the current group is on this list, it adjusts the offset in the list, not the
2300 value in the reference (which is a group number).
2301
2302 Arguments:
2303   group      points to the start of the group
2304   adjust     the amount by which the group is to be moved
2305   utf8       TRUE in UTF-8 mode
2306   cd         contains pointers to tables etc.
2307   save_hwm   the hwm forward reference pointer at the start of the group
2308
2309 Returns:     nothing
2310 */
2311
2312 static void
2313 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2314   uschar *save_hwm)
2315 {
2316 uschar *ptr = group;
2317
2318 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2319   {
2320   int offset;
2321   uschar *hc;
2322
2323   /* See if this recursion is on the forward reference list. If so, adjust the
2324   reference. */
2325
2326   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2327     {
2328     offset = GET(hc, 0);
2329     if (cd->start_code + offset == ptr + 1)
2330       {
2331       PUT(hc, 0, offset + adjust);
2332       break;
2333       }
2334     }
2335
2336   /* Otherwise, adjust the recursion offset if it's after the start of this
2337   group. */
2338
2339   if (hc >= cd->hwm)
2340     {
2341     offset = GET(ptr, 1);
2342     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2343     }
2344
2345   ptr += 1 + LINK_SIZE;
2346   }
2347 }
2348
2349
2350
2351 /*************************************************
2352 *        Insert an automatic callout point       *
2353 *************************************************/
2354
2355 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2356 callout points before each pattern item.
2357
2358 Arguments:
2359   code           current code pointer
2360   ptr            current pattern pointer
2361   cd             pointers to tables etc
2362
2363 Returns:         new code pointer
2364 */
2365
2366 static uschar *
2367 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2368 {
2369 *code++ = OP_CALLOUT;
2370 *code++ = 255;
2371 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2372 PUT(code, LINK_SIZE, 0);                       /* Default length */
2373 return code + 2*LINK_SIZE;
2374 }
2375
2376
2377
2378 /*************************************************
2379 *         Complete a callout item                *
2380 *************************************************/
2381
2382 /* A callout item contains the length of the next item in the pattern, which
2383 we can't fill in till after we have reached the relevant point. This is used
2384 for both automatic and manual callouts.
2385
2386 Arguments:
2387   previous_callout   points to previous callout item
2388   ptr                current pattern pointer
2389   cd                 pointers to tables etc
2390
2391 Returns:             nothing
2392 */
2393
2394 static void
2395 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2396 {
2397 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2398 PUT(previous_callout, 2 + LINK_SIZE, length);
2399 }
2400
2401
2402
2403 #ifdef SUPPORT_UCP
2404 /*************************************************
2405 *           Get othercase range                  *
2406 *************************************************/
2407
2408 /* This function is passed the start and end of a class range, in UTF-8 mode
2409 with UCP support. It searches up the characters, looking for internal ranges of
2410 characters in the "other" case. Each call returns the next one, updating the
2411 start address.
2412
2413 Arguments:
2414   cptr        points to starting character value; updated
2415   d           end value
2416   ocptr       where to put start of othercase range
2417   odptr       where to put end of othercase range
2418
2419 Yield:        TRUE when range returned; FALSE when no more
2420 */
2421
2422 static BOOL
2423 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2424   unsigned int *odptr)
2425 {
2426 unsigned int c, othercase, next;
2427
2428 for (c = *cptr; c <= d; c++)
2429   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2430
2431 if (c > d) return FALSE;
2432
2433 *ocptr = othercase;
2434 next = othercase + 1;
2435
2436 for (++c; c <= d; c++)
2437   {
2438   if (UCD_OTHERCASE(c) != next) break;
2439   next++;
2440   }
2441
2442 *odptr = next - 1;
2443 *cptr = c;
2444
2445 return TRUE;
2446 }
2447
2448
2449
2450 /*************************************************
2451 *        Check a character and a property        *
2452 *************************************************/
2453
2454 /* This function is called by check_auto_possessive() when a property item
2455 is adjacent to a fixed character.
2456
2457 Arguments:
2458   c            the character
2459   ptype        the property type
2460   pdata        the data for the type
2461   negated      TRUE if it's a negated property (\P or \p{^)
2462
2463 Returns:       TRUE if auto-possessifying is OK
2464 */
2465
2466 static BOOL
2467 check_char_prop(int c, int ptype, int pdata, BOOL negated)
2468 {
2469 const ucd_record *prop = GET_UCD(c);
2470 switch(ptype)
2471   {
2472   case PT_LAMP:
2473   return (prop->chartype == ucp_Lu ||
2474           prop->chartype == ucp_Ll ||
2475           prop->chartype == ucp_Lt) == negated;
2476
2477   case PT_GC:
2478   return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2479
2480   case PT_PC:
2481   return (pdata == prop->chartype) == negated;
2482
2483   case PT_SC:
2484   return (pdata == prop->script) == negated;
2485
2486   /* These are specials */
2487
2488   case PT_ALNUM:
2489   return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2490           _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2491
2492   case PT_SPACE:    /* Perl space */
2493   return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2494           c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2495           == negated;
2496
2497   case PT_PXSPACE:  /* POSIX space */
2498   return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2499           c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2500           c == CHAR_FF || c == CHAR_CR)
2501           == negated;
2502
2503   case PT_WORD:
2504   return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2505           _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2506           c == CHAR_UNDERSCORE) == negated;
2507   }
2508 return FALSE;
2509 }
2510 #endif  /* SUPPORT_UCP */
2511
2512
2513
2514 /*************************************************
2515 *     Check if auto-possessifying is possible    *
2516 *************************************************/
2517
2518 /* This function is called for unlimited repeats of certain items, to see
2519 whether the next thing could possibly match the repeated item. If not, it makes
2520 sense to automatically possessify the repeated item.
2521
2522 Arguments:
2523   previous      pointer to the repeated opcode
2524   utf8          TRUE in UTF-8 mode
2525   ptr           next character in pattern
2526   options       options bits
2527   cd            contains pointers to tables etc.
2528
2529 Returns:        TRUE if possessifying is wanted
2530 */
2531
2532 static BOOL
2533 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2534   int options, compile_data *cd)
2535 {
2536 int c, next;
2537 int op_code = *previous++;
2538
2539 /* Skip whitespace and comments in extended mode */
2540
2541 if ((options & PCRE_EXTENDED) != 0)
2542   {
2543   for (;;)
2544     {
2545     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2546     if (*ptr == CHAR_NUMBER_SIGN)
2547       {
2548       ptr++;
2549       while (*ptr != 0)
2550         {
2551         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2552         ptr++;
2553 #ifdef SUPPORT_UTF8
2554         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2555 #endif
2556         }
2557       }
2558     else break;
2559     }
2560   }
2561
2562 /* If the next item is one that we can handle, get its value. A non-negative
2563 value is a character, a negative value is an escape value. */
2564
2565 if (*ptr == CHAR_BACKSLASH)
2566   {
2567   int temperrorcode = 0;
2568   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2569   if (temperrorcode != 0) return FALSE;
2570   ptr++;    /* Point after the escape sequence */
2571   }
2572
2573 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2574   {
2575 #ifdef SUPPORT_UTF8
2576   if (utf8) { GETCHARINC(next, ptr); } else
2577 #endif
2578   next = *ptr++;
2579   }
2580
2581 else return FALSE;
2582
2583 /* Skip whitespace and comments in extended mode */
2584
2585 if ((options & PCRE_EXTENDED) != 0)
2586   {
2587   for (;;)
2588     {
2589     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2590     if (*ptr == CHAR_NUMBER_SIGN)
2591       {
2592       ptr++;
2593       while (*ptr != 0)
2594         {
2595         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2596         ptr++;
2597 #ifdef SUPPORT_UTF8
2598         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2599 #endif
2600         }
2601       }
2602     else break;
2603     }
2604   }
2605
2606 /* If the next thing is itself optional, we have to give up. */
2607
2608 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2609   strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2610     return FALSE;
2611
2612 /* Now compare the next item with the previous opcode. First, handle cases when
2613 the next item is a character. */
2614
2615 if (next >= 0) switch(op_code)
2616   {
2617   case OP_CHAR:
2618 #ifdef SUPPORT_UTF8
2619   GETCHARTEST(c, previous);
2620 #else
2621   c = *previous;
2622 #endif
2623   return c != next;
2624
2625   /* For CHARNC (caseless character) we must check the other case. If we have
2626   Unicode property support, we can use it to test the other case of
2627   high-valued characters. */
2628
2629   case OP_CHARNC:
2630 #ifdef SUPPORT_UTF8
2631   GETCHARTEST(c, previous);
2632 #else
2633   c = *previous;
2634 #endif
2635   if (c == next) return FALSE;
2636 #ifdef SUPPORT_UTF8
2637   if (utf8)
2638     {
2639     unsigned int othercase;
2640     if (next < 128) othercase = cd->fcc[next]; else
2641 #ifdef SUPPORT_UCP
2642     othercase = UCD_OTHERCASE((unsigned int)next);
2643 #else
2644     othercase = NOTACHAR;
2645 #endif
2646     return (unsigned int)c != othercase;
2647     }
2648   else
2649 #endif  /* SUPPORT_UTF8 */
2650   return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
2651
2652   /* For OP_NOT, its data is always a single-byte character. */
2653
2654   case OP_NOT:
2655   if ((c = *previous) == next) return TRUE;
2656   if ((options & PCRE_CASELESS) == 0) return FALSE;
2657 #ifdef SUPPORT_UTF8
2658   if (utf8)
2659     {
2660     unsigned int othercase;
2661     if (next < 128) othercase = cd->fcc[next]; else
2662 #ifdef SUPPORT_UCP
2663     othercase = UCD_OTHERCASE(next);
2664 #else
2665     othercase = NOTACHAR;
2666 #endif
2667     return (unsigned int)c == othercase;
2668     }
2669   else
2670 #endif  /* SUPPORT_UTF8 */
2671   return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
2672
2673   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2674   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2675
2676   case OP_DIGIT:
2677   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2678
2679   case OP_NOT_DIGIT:
2680   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2681
2682   case OP_WHITESPACE:
2683   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2684
2685   case OP_NOT_WHITESPACE:
2686   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2687
2688   case OP_WORDCHAR:
2689   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2690
2691   case OP_NOT_WORDCHAR:
2692   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2693
2694   case OP_HSPACE:
2695   case OP_NOT_HSPACE:
2696   switch(next)
2697     {
2698     case 0x09:
2699     case 0x20:
2700     case 0xa0:
2701     case 0x1680:
2702     case 0x180e:
2703     case 0x2000:
2704     case 0x2001:
2705     case 0x2002:
2706     case 0x2003:
2707     case 0x2004:
2708     case 0x2005:
2709     case 0x2006:
2710     case 0x2007:
2711     case 0x2008:
2712     case 0x2009:
2713     case 0x200A:
2714     case 0x202f:
2715     case 0x205f:
2716     case 0x3000:
2717     return op_code == OP_NOT_HSPACE;
2718     default:
2719     return op_code != OP_NOT_HSPACE;
2720     }
2721
2722   case OP_ANYNL:
2723   case OP_VSPACE:
2724   case OP_NOT_VSPACE:
2725   switch(next)
2726     {
2727     case 0x0a:
2728     case 0x0b:
2729     case 0x0c:
2730     case 0x0d:
2731     case 0x85:
2732     case 0x2028:
2733     case 0x2029:
2734     return op_code == OP_NOT_VSPACE;
2735     default:
2736     return op_code != OP_NOT_VSPACE;
2737     }
2738
2739 #ifdef SUPPORT_UCP
2740   case OP_PROP:
2741   return check_char_prop(next, previous[0], previous[1], FALSE);
2742
2743   case OP_NOTPROP:
2744   return check_char_prop(next, previous[0], previous[1], TRUE);
2745 #endif
2746
2747   default:
2748   return FALSE;
2749   }
2750
2751
2752 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2753 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2754 generated only when PCRE_UCP is *not* set, that is, when only ASCII
2755 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2756 replaced by OP_PROP codes when PCRE_UCP is set. */
2757
2758 switch(op_code)
2759   {
2760   case OP_CHAR:
2761   case OP_CHARNC:
2762 #ifdef SUPPORT_UTF8
2763   GETCHARTEST(c, previous);
2764 #else
2765   c = *previous;
2766 #endif
2767   switch(-next)
2768     {
2769     case ESC_d:
2770     return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2771
2772     case ESC_D:
2773     return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2774
2775     case ESC_s:
2776     return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2777
2778     case ESC_S:
2779     return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2780
2781     case ESC_w:
2782     return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2783
2784     case ESC_W:
2785     return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2786
2787     case ESC_h:
2788     case ESC_H:
2789     switch(c)
2790       {
2791       case 0x09:
2792       case 0x20:
2793       case 0xa0:
2794       case 0x1680:
2795       case 0x180e:
2796       case 0x2000:
2797       case 0x2001:
2798       case 0x2002:
2799       case 0x2003:
2800       case 0x2004:
2801       case 0x2005:
2802       case 0x2006:
2803       case 0x2007:
2804       case 0x2008:
2805       case 0x2009:
2806       case 0x200A:
2807       case 0x202f:
2808       case 0x205f:
2809       case 0x3000:
2810       return -next != ESC_h;
2811       default:
2812       return -next == ESC_h;
2813       }
2814
2815     case ESC_v:
2816     case ESC_V:
2817     switch(c)
2818       {
2819       case 0x0a:
2820       case 0x0b:
2821       case 0x0c:
2822       case 0x0d:
2823       case 0x85:
2824       case 0x2028:
2825       case 0x2029:
2826       return -next != ESC_v;
2827       default:
2828       return -next == ESC_v;
2829       }
2830
2831     /* When PCRE_UCP is set, these values get generated for \d etc. Find
2832     their substitutions and process them. The result will always be either
2833     -ESC_p or -ESC_P. Then fall through to process those values. */
2834
2835 #ifdef SUPPORT_UCP
2836     case ESC_du:
2837     case ESC_DU:
2838     case ESC_wu:
2839     case ESC_WU:
2840     case ESC_su:
2841     case ESC_SU:
2842       {
2843       int temperrorcode = 0;
2844       ptr = substitutes[-next - ESC_DU];
2845       next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2846       if (temperrorcode != 0) return FALSE;
2847       ptr++;    /* For compatibility */
2848       }
2849     /* Fall through */
2850
2851     case ESC_p:
2852     case ESC_P:
2853       {
2854       int ptype, pdata, errorcodeptr;
2855       BOOL negated;
2856
2857       ptr--;      /* Make ptr point at the p or P */
2858       ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2859       if (ptype < 0) return FALSE;
2860       ptr++;      /* Point past the final curly ket */
2861
2862       /* If the property item is optional, we have to give up. (When generated
2863       from \d etc by PCRE_UCP, this test will have been applied much earlier,
2864       to the original \d etc. At this point, ptr will point to a zero byte.) */
2865
2866       if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2867         strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2868           return FALSE;
2869
2870       /* Do the property check. */
2871
2872       return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2873       }
2874 #endif
2875
2876     default:
2877     return FALSE;
2878     }
2879
2880   /* In principle, support for Unicode properties should be integrated here as
2881   well. It means re-organizing the above code so as to get hold of the property
2882   values before switching on the op-code. However, I wonder how many patterns
2883   combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2884   these op-codes are never generated.) */
2885
2886   case OP_DIGIT:
2887   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2888          next == -ESC_h || next == -ESC_v || next == -ESC_R;
2889
2890   case OP_NOT_DIGIT:
2891   return next == -ESC_d;
2892
2893   case OP_WHITESPACE:
2894   return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2895
2896   case OP_NOT_WHITESPACE:
2897   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2898
2899   case OP_HSPACE:
2900   return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2901          next == -ESC_w || next == -ESC_v || next == -ESC_R;
2902
2903   case OP_NOT_HSPACE:
2904   return next == -ESC_h;
2905
2906   /* Can't have \S in here because VT matches \S (Perl anomaly) */
2907   case OP_ANYNL:
2908   case OP_VSPACE:
2909   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2910
2911   case OP_NOT_VSPACE:
2912   return next == -ESC_v || next == -ESC_R;
2913
2914   case OP_WORDCHAR:
2915   return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2916          next == -ESC_v || next == -ESC_R;
2917
2918   case OP_NOT_WORDCHAR:
2919   return next == -ESC_w || next == -ESC_d;
2920
2921   default:
2922   return FALSE;
2923   }
2924
2925 /* Control does not reach here */
2926 }
2927
2928
2929
2930 /*************************************************
2931 *           Compile one branch                   *
2932 *************************************************/
2933
2934 /* Scan the pattern, compiling it into a vector. If the options are
2935 changed during the branch, the pointer is used to change the external options
2936 bits. This function is used during the pre-compile phase when we are trying
2937 to find out the amount of memory needed, as well as during the real compile
2938 phase. The value of lengthptr distinguishes the two phases.
2939
2940 Arguments:
2941   optionsptr     pointer to the option bits
2942   codeptr        points to the pointer to the current code point
2943   ptrptr         points to the current pattern pointer
2944   errorcodeptr   points to error code variable
2945   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2946   reqbyteptr     set to the last literal character required, else < 0
2947   bcptr          points to current branch chain
2948   cd             contains pointers to tables etc.
2949   lengthptr      NULL during the real compile phase
2950                  points to length accumulator during pre-compile phase
2951
2952 Returns:         TRUE on success
2953                  FALSE, with *errorcodeptr set non-zero on error
2954 */
2955
2956 static BOOL
2957 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2958   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2959   compile_data *cd, int *lengthptr)
2960 {
2961 int repeat_type, op_type;
2962 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2963 int bravalue = 0;
2964 int greedy_default, greedy_non_default;
2965 int firstbyte, reqbyte;
2966 int zeroreqbyte, zerofirstbyte;
2967 int req_caseopt, reqvary, tempreqvary;
2968 int options = *optionsptr;
2969 int after_manual_callout = 0;
2970 int length_prevgroup = 0;
2971 register int c;
2972 register uschar *code = *codeptr;
2973 uschar *last_code = code;
2974 uschar *orig_code = code;
2975 uschar *tempcode;
2976 BOOL inescq = FALSE;
2977 BOOL groupsetfirstbyte = FALSE;
2978 const uschar *ptr = *ptrptr;
2979 const uschar *tempptr;
2980 const uschar *nestptr = NULL;
2981 uschar *previous = NULL;
2982 uschar *previous_callout = NULL;
2983 uschar *save_hwm = NULL;
2984 uschar classbits[32];
2985
2986 #ifdef SUPPORT_UTF8
2987 BOOL class_utf8;
2988 BOOL utf8 = (options & PCRE_UTF8) != 0;
2989 uschar *class_utf8data;
2990 uschar *class_utf8data_base;
2991 uschar utf8_char[6];
2992 #else
2993 BOOL utf8 = FALSE;
2994 uschar *utf8_char = NULL;
2995 #endif
2996
2997 #ifdef PCRE_DEBUG
2998 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2999 #endif
3000
3001 /* Set up the default and non-default settings for greediness */
3002
3003 greedy_default = ((options & PCRE_UNGREEDY) != 0);
3004 greedy_non_default = greedy_default ^ 1;
3005
3006 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3007 matching encountered yet". It gets changed to REQ_NONE if we hit something that
3008 matches a non-fixed first char; reqbyte just remains unset if we never
3009 find one.
3010
3011 When we hit a repeat whose minimum is zero, we may have to adjust these values
3012 to take the zero repeat into account. This is implemented by setting them to
3013 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3014 item types that can be repeated set these backoff variables appropriately. */
3015
3016 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3017
3018 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3019 according to the current setting of the caseless flag. REQ_CASELESS is a bit
3020 value > 255. It is added into the firstbyte or reqbyte variables to record the
3021 case status of the value. This is used only for ASCII characters. */
3022
3023 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3024
3025 /* Switch on next character until the end of the branch */
3026
3027 for (;; ptr++)
3028   {
3029   BOOL negate_class;
3030   BOOL should_flip_negation;
3031   BOOL possessive_quantifier;
3032   BOOL is_quantifier;
3033   BOOL is_recurse;
3034   BOOL reset_bracount;
3035   int class_charcount;
3036   int class_lastchar;
3037   int newoptions;
3038   int recno;
3039   int refsign;
3040   int skipbytes;
3041   int subreqbyte;
3042   int subfirstbyte;
3043   int terminator;
3044   int mclength;
3045   uschar mcbuffer[8];
3046
3047   /* Get next byte in the pattern */
3048
3049   c = *ptr;
3050
3051   /* If we are at the end of a nested substitution, revert to the outer level
3052   string. Nesting only happens one level deep. */
3053
3054   if (c == 0 && nestptr != NULL)
3055     {
3056     ptr = nestptr;
3057     nestptr = NULL;
3058     c = *ptr;
3059     }
3060
3061   /* If we are in the pre-compile phase, accumulate the length used for the
3062   previous cycle of this loop. */
3063
3064   if (lengthptr != NULL)
3065     {
3066 #ifdef PCRE_DEBUG
3067     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3068 #endif
3069     if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */
3070       {
3071       *errorcodeptr = ERR52;
3072       goto FAILED;
3073       }
3074
3075     /* There is at least one situation where code goes backwards: this is the
3076     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3077     the class is simply eliminated. However, it is created first, so we have to
3078     allow memory for it. Therefore, don't ever reduce the length at this point.
3079     */
3080
3081     if (code < last_code) code = last_code;
3082
3083     /* Paranoid check for integer overflow */
3084
3085     if (OFLOW_MAX - *lengthptr < code - last_code)
3086       {
3087       *errorcodeptr = ERR20;
3088       goto FAILED;
3089       }
3090
3091     *lengthptr += (int)(code - last_code);
3092     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3093
3094     /* If "previous" is set and it is not at the start of the work space, move
3095     it back to there, in order to avoid filling up the work space. Otherwise,
3096     if "previous" is NULL, reset the current code pointer to the start. */
3097
3098     if (previous != NULL)
3099       {
3100       if (previous > orig_code)
3101         {
3102         memmove(orig_code, previous, code - previous);
3103         code -= previous - orig_code;
3104         previous = orig_code;
3105         }
3106       }
3107     else code = orig_code;
3108
3109     /* Remember where this code item starts so we can pick up the length
3110     next time round. */
3111
3112     last_code = code;
3113     }
3114
3115   /* In the real compile phase, just check the workspace used by the forward
3116   reference list. */
3117
3118   else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3119     {
3120     *errorcodeptr = ERR52;
3121     goto FAILED;
3122     }
3123
3124   /* If in \Q...\E, check for the end; if not, we have a literal */
3125
3126   if (inescq && c != 0)
3127     {
3128     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3129       {
3130       inescq = FALSE;
3131       ptr++;
3132       continue;
3133       }
3134     else
3135       {
3136       if (previous_callout != NULL)
3137         {
3138         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
3139           complete_callout(previous_callout, ptr, cd);
3140         previous_callout = NULL;
3141         }
3142       if ((options & PCRE_AUTO_CALLOUT) != 0)
3143         {
3144         previous_callout = code;
3145         code = auto_callout(code, ptr, cd);
3146         }
3147       goto NORMAL_CHAR;
3148       }
3149     }
3150
3151   /* Fill in length of a previous callout, except when the next thing is
3152   a quantifier. */
3153
3154   is_quantifier =
3155     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3156     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3157
3158   if (!is_quantifier && previous_callout != NULL &&
3159        after_manual_callout-- <= 0)
3160     {
3161     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
3162       complete_callout(previous_callout, ptr, cd);
3163     previous_callout = NULL;
3164     }
3165
3166   /* In extended mode, skip white space and comments */
3167
3168   if ((options & PCRE_EXTENDED) != 0)
3169     {
3170     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3171     if (c == CHAR_NUMBER_SIGN)
3172       {
3173       ptr++;
3174       while (*ptr != 0)
3175         {
3176         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3177         ptr++;
3178 #ifdef SUPPORT_UTF8
3179         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3180 #endif
3181         }
3182       if (*ptr != 0) continue;
3183
3184       /* Else fall through to handle end of string */
3185       c = 0;
3186       }
3187     }
3188
3189   /* No auto callout for quantifiers. */
3190
3191   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3192     {
3193     previous_callout = code;
3194     code = auto_callout(code, ptr, cd);
3195     }
3196
3197   switch(c)
3198     {
3199     /* ===================================================================*/
3200     case 0:                        /* The branch terminates at string end */
3201     case CHAR_VERTICAL_LINE:       /* or | or ) */
3202     case CHAR_RIGHT_PARENTHESIS:
3203     *firstbyteptr = firstbyte;
3204     *reqbyteptr = reqbyte;
3205     *codeptr = code;
3206     *ptrptr = ptr;
3207     if (lengthptr != NULL)
3208       {
3209       if (OFLOW_MAX - *lengthptr < code - last_code)
3210         {
3211         *errorcodeptr = ERR20;
3212         goto FAILED;
3213         }
3214       *lengthptr += (int)(code - last_code);   /* To include callout length */
3215       DPRINTF((">> end branch\n"));
3216       }
3217     return TRUE;
3218
3219
3220     /* ===================================================================*/
3221     /* Handle single-character metacharacters. In multiline mode, ^ disables
3222     the setting of any following char as a first character. */
3223
3224     case CHAR_CIRCUMFLEX_ACCENT:
3225     if ((options & PCRE_MULTILINE) != 0)
3226       {
3227       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3228       }
3229     previous = NULL;
3230     *code++ = OP_CIRC;
3231     break;
3232
3233     case CHAR_DOLLAR_SIGN:
3234     previous = NULL;
3235     *code++ = OP_DOLL;
3236     break;
3237
3238     /* There can never be a first char if '.' is first, whatever happens about
3239     repeats. The value of reqbyte doesn't change either. */
3240
3241     case CHAR_DOT:
3242     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3243     zerofirstbyte = firstbyte;
3244     zeroreqbyte = reqbyte;
3245     previous = code;
3246     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3247     break;
3248
3249
3250     /* ===================================================================*/
3251     /* Character classes. If the included characters are all < 256, we build a
3252     32-byte bitmap of the permitted characters, except in the special case
3253     where there is only one such character. For negated classes, we build the
3254     map as usual, then invert it at the end. However, we use a different opcode
3255     so that data characters > 255 can be handled correctly.
3256
3257     If the class contains characters outside the 0-255 range, a different
3258     opcode is compiled. It may optionally have a bit map for characters < 256,
3259     but those above are explicitly listed afterwards. A flag byte tells
3260     whether the bitmap is present, and whether this is a negated class or not.
3261
3262     In JavaScript compatibility mode, an isolated ']' causes an error. In
3263     default (Perl) mode, it is treated as a data character. */
3264
3265     case CHAR_RIGHT_SQUARE_BRACKET:
3266     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3267       {
3268       *errorcodeptr = ERR64;
3269       goto FAILED;
3270       }
3271     goto NORMAL_CHAR;
3272
3273     case CHAR_LEFT_SQUARE_BRACKET:
3274     previous = code;
3275
3276     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3277     they are encountered at the top level, so we'll do that too. */
3278
3279     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3280          ptr[1] == CHAR_EQUALS_SIGN) &&
3281         check_posix_syntax(ptr, &tempptr))
3282       {
3283       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3284       goto FAILED;
3285       }
3286
3287     /* If the first character is '^', set the negation flag and skip it. Also,
3288     if the first few characters (either before or after ^) are \Q\E or \E we
3289     skip them too. This makes for compatibility with Perl. */
3290
3291     negate_class = FALSE;
3292     for (;;)
3293       {
3294       c = *(++ptr);
3295       if (c == CHAR_BACKSLASH)
3296         {
3297         if (ptr[1] == CHAR_E)
3298           ptr++;
3299         else if (strncmp((const char *)ptr+1,
3300                           STR_Q STR_BACKSLASH STR_E, 3) == 0)
3301           ptr += 3;
3302         else
3303           break;
3304         }
3305       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3306         negate_class = TRUE;
3307       else break;
3308       }
3309
3310     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3311     an initial ']' is taken as a data character -- the code below handles
3312     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3313     [^] must match any character, so generate OP_ALLANY. */
3314
3315     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3316         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3317       {
3318       *code++ = negate_class? OP_ALLANY : OP_FAIL;
3319       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3320       zerofirstbyte = firstbyte;
3321       break;
3322       }
3323
3324     /* If a class contains a negative special such as \S, we need to flip the
3325     negation flag at the end, so that support for characters > 255 works
3326     correctly (they are all included in the class). */
3327
3328     should_flip_negation = FALSE;
3329
3330     /* Keep a count of chars with values < 256 so that we can optimize the case
3331     of just a single character (as long as it's < 256). However, for higher
3332     valued UTF-8 characters, we don't yet do any optimization. */
3333
3334     class_charcount = 0;
3335     class_lastchar = -1;
3336
3337     /* Initialize the 32-char bit map to all zeros. We build the map in a
3338     temporary bit of memory, in case the class contains only 1 character (less
3339     than 256), because in that case the compiled code doesn't use the bit map.
3340     */
3341
3342     memset(classbits, 0, 32 * sizeof(uschar));
3343
3344 #ifdef SUPPORT_UTF8
3345     class_utf8 = FALSE;                       /* No chars >= 256 */
3346     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
3347     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
3348 #endif
3349
3350     /* Process characters until ] is reached. By writing this as a "do" it
3351     means that an initial ] is taken as a data character. At the start of the
3352     loop, c contains the first byte of the character. */
3353
3354     if (c != 0) do
3355       {
3356       const uschar *oldptr;
3357
3358 #ifdef SUPPORT_UTF8
3359       if (utf8 && c > 127)
3360         {                           /* Braces are required because the */
3361         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3362         }
3363
3364       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3365       data and reset the pointer. This is so that very large classes that
3366       contain a zillion UTF-8 characters no longer overwrite the work space
3367       (which is on the stack). */
3368
3369       if (lengthptr != NULL)
3370         {
3371         *lengthptr += class_utf8data - class_utf8data_base;
3372         class_utf8data = class_utf8data_base;
3373         }
3374
3375 #endif
3376
3377       /* Inside \Q...\E everything is literal except \E */
3378
3379       if (inescq)
3380         {
3381         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3382           {
3383           inescq = FALSE;                   /* Reset literal state */
3384           ptr++;                            /* Skip the 'E' */
3385           continue;                         /* Carry on with next */
3386           }
3387         goto CHECK_RANGE;                   /* Could be range if \E follows */
3388         }
3389
3390       /* Handle POSIX class names. Perl allows a negation extension of the
3391       form [:^name:]. A square bracket that doesn't match the syntax is
3392       treated as a literal. We also recognize the POSIX constructions
3393       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3394       5.6 and 5.8 do. */
3395
3396       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3397           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3398            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3399         {
3400         BOOL local_negate = FALSE;
3401         int posix_class, taboffset, tabopt;
3402         register const uschar *cbits = cd->cbits;
3403         uschar pbits[32];
3404
3405         if (ptr[1] != CHAR_COLON)
3406           {
3407           *errorcodeptr = ERR31;
3408           goto FAILED;
3409           }
3410
3411         ptr += 2;
3412         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3413           {
3414           local_negate = TRUE;
3415           should_flip_negation = TRUE;  /* Note negative special */
3416           ptr++;
3417           }
3418
3419         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3420         if (posix_class < 0)
3421           {
3422           *errorcodeptr = ERR30;
3423           goto FAILED;
3424           }
3425
3426         /* If matching is caseless, upper and lower are converted to
3427         alpha. This relies on the fact that the class table starts with
3428         alpha, lower, upper as the first 3 entries. */
3429
3430         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3431           posix_class = 0;
3432
3433         /* When PCRE_UCP is set, some of the POSIX classes are converted to
3434         different escape sequences that use Unicode properties. */
3435
3436 #ifdef SUPPORT_UCP
3437         if ((options & PCRE_UCP) != 0)
3438           {
3439           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3440           if (posix_substitutes[pc] != NULL)
3441             {
3442             nestptr = tempptr + 1;
3443             ptr = posix_substitutes[pc] - 1;
3444             continue;
3445             }
3446           }
3447 #endif
3448         /* In the non-UCP case, we build the bit map for the POSIX class in a
3449         chunk of local store because we may be adding and subtracting from it,
3450         and we don't want to subtract bits that may be in the main map already.
3451         At the end we or the result into the bit map that is being built. */
3452
3453         posix_class *= 3;
3454
3455         /* Copy in the first table (always present) */
3456
3457         memcpy(pbits, cbits + posix_class_maps[posix_class],
3458           32 * sizeof(uschar));
3459
3460         /* If there is a second table, add or remove it as required. */
3461
3462         taboffset = posix_class_maps[posix_class + 1];
3463         tabopt = posix_class_maps[posix_class + 2];
3464
3465         if (taboffset >= 0)
3466           {
3467           if (tabopt >= 0)
3468             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3469           else
3470             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3471           }
3472
3473         /* Now see if we need to remove any special characters. An option
3474         value of 1 removes vertical space and 2 removes underscore. */
3475
3476         if (tabopt < 0) tabopt = -tabopt;
3477         if (tabopt == 1) pbits[1] &= ~0x3c;
3478           else if (tabopt == 2) pbits[11] &= 0x7f;
3479
3480         /* Add the POSIX table or its complement into the main table that is
3481         being built and we are done. */
3482
3483         if (local_negate)
3484           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3485         else
3486           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3487
3488         ptr = tempptr + 1;
3489         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
3490         continue;    /* End of POSIX syntax handling */
3491         }
3492
3493       /* Backslash may introduce a single character, or it may introduce one
3494       of the specials, which just set a flag. The sequence \b is a special
3495       case. Inside a class (and only there) it is treated as backspace. We
3496       assume that other escapes have more than one character in them, so set
3497       class_charcount bigger than one. Unrecognized escapes fall through and
3498       are either treated as literal characters (by default), or are faulted if
3499       PCRE_EXTRA is set. */
3500
3501       if (c == CHAR_BACKSLASH)
3502         {
3503         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3504         if (*errorcodeptr != 0) goto FAILED;
3505
3506         if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3507         else if (-c == ESC_Q)            /* Handle start of quoted string */
3508           {
3509           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3510             {
3511             ptr += 2; /* avoid empty string */
3512             }
3513           else inescq = TRUE;
3514           continue;
3515           }
3516         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
3517
3518         if (c < 0)
3519           {
3520           register const uschar *cbits = cd->cbits;
3521           class_charcount += 2;     /* Greater than 1 is what matters */
3522
3523           switch (-c)
3524             {
3525 #ifdef SUPPORT_UCP
3526             case ESC_du:     /* These are the values given for \d etc */
3527             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
3528             case ESC_wu:     /* escape sequence with an appropriate \p */
3529             case ESC_WU:     /* or \P to test Unicode properties instead */
3530             case ESC_su:     /* of the default ASCII testing. */
3531             case ESC_SU:
3532             nestptr = ptr;
3533             ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3534             class_charcount -= 2;                /* Undo! */
3535             continue;
3536 #endif
3537             case ESC_d:
3538             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3539             continue;
3540
3541             case ESC_D:
3542             should_flip_negation = TRUE;
3543             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3544             continue;
3545
3546             case ESC_w:
3547             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3548             continue;
3549
3550             case ESC_W:
3551             should_flip_negation = TRUE;
3552             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3553             continue;
3554
3555             /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3556             if it was previously set by something earlier in the character
3557             class. */
3558
3559             case ESC_s:
3560             classbits[0] |= cbits[cbit_space];
3561             classbits[1] |= cbits[cbit_space+1] & ~0x08;
3562             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3563             continue;
3564
3565             case ESC_S:
3566             should_flip_negation = TRUE;
3567             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3568             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3569             continue;
3570
3571             case ESC_h:
3572             SETBIT(classbits, 0x09); /* HT */
3573             SETBIT(classbits, 0x20); /* SPACE */
3574             SETBIT(classbits, 0xa0); /* NBSP */
3575 #ifdef SUPPORT_UTF8
3576             if (utf8)
3577               {
3578               class_utf8 = TRUE;
3579               *class_utf8data++ = XCL_SINGLE;
3580               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3581               *class_utf8data++ = XCL_SINGLE;
3582               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3583               *class_utf8data++ = XCL_RANGE;
3584               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3585               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3586               *class_utf8data++ = XCL_SINGLE;
3587               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3588               *class_utf8data++ = XCL_SINGLE;
3589               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3590               *class_utf8data++ = XCL_SINGLE;
3591               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3592               }
3593 #endif
3594             continue;
3595
3596             case ESC_H:
3597             for (c = 0; c < 32; c++)
3598               {
3599               int x = 0xff;
3600               switch (c)
3601                 {
3602                 case 0x09/8: x ^= 1 << (0x09%8); break;
3603                 case 0x20/8: x ^= 1 << (0x20%8); break;
3604                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3605                 default: break;
3606                 }
3607               classbits[c] |= x;
3608               }
3609
3610 #ifdef SUPPORT_UTF8
3611             if (utf8)
3612               {
3613               class_utf8 = TRUE;
3614               *class_utf8data++ = XCL_RANGE;
3615               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3616               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3617               *class_utf8data++ = XCL_RANGE;
3618               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3619               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3620               *class_utf8data++ = XCL_RANGE;
3621               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3622               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3623               *class_utf8data++ = XCL_RANGE;
3624               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3625               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3626               *class_utf8data++ = XCL_RANGE;
3627               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3628               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3629               *class_utf8data++ = XCL_RANGE;
3630               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3631               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3632               *class_utf8data++ = XCL_RANGE;
3633               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3634               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3635               }
3636 #endif
3637             continue;
3638
3639             case ESC_v:
3640             SETBIT(classbits, 0x0a); /* LF */
3641             SETBIT(classbits, 0x0b); /* VT */
3642             SETBIT(classbits, 0x0c); /* FF */
3643             SETBIT(classbits, 0x0d); /* CR */
3644             SETBIT(classbits, 0x85); /* NEL */
3645 #ifdef SUPPORT_UTF8
3646             if (utf8)
3647               {
3648               class_utf8 = TRUE;
3649               *class_utf8data++ = XCL_RANGE;
3650               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3651               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3652               }
3653 #endif
3654             continue;
3655
3656             case ESC_V:
3657             for (c = 0; c < 32; c++)
3658               {
3659               int x = 0xff;
3660               switch (c)
3661                 {
3662                 case 0x0a/8: x ^= 1 << (0x0a%8);
3663                              x ^= 1 << (0x0b%8);
3664                              x ^= 1 << (0x0c%8);
3665                              x ^= 1 << (0x0d%8);
3666                              break;
3667                 case 0x85/8: x ^= 1 << (0x85%8); break;
3668                 default: break;
3669                 }
3670               classbits[c] |= x;
3671               }
3672
3673 #ifdef SUPPORT_UTF8
3674             if (utf8)
3675               {
3676               class_utf8 = TRUE;
3677               *class_utf8data++ = XCL_RANGE;
3678               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3679               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3680               *class_utf8data++ = XCL_RANGE;
3681               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3682               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3683               }
3684 #endif
3685             continue;
3686
3687 #ifdef SUPPORT_UCP
3688             case ESC_p:
3689             case ESC_P:
3690               {
3691               BOOL negated;
3692               int pdata;
3693               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3694               if (ptype < 0) goto FAILED;
3695               class_utf8 = TRUE;
3696               *class_utf8data++ = ((-c == ESC_p) != negated)?
3697                 XCL_PROP : XCL_NOTPROP;
3698               *class_utf8data++ = ptype;
3699               *class_utf8data++ = pdata;
3700               class_charcount -= 2;   /* Not a < 256 character */
3701               continue;
3702               }
3703 #endif
3704             /* Unrecognized escapes are faulted if PCRE is running in its
3705             strict mode. By default, for compatibility with Perl, they are
3706             treated as literals. */
3707
3708             default:
3709             if ((options & PCRE_EXTRA) != 0)
3710               {
3711               *errorcodeptr = ERR7;
3712               goto FAILED;
3713               }
3714             class_charcount -= 2;  /* Undo the default count from above */
3715             c = *ptr;              /* Get the final character and fall through */
3716             break;
3717             }
3718           }
3719
3720         /* Fall through if we have a single character (c >= 0). This may be
3721         greater than 256 in UTF-8 mode. */
3722
3723         }   /* End of backslash handling */
3724
3725       /* A single character may be followed by '-' to form a range. However,
3726       Perl does not permit ']' to be the end of the range. A '-' character
3727       at the end is treated as a literal. Perl ignores orphaned \E sequences
3728       entirely. The code for handling \Q and \E is messy. */
3729
3730       CHECK_RANGE:
3731       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3732         {
3733         inescq = FALSE;
3734         ptr += 2;
3735         }
3736
3737       oldptr = ptr;
3738
3739       /* Remember \r or \n */
3740
3741       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3742
3743       /* Check for range */
3744
3745       if (!inescq && ptr[1] == CHAR_MINUS)
3746         {
3747         int d;
3748         ptr += 2;
3749         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3750
3751         /* If we hit \Q (not followed by \E) at this point, go into escaped
3752         mode. */
3753
3754         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3755           {
3756           ptr += 2;
3757           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3758             { ptr += 2; continue; }
3759           inescq = TRUE;
3760           break;
3761           }
3762
3763         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3764           {
3765           ptr = oldptr;
3766           goto LONE_SINGLE_CHARACTER;
3767           }
3768
3769 #ifdef SUPPORT_UTF8
3770         if (utf8)
3771           {                           /* Braces are required because the */
3772           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3773           }
3774         else
3775 #endif
3776         d = *ptr;  /* Not UTF-8 mode */
3777
3778         /* The second part of a range can be a single-character escape, but
3779         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3780         in such circumstances. */
3781
3782         if (!inescq && d == CHAR_BACKSLASH)
3783           {
3784           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3785           if (*errorcodeptr != 0) goto FAILED;
3786
3787           /* \b is backspace; any other special means the '-' was literal */
3788
3789           if (d < 0)
3790             {
3791             if (d == -ESC_b) d = CHAR_BS; else
3792               {
3793               ptr = oldptr;
3794               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3795               }
3796             }
3797           }
3798
3799         /* Check that the two values are in the correct order. Optimize
3800         one-character ranges */
3801
3802         if (d < c)
3803           {
3804           *errorcodeptr = ERR8;
3805           goto FAILED;
3806           }
3807
3808         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3809
3810         /* Remember \r or \n */
3811
3812         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3813
3814         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3815         matching, we have to use an XCLASS with extra data items. Caseless
3816         matching for characters > 127 is available only if UCP support is
3817         available. */
3818
3819 #ifdef SUPPORT_UTF8
3820         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3821           {
3822           class_utf8 = TRUE;
3823
3824           /* With UCP support, we can find the other case equivalents of
3825           the relevant characters. There may be several ranges. Optimize how
3826           they fit with the basic range. */
3827
3828 #ifdef SUPPORT_UCP
3829           if ((options & PCRE_CASELESS) != 0)
3830             {
3831             unsigned int occ, ocd;
3832             unsigned int cc = c;
3833             unsigned int origd = d;
3834             while (get_othercase_range(&cc, origd, &occ, &ocd))
3835               {
3836               if (occ >= (unsigned int)c &&
3837                   ocd <= (unsigned int)d)
3838                 continue;                          /* Skip embedded ranges */
3839
3840               if (occ < (unsigned int)c  &&
3841                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3842                 {                                  /* if there is overlap,   */
3843                 c = occ;                           /* noting that if occ < c */
3844                 continue;                          /* we can't have ocd > d  */
3845                 }                                  /* because a subrange is  */
3846               if (ocd > (unsigned int)d &&
3847                   occ <= (unsigned int)d + 1)      /* always shorter than    */
3848                 {                                  /* the basic range.       */
3849                 d = ocd;
3850                 continue;
3851                 }
3852
3853               if (occ == ocd)
3854                 {
3855                 *class_utf8data++ = XCL_SINGLE;
3856                 }
3857               else
3858                 {
3859                 *class_utf8data++ = XCL_RANGE;
3860                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3861                 }
3862               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3863               }
3864             }
3865 #endif  /* SUPPORT_UCP */
3866
3867           /* Now record the original range, possibly modified for UCP caseless
3868           overlapping ranges. */
3869
3870           *class_utf8data++ = XCL_RANGE;
3871           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3872           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3873
3874           /* With UCP support, we are done. Without UCP support, there is no
3875           caseless matching for UTF-8 characters > 127; we can use the bit map
3876           for the smaller ones. */
3877
3878 #ifdef SUPPORT_UCP
3879           continue;    /* With next character in the class */
3880 #else
3881           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3882
3883           /* Adjust upper limit and fall through to set up the map */
3884
3885           d = 127;
3886
3887 #endif  /* SUPPORT_UCP */
3888           }
3889 #endif  /* SUPPORT_UTF8 */
3890
3891         /* We use the bit map for all cases when not in UTF-8 mode; else
3892         ranges that lie entirely within 0-127 when there is UCP support; else
3893         for partial ranges without UCP support. */
3894
3895         class_charcount += d - c + 1;
3896         class_lastchar = d;
3897
3898         /* We can save a bit of time by skipping this in the pre-compile. */
3899
3900         if (lengthptr == NULL) for (; c <= d; c++)
3901           {
3902           classbits[c/8] |= (1 << (c&7));
3903           if ((options & PCRE_CASELESS) != 0)
3904             {
3905             int uc = cd->fcc[c];           /* flip case */
3906             classbits[uc/8] |= (1 << (uc&7));
3907             }
3908           }
3909
3910         continue;   /* Go get the next char in the class */
3911         }
3912
3913       /* Handle a lone single character - we can get here for a normal
3914       non-escape char, or after \ that introduces a single character or for an
3915       apparent range that isn't. */
3916
3917       LONE_SINGLE_CHARACTER:
3918
3919       /* Handle a character that cannot go in the bit map */
3920
3921 #ifdef SUPPORT_UTF8
3922       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3923         {
3924         class_utf8 = TRUE;
3925         *class_utf8data++ = XCL_SINGLE;
3926         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3927
3928 #ifdef SUPPORT_UCP
3929         if ((options & PCRE_CASELESS) != 0)
3930           {
3931           unsigned int othercase;
3932           if ((othercase = UCD_OTHERCASE(c)) != c)
3933             {
3934             *class_utf8data++ = XCL_SINGLE;
3935             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3936             }
3937           }
3938 #endif  /* SUPPORT_UCP */
3939
3940         }
3941       else
3942 #endif  /* SUPPORT_UTF8 */
3943
3944       /* Handle a single-byte character */
3945         {
3946         classbits[c/8] |= (1 << (c&7));
3947         if ((options & PCRE_CASELESS) != 0)
3948           {
3949           c = cd->fcc[c];   /* flip case */
3950           classbits[c/8] |= (1 << (c&7));
3951           }
3952         class_charcount++;
3953         class_lastchar = c;
3954         }
3955       }
3956
3957     /* Loop until ']' reached. This "while" is the end of the "do" far above.
3958     If we are at the end of an internal nested string, revert to the outer
3959     string. */
3960
3961     while (((c = *(++ptr)) != 0 ||
3962            (nestptr != NULL &&
3963              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
3964            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3965
3966     /* Check for missing terminating ']' */
3967
3968     if (c == 0)
3969       {
3970       *errorcodeptr = ERR6;
3971       goto FAILED;
3972       }
3973
3974     /* If class_charcount is 1, we saw precisely one character whose value is
3975     less than 256. As long as there were no characters >= 128 and there was no
3976     use of \p or \P, in other words, no use of any XCLASS features, we can
3977     optimize.
3978
3979     In UTF-8 mode, we can optimize the negative case only if there were no
3980     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3981     operate on single-bytes only. This is an historical hangover. Maybe one day
3982     we can tidy these opcodes to handle multi-byte characters.
3983
3984     The optimization throws away the bit map. We turn the item into a
3985     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3986     that OP_NOT does not support multibyte characters. In the positive case, it
3987     can cause firstbyte to be set. Otherwise, there can be no first char if
3988     this item is first, whatever repeat count may follow. In the case of
3989     reqbyte, save the previous value for reinstating. */
3990
3991 #ifdef SUPPORT_UTF8
3992     if (class_charcount == 1 && !class_utf8 &&
3993       (!utf8 || !negate_class || class_lastchar < 128))
3994 #else
3995     if (class_charcount == 1)
3996 #endif
3997       {
3998       zeroreqbyte = reqbyte;
3999
4000       /* The OP_NOT opcode works on one-byte characters only. */
4001
4002       if (negate_class)
4003         {
4004         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4005         zerofirstbyte = firstbyte;
4006         *code++ = OP_NOT;
4007         *code++ = class_lastchar;
4008         break;
4009         }
4010
4011       /* For a single, positive character, get the value into mcbuffer, and
4012       then we can handle this with the normal one-character code. */
4013
4014 #ifdef SUPPORT_UTF8
4015       if (utf8 && class_lastchar > 127)
4016         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
4017       else
4018 #endif
4019         {
4020         mcbuffer[0] = class_lastchar;
4021         mclength = 1;
4022         }
4023       goto ONE_CHAR;
4024       }       /* End of 1-char optimization */
4025
4026     /* The general case - not the one-char optimization. If this is the first
4027     thing in the branch, there can be no first char setting, whatever the
4028     repeat count. Any reqbyte setting must remain unchanged after any kind of
4029     repeat. */
4030
4031     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4032     zerofirstbyte = firstbyte;
4033     zeroreqbyte = reqbyte;
4034
4035     /* If there are characters with values > 255, we have to compile an
4036     extended class, with its own opcode, unless there was a negated special
4037     such as \S in the class, and PCRE_UCP is not set, because in that case all
4038     characters > 255 are in the class, so any that were explicitly given as
4039     well can be ignored. If (when there are explicit characters > 255 that must
4040     be listed) there are no characters < 256, we can omit the bitmap in the
4041     actual compiled code. */
4042
4043 #ifdef SUPPORT_UTF8
4044     if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
4045       {
4046       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
4047       *code++ = OP_XCLASS;
4048       code += LINK_SIZE;
4049       *code = negate_class? XCL_NOT : 0;
4050
4051       /* If the map is required, move up the extra data to make room for it;
4052       otherwise just move the code pointer to the end of the extra data. */
4053
4054       if (class_charcount > 0)
4055         {
4056         *code++ |= XCL_MAP;
4057         memmove(code + 32, code, class_utf8data - code);
4058         memcpy(code, classbits, 32);
4059         code = class_utf8data + 32;
4060         }
4061       else code = class_utf8data;
4062
4063       /* Now fill in the complete length of the item */
4064
4065       PUT(previous, 1, code - previous);
4066       break;   /* End of class handling */
4067       }
4068 #endif
4069
4070     /* If there are no characters > 255, or they are all to be included or
4071     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4072     whole class was negated and whether there were negative specials such as \S
4073     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4074     negating it if necessary. */
4075
4076     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4077     if (negate_class)
4078       {
4079       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
4080         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
4081       }
4082     else
4083       {
4084       memcpy(code, classbits, 32);
4085       }
4086     code += 32;
4087     break;
4088
4089
4090     /* ===================================================================*/
4091     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4092     has been tested above. */
4093
4094     case CHAR_LEFT_CURLY_BRACKET:
4095     if (!is_quantifier) goto NORMAL_CHAR;
4096     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4097     if (*errorcodeptr != 0) goto FAILED;
4098     goto REPEAT;
4099
4100     case CHAR_ASTERISK:
4101     repeat_min = 0;
4102     repeat_max = -1;
4103     goto REPEAT;
4104
4105     case CHAR_PLUS:
4106     repeat_min = 1;
4107     repeat_max = -1;
4108     goto REPEAT;
4109
4110     case CHAR_QUESTION_MARK:
4111     repeat_min = 0;
4112     repeat_max = 1;
4113
4114     REPEAT:
4115     if (previous == NULL)
4116       {
4117       *errorcodeptr = ERR9;
4118       goto FAILED;
4119       }
4120
4121     if (repeat_min == 0)
4122       {
4123       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
4124       reqbyte = zeroreqbyte;        /* Ditto */
4125       }
4126
4127     /* Remember whether this is a variable length repeat */
4128
4129     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4130
4131     op_type = 0;                    /* Default single-char op codes */
4132     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4133
4134     /* Save start of previous item, in case we have to move it up to make space
4135     for an inserted OP_ONCE for the additional '+' extension. */
4136
4137     tempcode = previous;
4138
4139     /* If the next character is '+', we have a possessive quantifier. This
4140     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4141     If the next character is '?' this is a minimizing repeat, by default,
4142     but if PCRE_UNGREEDY is set, it works the other way round. We change the
4143     repeat type to the non-default. */
4144
4145     if (ptr[1] == CHAR_PLUS)
4146       {
4147       repeat_type = 0;                  /* Force greedy */
4148       possessive_quantifier = TRUE;
4149       ptr++;
4150       }
4151     else if (ptr[1] == CHAR_QUESTION_MARK)
4152       {
4153       repeat_type = greedy_non_default;
4154       ptr++;
4155       }
4156     else repeat_type = greedy_default;
4157
4158     /* If previous was a character match, abolish the item and generate a
4159     repeat item instead. If a char item has a minimum of more than one, ensure
4160     that it is set in reqbyte - it might not be if a sequence such as x{3} is
4161     the first thing in a branch because the x will have gone into firstbyte
4162     instead.  */
4163
4164     if (*previous == OP_CHAR || *previous == OP_CHARNC)
4165       {
4166       /* Deal with UTF-8 characters that take up more than one byte. It's
4167       easier to write this out separately than try to macrify it. Use c to
4168       hold the length of the character in bytes, plus 0x80 to flag that it's a
4169       length rather than a small character. */
4170
4171 #ifdef SUPPORT_UTF8
4172       if (utf8 && (code[-1] & 0x80) != 0)
4173         {
4174         uschar *lastchar = code - 1;
4175         while((*lastchar & 0xc0) == 0x80) lastchar--;
4176         c = code - lastchar;            /* Length of UTF-8 character */
4177         memcpy(utf8_char, lastchar, c); /* Save the char */
4178         c |= 0x80;                      /* Flag c as a length */
4179         }
4180       else
4181 #endif
4182
4183       /* Handle the case of a single byte - either with no UTF8 support, or
4184       with UTF-8 disabled, or for a UTF-8 character < 128. */
4185
4186         {
4187         c = code[-1];
4188         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
4189         }
4190
4191       /* If the repetition is unlimited, it pays to see if the next thing on
4192       the line is something that cannot possibly match this character. If so,
4193       automatically possessifying this item gains some performance in the case
4194       where the match fails. */
4195
4196       if (!possessive_quantifier &&
4197           repeat_max < 0 &&
4198           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4199         {
4200         repeat_type = 0;    /* Force greedy */
4201         possessive_quantifier = TRUE;
4202         }
4203
4204       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4205       }
4206
4207     /* If previous was a single negated character ([^a] or similar), we use
4208     one of the special opcodes, replacing it. The code is shared with single-
4209     character repeats by setting op_type to add a suitable offset into
4210     repeat_type. We can also test for auto-possessification. OP_NOT is
4211     currently used only for single-byte chars. */
4212
4213     else if (*previous == OP_NOT)
4214       {
4215       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
4216       c = previous[1];
4217       if (!possessive_quantifier &&
4218           repeat_max < 0 &&
4219           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4220         {
4221         repeat_type = 0;    /* Force greedy */
4222         possessive_quantifier = TRUE;
4223         }
4224       goto OUTPUT_SINGLE_REPEAT;
4225       }
4226
4227     /* If previous was a character type match (\d or similar), abolish it and
4228     create a suitable repeat item. The code is shared with single-character
4229     repeats by setting op_type to add a suitable offset into repeat_type. Note
4230     that the Unicode property types will be present only when SUPPORT_UCP is
4231     defined, but we don't wrap the little bits of code here because it just
4232     makes it horribly messy. */
4233
4234     else if (*previous < OP_EODN)
4235       {
4236       uschar *oldcode;
4237       int prop_type, prop_value;
4238       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4239       c = *previous;
4240
4241       if (!possessive_quantifier &&
4242           repeat_max < 0 &&
4243           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4244         {
4245         repeat_type = 0;    /* Force greedy */
4246         possessive_quantifier = TRUE;
4247         }
4248
4249       OUTPUT_SINGLE_REPEAT:
4250       if (*previous == OP_PROP || *previous == OP_NOTPROP)
4251         {
4252         prop_type = previous[1];
4253         prop_value = previous[2];
4254         }
4255       else prop_type = prop_value = -1;
4256
4257       oldcode = code;
4258       code = previous;                  /* Usually overwrite previous item */
4259
4260       /* If the maximum is zero then the minimum must also be zero; Perl allows
4261       this case, so we do too - by simply omitting the item altogether. */
4262
4263       if (repeat_max == 0) goto END_REPEAT;
4264
4265       /*--------------------------------------------------------------------*/
4266       /* This code is obsolete from release 8.00; the restriction was finally
4267       removed: */
4268
4269       /* All real repeats make it impossible to handle partial matching (maybe
4270       one day we will be able to remove this restriction). */
4271
4272       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4273       /*--------------------------------------------------------------------*/
4274
4275       /* Combine the op_type with the repeat_type */
4276
4277       repeat_type += op_type;
4278
4279       /* A minimum of zero is handled either as the special case * or ?, or as
4280       an UPTO, with the maximum given. */
4281
4282       if (repeat_min == 0)
4283         {
4284         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4285           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4286         else
4287           {
4288           *code++ = OP_UPTO + repeat_type;
4289           PUT2INC(code, 0, repeat_max);
4290           }
4291         }
4292
4293       /* A repeat minimum of 1 is optimized into some special cases. If the
4294       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4295       left in place and, if the maximum is greater than 1, we use OP_UPTO with
4296       one less than the maximum. */
4297
4298       else if (repeat_min == 1)
4299         {
4300         if (repeat_max == -1)
4301           *code++ = OP_PLUS + repeat_type;
4302         else
4303           {
4304           code = oldcode;                 /* leave previous item in place */
4305           if (repeat_max == 1) goto END_REPEAT;
4306           *code++ = OP_UPTO + repeat_type;
4307           PUT2INC(code, 0, repeat_max - 1);
4308           }
4309         }
4310
4311       /* The case {n,n} is just an EXACT, while the general case {n,m} is
4312       handled as an EXACT followed by an UPTO. */
4313
4314       else
4315         {
4316         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
4317         PUT2INC(code, 0, repeat_min);
4318
4319         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4320         we have to insert the character for the previous code. For a repeated
4321         Unicode property match, there are two extra bytes that define the
4322         required property. In UTF-8 mode, long characters have their length in
4323         c, with the 0x80 bit as a flag. */
4324
4325         if (repeat_max < 0)
4326           {
4327 #ifdef SUPPORT_UTF8
4328           if (utf8 && c >= 128)
4329             {
4330             memcpy(code, utf8_char, c & 7);
4331             code += c & 7;
4332             }
4333           else
4334 #endif
4335             {
4336             *code++ = c;
4337             if (prop_type >= 0)
4338               {
4339               *code++ = prop_type;
4340               *code++ = prop_value;
4341               }
4342             }
4343           *code++ = OP_STAR + repeat_type;
4344           }
4345
4346         /* Else insert an UPTO if the max is greater than the min, again
4347         preceded by the character, for the previously inserted code. If the
4348         UPTO is just for 1 instance, we can use QUERY instead. */
4349
4350         else if (repeat_max != repeat_min)
4351           {
4352 #ifdef SUPPORT_UTF8
4353           if (utf8 && c >= 128)
4354             {
4355             memcpy(code, utf8_char, c & 7);
4356             code += c & 7;
4357             }
4358           else
4359 #endif
4360           *code++ = c;
4361           if (prop_type >= 0)
4362             {
4363             *code++ = prop_type;
4364             *code++ = prop_value;
4365             }
4366           repeat_max -= repeat_min;
4367
4368           if (repeat_max == 1)
4369             {
4370             *code++ = OP_QUERY + repeat_type;
4371             }
4372           else
4373             {
4374             *code++ = OP_UPTO + repeat_type;
4375             PUT2INC(code, 0, repeat_max);
4376             }
4377           }
4378         }
4379
4380       /* The character or character type itself comes last in all cases. */
4381
4382 #ifdef SUPPORT_UTF8
4383       if (utf8 && c >= 128)
4384         {
4385         memcpy(code, utf8_char, c & 7);
4386         code += c & 7;
4387         }
4388       else
4389 #endif
4390       *code++ = c;
4391
4392       /* For a repeated Unicode property match, there are two extra bytes that
4393       define the required property. */
4394
4395 #ifdef SUPPORT_UCP
4396       if (prop_type >= 0)
4397         {
4398         *code++ = prop_type;
4399         *code++ = prop_value;
4400         }
4401 #endif
4402       }
4403
4404     /* If previous was a character class or a back reference, we put the repeat
4405     stuff after it, but just skip the item if the repeat was {0,0}. */
4406
4407     else if (*previous == OP_CLASS ||
4408              *previous == OP_NCLASS ||
4409 #ifdef SUPPORT_UTF8
4410              *previous == OP_XCLASS ||
4411 #endif
4412              *previous == OP_REF)
4413       {
4414       if (repeat_max == 0)
4415         {
4416         code = previous;
4417         goto END_REPEAT;
4418         }
4419
4420       /*--------------------------------------------------------------------*/
4421       /* This code is obsolete from release 8.00; the restriction was finally
4422       removed: */
4423
4424       /* All real repeats make it impossible to handle partial matching (maybe
4425       one day we will be able to remove this restriction). */
4426
4427       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4428       /*--------------------------------------------------------------------*/
4429
4430       if (repeat_min == 0 && repeat_max == -1)
4431         *code++ = OP_CRSTAR + repeat_type;
4432       else if (repeat_min == 1 && repeat_max == -1)
4433         *code++ = OP_CRPLUS + repeat_type;
4434       else if (repeat_min == 0 && repeat_max == 1)
4435         *code++ = OP_CRQUERY + repeat_type;
4436       else
4437         {
4438         *code++ = OP_CRRANGE + repeat_type;
4439         PUT2INC(code, 0, repeat_min);
4440         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
4441         PUT2INC(code, 0, repeat_max);
4442         }
4443       }
4444
4445     /* If previous was a bracket group, we may have to replicate it in certain
4446     cases. */
4447
4448     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
4449              *previous == OP_ONCE || *previous == OP_COND)
4450       {
4451       register int i;
4452       int ketoffset = 0;
4453       int len = (int)(code - previous);
4454       uschar *bralink = NULL;
4455
4456       /* Repeating a DEFINE group is pointless */
4457
4458       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4459         {
4460         *errorcodeptr = ERR55;
4461         goto FAILED;
4462         }
4463
4464       /* If the maximum repeat count is unlimited, find the end of the bracket
4465       by scanning through from the start, and compute the offset back to it
4466       from the current code pointer. There may be an OP_OPT setting following
4467       the final KET, so we can't find the end just by going back from the code
4468       pointer. */
4469
4470       if (repeat_max == -1)
4471         {
4472         register uschar *ket = previous;
4473         do ket += GET(ket, 1); while (*ket != OP_KET);
4474         ketoffset = (int)(code - ket);
4475         }
4476
4477       /* The case of a zero minimum is special because of the need to stick
4478       OP_BRAZERO in front of it, and because the group appears once in the
4479       data, whereas in other cases it appears the minimum number of times. For
4480       this reason, it is simplest to treat this case separately, as otherwise
4481       the code gets far too messy. There are several special subcases when the
4482       minimum is zero. */
4483
4484       if (repeat_min == 0)
4485         {
4486         /* If the maximum is also zero, we used to just omit the group from the
4487         output altogether, like this:
4488
4489         ** if (repeat_max == 0)
4490         **   {
4491         **   code = previous;
4492         **   goto END_REPEAT;
4493         **   }
4494
4495         However, that fails when a group is referenced as a subroutine from
4496         elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4497         so that it is skipped on execution. As we don't have a list of which
4498         groups are referenced, we cannot do this selectively.
4499
4500         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4501         and do no more at this point. However, we do need to adjust any
4502         OP_RECURSE calls inside the group that refer to the group itself or any
4503         internal or forward referenced group, because the offset is from the
4504         start of the whole regex. Temporarily terminate the pattern while doing
4505         this. */
4506
4507         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
4508           {
4509           *code = OP_END;
4510           adjust_recurse(previous, 1, utf8, cd, save_hwm);
4511           memmove(previous+1, previous, len);
4512           code++;
4513           if (repeat_max == 0)
4514             {
4515             *previous++ = OP_SKIPZERO;
4516             goto END_REPEAT;
4517             }
4518           *previous++ = OP_BRAZERO + repeat_type;
4519           }
4520
4521         /* If the maximum is greater than 1 and limited, we have to replicate
4522         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4523         The first one has to be handled carefully because it's the original
4524         copy, which has to be moved up. The remainder can be handled by code
4525         that is common with the non-zero minimum case below. We have to
4526         adjust the value of repeat_max, since one less copy is required. Once
4527         again, we may have to adjust any OP_RECURSE calls inside the group. */
4528
4529         else
4530           {
4531           int offset;
4532           *code = OP_END;
4533           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4534           memmove(previous + 2 + LINK_SIZE, previous, len);
4535           code += 2 + LINK_SIZE;
4536           *previous++ = OP_BRAZERO + repeat_type;
4537           *previous++ = OP_BRA;
4538
4539           /* We chain together the bracket offset fields that have to be
4540           filled in later when the ends of the brackets are reached. */
4541
4542           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
4543           bralink = previous;
4544           PUTINC(previous, 0, offset);
4545           }
4546
4547         repeat_max--;
4548         }
4549
4550       /* If the minimum is greater than zero, replicate the group as many
4551       times as necessary, and adjust the maximum to the number of subsequent
4552       copies that we need. If we set a first char from the group, and didn't
4553       set a required char, copy the latter from the former. If there are any
4554       forward reference subroutine calls in the group, there will be entries on
4555       the workspace list; replicate these with an appropriate increment. */
4556
4557       else
4558         {
4559         if (repeat_min > 1)
4560           {
4561           /* In the pre-compile phase, we don't actually do the replication. We
4562           just adjust the length as if we had. Do some paranoid checks for
4563           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4564           integer type when available, otherwise double. */
4565
4566           if (lengthptr != NULL)
4567             {
4568             int delta = (repeat_min - 1)*length_prevgroup;
4569             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4570                   (INT64_OR_DOUBLE)length_prevgroup >
4571                     (INT64_OR_DOUBLE)INT_MAX ||
4572                 OFLOW_MAX - *lengthptr < delta)
4573               {
4574               *errorcodeptr = ERR20;
4575               goto FAILED;
4576               }
4577             *lengthptr += delta;
4578             }
4579
4580           /* This is compiling for real */
4581
4582           else
4583             {
4584             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4585             for (i = 1; i < repeat_min; i++)
4586               {
4587               uschar *hc;
4588               uschar *this_hwm = cd->hwm;
4589               memcpy(code, previous, len);
4590               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4591                 {
4592                 PUT(cd->hwm, 0, GET(hc, 0) + len);
4593                 cd->hwm += LINK_SIZE;
4594                 }
4595               save_hwm = this_hwm;
4596               code += len;
4597               }
4598             }
4599           }
4600
4601         if (repeat_max > 0) repeat_max -= repeat_min;
4602         }
4603
4604       /* This code is common to both the zero and non-zero minimum cases. If
4605       the maximum is limited, it replicates the group in a nested fashion,
4606       remembering the bracket starts on a stack. In the case of a zero minimum,
4607       the first one was set up above. In all cases the repeat_max now specifies
4608       the number of additional copies needed. Again, we must remember to
4609       replicate entries on the forward reference list. */
4610
4611       if (repeat_max >= 0)
4612         {
4613         /* In the pre-compile phase, we don't actually do the replication. We
4614         just adjust the length as if we had. For each repetition we must add 1
4615         to the length for BRAZERO and for all but the last repetition we must
4616         add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
4617         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4618         a 64-bit integer type when available, otherwise double. */
4619
4620         if (lengthptr != NULL && repeat_max > 0)
4621           {
4622           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4623                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4624           if ((INT64_OR_DOUBLE)repeat_max *
4625                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4626                   > (INT64_OR_DOUBLE)INT_MAX ||
4627               OFLOW_MAX - *lengthptr < delta)
4628             {
4629             *errorcodeptr = ERR20;
4630             goto FAILED;
4631             }
4632           *lengthptr += delta;
4633           }
4634
4635         /* This is compiling for real */
4636
4637         else for (i = repeat_max - 1; i >= 0; i--)
4638           {
4639           uschar *hc;
4640           uschar *this_hwm = cd->hwm;
4641
4642           *code++ = OP_BRAZERO + repeat_type;
4643
4644           /* All but the final copy start a new nesting, maintaining the
4645           chain of brackets outstanding. */
4646
4647           if (i != 0)
4648             {
4649             int offset;
4650             *code++ = OP_BRA;
4651             offset = (bralink == NULL)? 0 : (int)(code - bralink);
4652             bralink = code;
4653             PUTINC(code, 0, offset);
4654             }
4655
4656           memcpy(code, previous, len);
4657           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4658             {
4659             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4660             cd->hwm += LINK_SIZE;
4661             }
4662           save_hwm = this_hwm;
4663           code += len;
4664           }
4665
4666         /* Now chain through the pending brackets, and fill in their length
4667         fields (which are holding the chain links pro tem). */
4668
4669         while (bralink != NULL)
4670           {
4671           int oldlinkoffset;
4672           int offset = (int)(code - bralink + 1);
4673           uschar *bra = code - offset;
4674           oldlinkoffset = GET(bra, 1);
4675           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4676           *code++ = OP_KET;
4677           PUTINC(code, 0, offset);
4678           PUT(bra, 1, offset);
4679           }
4680         }
4681
4682       /* If the maximum is unlimited, set a repeater in the final copy. We
4683       can't just offset backwards from the current code point, because we
4684       don't know if there's been an options resetting after the ket. The
4685       correct offset was computed above.
4686
4687       Then, when we are doing the actual compile phase, check to see whether
4688       this group is a non-atomic one that could match an empty string. If so,
4689       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4690       that runtime checking can be done. [This check is also applied to
4691       atomic groups at runtime, but in a different way.] */
4692
4693       else
4694         {
4695         uschar *ketcode = code - ketoffset;
4696         uschar *bracode = ketcode - GET(ketcode, 1);
4697         *ketcode = OP_KETRMAX + repeat_type;
4698         if (lengthptr == NULL && *bracode != OP_ONCE)
4699           {
4700           uschar *scode = bracode;
4701           do
4702             {
4703             if (could_be_empty_branch(scode, ketcode, utf8, cd))
4704               {
4705               *bracode += OP_SBRA - OP_BRA;
4706               break;
4707               }
4708             scode += GET(scode, 1);
4709             }
4710           while (*scode == OP_ALT);
4711           }
4712         }
4713       }
4714
4715     /* If previous is OP_FAIL, it was generated by an empty class [] in
4716     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4717     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4718     error above. We can just ignore the repeat in JS case. */
4719
4720     else if (*previous == OP_FAIL) goto END_REPEAT;
4721
4722     /* Else there's some kind of shambles */
4723
4724     else
4725       {
4726       *errorcodeptr = ERR11;
4727       goto FAILED;
4728       }
4729
4730     /* If the character following a repeat is '+', or if certain optimization
4731     tests above succeeded, possessive_quantifier is TRUE. For some of the
4732     simpler opcodes, there is a special alternative opcode for this. For
4733     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4734     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4735     but the special opcodes can optimize it a bit. The repeated item starts at
4736     tempcode, not at previous, which might be the first part of a string whose
4737     (former) last char we repeated.
4738
4739     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4740     an 'upto' may follow. We skip over an 'exact' item, and then test the
4741     length of what remains before proceeding. */
4742
4743     if (possessive_quantifier)
4744       {
4745       int len;
4746
4747       if (*tempcode == OP_TYPEEXACT)
4748         tempcode += _pcre_OP_lengths[*tempcode] +
4749           ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4750
4751       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4752         {
4753         tempcode += _pcre_OP_lengths[*tempcode];
4754 #ifdef SUPPORT_UTF8
4755         if (utf8 && tempcode[-1] >= 0xc0)
4756           tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4757 #endif
4758         }
4759
4760       len = (int)(code - tempcode);
4761       if (len > 0) switch (*tempcode)
4762         {
4763         case OP_STAR:  *tempcode = OP_POSSTAR; break;
4764         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4765         case OP_QUERY: *tempcode = OP_POSQUERY; break;
4766         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4767
4768         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4769         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4770         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4771         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4772
4773         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4774         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4775         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4776         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4777
4778         /* Because we are moving code along, we must ensure that any
4779         pending recursive references are updated. */
4780
4781         default:
4782         *code = OP_END;
4783         adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4784         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4785         code += 1 + LINK_SIZE;
4786         len += 1 + LINK_SIZE;
4787         tempcode[0] = OP_ONCE;
4788         *code++ = OP_KET;
4789         PUTINC(code, 0, len);
4790         PUT(tempcode, 1, len);
4791         break;
4792         }
4793       }
4794
4795     /* In all cases we no longer have a previous item. We also set the
4796     "follows varying string" flag for subsequently encountered reqbytes if
4797     it isn't already set and we have just passed a varying length item. */
4798
4799     END_REPEAT:
4800     previous = NULL;
4801     cd->req_varyopt |= reqvary;
4802     break;
4803
4804
4805     /* ===================================================================*/
4806     /* Start of nested parenthesized sub-expression, or comment or lookahead or
4807     lookbehind or option setting or condition or all the other extended
4808     parenthesis forms.  */
4809
4810     case CHAR_LEFT_PARENTHESIS:
4811     newoptions = options;
4812     skipbytes = 0;
4813     bravalue = OP_CBRA;
4814     save_hwm = cd->hwm;
4815     reset_bracount = FALSE;
4816
4817     /* First deal with various "verbs" that can be introduced by '*'. */
4818
4819     if (*(++ptr) == CHAR_ASTERISK &&
4820          ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
4821       {
4822       int i, namelen;
4823       int arglen = 0;
4824       const char *vn = verbnames;
4825       const uschar *name = ptr + 1;
4826       const uschar *arg = NULL;
4827       previous = NULL;
4828       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4829       namelen = (int)(ptr - name);
4830
4831       if (*ptr == CHAR_COLON)
4832         {
4833         arg = ++ptr;
4834         while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
4835           || *ptr == '_') ptr++;
4836         arglen = (int)(ptr - arg);
4837         }
4838
4839       if (*ptr != CHAR_RIGHT_PARENTHESIS)
4840         {
4841         *errorcodeptr = ERR60;
4842         goto FAILED;
4843         }
4844
4845       /* Scan the table of verb names */
4846
4847       for (i = 0; i < verbcount; i++)
4848         {
4849         if (namelen == verbs[i].len &&
4850             strncmp((char *)name, vn, namelen) == 0)
4851           {
4852           /* Check for open captures before ACCEPT */
4853
4854           if (verbs[i].op == OP_ACCEPT)
4855             {
4856             open_capitem *oc;
4857             cd->had_accept = TRUE;
4858             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4859               {
4860               *code++ = OP_CLOSE;
4861               PUT2INC(code, 0, oc->number);
4862               }
4863             }
4864
4865           /* Handle the cases with/without an argument */
4866
4867           if (arglen == 0)
4868             {
4869             if (verbs[i].op < 0)   /* Argument is mandatory */
4870               {
4871               *errorcodeptr = ERR66;
4872               goto FAILED;
4873               }
4874             *code = verbs[i].op;
4875             if (*code++ == OP_THEN)
4876               {
4877               PUT(code, 0, code - bcptr->current_branch - 1);
4878               code += LINK_SIZE;
4879               }
4880             }
4881
4882           else
4883             {
4884             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
4885               {
4886               *errorcodeptr = ERR59;
4887               goto FAILED;
4888               }
4889             *code = verbs[i].op_arg;
4890             if (*code++ == OP_THEN_ARG)
4891               {
4892               PUT(code, 0, code - bcptr->current_branch - 1);
4893               code += LINK_SIZE;
4894               }
4895             *code++ = arglen;
4896             memcpy(code, arg, arglen);
4897             code += arglen;
4898             *code++ = 0;
4899             }
4900
4901           break;  /* Found verb, exit loop */
4902           }
4903
4904         vn += verbs[i].len + 1;
4905         }
4906
4907       if (i < verbcount) continue;    /* Successfully handled a verb */
4908       *errorcodeptr = ERR60;          /* Verb not recognized */
4909       goto FAILED;
4910       }
4911
4912     /* Deal with the extended parentheses; all are introduced by '?', and the
4913     appearance of any of them means that this is not a capturing group. */
4914
4915     else if (*ptr == CHAR_QUESTION_MARK)
4916       {
4917       int i, set, unset, namelen;
4918       int *optset;
4919       const uschar *name;
4920       uschar *slot;
4921
4922       switch (*(++ptr))
4923         {
4924         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
4925         ptr++;
4926         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4927         if (*ptr == 0)
4928           {
4929           *errorcodeptr = ERR18;
4930           goto FAILED;
4931           }
4932         continue;
4933
4934
4935         /* ------------------------------------------------------------ */
4936         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
4937         reset_bracount = TRUE;
4938         /* Fall through */
4939
4940         /* ------------------------------------------------------------ */
4941         case CHAR_COLON:          /* Non-capturing bracket */
4942         bravalue = OP_BRA;
4943         ptr++;
4944         break;
4945
4946
4947         /* ------------------------------------------------------------ */
4948         case CHAR_LEFT_PARENTHESIS:
4949         bravalue = OP_COND;       /* Conditional group */
4950
4951         /* A condition can be an assertion, a number (referring to a numbered
4952         group), a name (referring to a named group), or 'R', referring to
4953         recursion. R<digits> and R&name are also permitted for recursion tests.
4954
4955         There are several syntaxes for testing a named group: (?(name)) is used
4956         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4957
4958         There are two unfortunate ambiguities, caused by history. (a) 'R' can
4959         be the recursive thing or the name 'R' (and similarly for 'R' followed
4960         by digits), and (b) a number could be a name that consists of digits.
4961         In both cases, we look for a name first; if not found, we try the other
4962         cases. */
4963
4964         /* For conditions that are assertions, check the syntax, and then exit
4965         the switch. This will take control down to where bracketed groups,
4966         including assertions, are processed. */
4967
4968         if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4969             ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4970           break;
4971
4972         /* Most other conditions use OP_CREF (a couple change to OP_RREF
4973         below), and all need to skip 3 bytes at the start of the group. */
4974
4975         code[1+LINK_SIZE] = OP_CREF;
4976         skipbytes = 3;
4977         refsign = -1;
4978
4979         /* Check for a test for recursion in a named group. */
4980
4981         if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4982           {
4983           terminator = -1;
4984           ptr += 2;
4985           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4986           }
4987
4988         /* Check for a test for a named group's having been set, using the Perl
4989         syntax (?(<name>) or (?('name') */
4990
4991         else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4992           {
4993           terminator = CHAR_GREATER_THAN_SIGN;
4994           ptr++;
4995           }
4996         else if (ptr[1] == CHAR_APOSTROPHE)
4997           {
4998           terminator = CHAR_APOSTROPHE;
4999           ptr++;
5000           }
5001         else
5002           {
5003           terminator = 0;
5004           if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5005           }
5006
5007         /* We now expect to read a name; anything else is an error */
5008
5009         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
5010           {
5011           ptr += 1;  /* To get the right offset */
5012           *errorcodeptr = ERR28;
5013           goto FAILED;
5014           }
5015
5016         /* Read the name, but also get it as a number if it's all digits */
5017
5018         recno = 0;
5019         name = ++ptr;
5020         while ((cd->ctypes[*ptr] & ctype_word) != 0)
5021           {
5022           if (recno >= 0)
5023             recno = ((digitab[*ptr] & ctype_digit) != 0)?
5024               recno * 10 + *ptr - CHAR_0 : -1;
5025           ptr++;
5026           }
5027         namelen = (int)(ptr - name);
5028
5029         if ((terminator > 0 && *ptr++ != terminator) ||
5030             *ptr++ != CHAR_RIGHT_PARENTHESIS)
5031           {
5032           ptr--;      /* Error offset */
5033           *errorcodeptr = ERR26;
5034           goto FAILED;
5035           }
5036
5037         /* Do no further checking in the pre-compile phase. */
5038
5039         if (lengthptr != NULL) break;
5040
5041         /* In the real compile we do the work of looking for the actual
5042         reference. If the string started with "+" or "-" we require the rest to
5043         be digits, in which case recno will be set. */
5044
5045         if (refsign > 0)
5046           {
5047           if (recno <= 0)
5048             {
5049             *errorcodeptr = ERR58;
5050             goto FAILED;
5051             }
5052           recno = (refsign == CHAR_MINUS)?
5053             cd->bracount - recno + 1 : recno +cd->bracount;
5054           if (recno <= 0 || recno > cd->final_bracount)
5055             {
5056             *errorcodeptr = ERR15;
5057             goto FAILED;
5058             }
5059           PUT2(code, 2+LINK_SIZE, recno);
5060           break;
5061           }
5062
5063         /* Otherwise (did not start with "+" or "-"), start by looking for the
5064         name. If we find a name, add one to the opcode to change OP_CREF or
5065         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5066         except they record that the reference was originally to a name. The
5067         information is used to check duplicate names. */
5068
5069         slot = cd->name_table;
5070         for (i = 0; i < cd->names_found; i++)
5071           {
5072           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
5073           slot += cd->name_entry_size;
5074           }
5075
5076         /* Found a previous named subpattern */
5077
5078         if (i < cd->names_found)
5079           {
5080           recno = GET2(slot, 0);
5081           PUT2(code, 2+LINK_SIZE, recno);
5082           code[1+LINK_SIZE]++;
5083           }
5084
5085         /* Search the pattern for a forward reference */
5086
5087         else if ((i = find_parens(cd, name, namelen,
5088                         (options & PCRE_EXTENDED) != 0, utf8)) > 0)
5089           {
5090           PUT2(code, 2+LINK_SIZE, i);
5091           code[1+LINK_SIZE]++;
5092           }
5093
5094         /* If terminator == 0 it means that the name followed directly after
5095         the opening parenthesis [e.g. (?(abc)...] and in this case there are
5096         some further alternatives to try. For the cases where terminator != 0
5097         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5098         now checked all the possibilities, so give an error. */
5099
5100         else if (terminator != 0)
5101           {
5102           *errorcodeptr = ERR15;
5103           goto FAILED;
5104           }
5105
5106         /* Check for (?(R) for recursion. Allow digits after R to specify a
5107         specific group number. */
5108
5109         else if (*name == CHAR_R)
5110           {
5111           recno = 0;
5112           for (i = 1; i < namelen; i++)
5113             {
5114             if ((digitab[name[i]] & ctype_digit) == 0)
5115               {
5116               *errorcodeptr = ERR15;
5117               goto FAILED;
5118               }
5119             recno = recno * 10 + name[i] - CHAR_0;
5120             }
5121           if (recno == 0) recno = RREF_ANY;
5122           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
5123           PUT2(code, 2+LINK_SIZE, recno);
5124           }
5125
5126         /* Similarly, check for the (?(DEFINE) "condition", which is always
5127         false. */
5128
5129         else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
5130           {
5131           code[1+LINK_SIZE] = OP_DEF;
5132           skipbytes = 1;
5133           }
5134
5135         /* Check for the "name" actually being a subpattern number. We are
5136         in the second pass here, so final_bracount is set. */
5137
5138         else if (recno > 0 && recno <= cd->final_bracount)
5139           {
5140           PUT2(code, 2+LINK_SIZE, recno);
5141           }
5142
5143         /* Either an unidentified subpattern, or a reference to (?(0) */
5144
5145         else
5146           {
5147           *errorcodeptr = (recno == 0)? ERR35: ERR15;
5148           goto FAILED;
5149           }
5150         break;
5151
5152
5153         /* ------------------------------------------------------------ */
5154         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5155         bravalue = OP_ASSERT;
5156         ptr++;
5157         break;
5158
5159
5160         /* ------------------------------------------------------------ */
5161         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
5162         ptr++;
5163         if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
5164           {
5165           *code++ = OP_FAIL;
5166           previous = NULL;
5167           continue;
5168           }
5169         bravalue = OP_ASSERT_NOT;
5170         break;
5171
5172
5173         /* ------------------------------------------------------------ */
5174         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
5175         switch (ptr[1])
5176           {
5177           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5178           bravalue = OP_ASSERTBACK;
5179           ptr += 2;
5180           break;
5181
5182           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5183           bravalue = OP_ASSERTBACK_NOT;
5184           ptr += 2;
5185           break;
5186
5187           default:                /* Could be name define, else bad */
5188           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
5189           ptr++;                  /* Correct offset for error */
5190           *errorcodeptr = ERR24;
5191           goto FAILED;
5192           }
5193         break;
5194
5195
5196         /* ------------------------------------------------------------ */
5197         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
5198         bravalue = OP_ONCE;
5199         ptr++;
5200         break;
5201
5202
5203         /* ------------------------------------------------------------ */
5204         case CHAR_C:                 /* Callout - may be followed by digits; */
5205         previous_callout = code;  /* Save for later completion */
5206         after_manual_callout = 1; /* Skip one item before completing */
5207         *code++ = OP_CALLOUT;
5208           {
5209           int n = 0;
5210           while ((digitab[*(++ptr)] & ctype_digit) != 0)
5211             n = n * 10 + *ptr - CHAR_0;
5212           if (*ptr != CHAR_RIGHT_PARENTHESIS)
5213             {
5214             *errorcodeptr = ERR39;
5215             goto FAILED;
5216             }
5217           if (n > 255)
5218             {
5219             *errorcodeptr = ERR38;
5220             goto FAILED;
5221             }
5222           *code++ = n;
5223           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
5224           PUT(code, LINK_SIZE, 0);                          /* Default length */
5225           code += 2 * LINK_SIZE;
5226           }
5227         previous = NULL;
5228         continue;
5229
5230
5231         /* ------------------------------------------------------------ */
5232         case CHAR_P:              /* Python-style named subpattern handling */
5233         if (*(++ptr) == CHAR_EQUALS_SIGN ||
5234             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
5235           {
5236           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
5237           terminator = CHAR_RIGHT_PARENTHESIS;
5238           goto NAMED_REF_OR_RECURSE;
5239           }
5240         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
5241           {
5242           *errorcodeptr = ERR41;
5243           goto FAILED;
5244           }
5245         /* Fall through to handle (?P< as (?< is handled */
5246
5247
5248         /* ------------------------------------------------------------ */
5249         DEFINE_NAME:    /* Come here from (?< handling */
5250         case CHAR_APOSTROPHE:
5251           {
5252           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
5253             CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5254           name = ++ptr;
5255
5256           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5257           namelen = (int)(ptr - name);
5258
5259           /* In the pre-compile phase, just do a syntax check. */
5260
5261           if (lengthptr != NULL)
5262             {
5263             if (*ptr != terminator)
5264               {
5265               *errorcodeptr = ERR42;
5266               goto FAILED;
5267               }
5268             if (cd->names_found >= MAX_NAME_COUNT)
5269               {
5270               *errorcodeptr = ERR49;
5271               goto FAILED;
5272               }
5273             if (namelen + 3 > cd->name_entry_size)
5274               {
5275               cd->name_entry_size = namelen + 3;
5276               if (namelen > MAX_NAME_SIZE)
5277                 {
5278                 *errorcodeptr = ERR48;
5279                 goto FAILED;
5280                 }
5281               }
5282             }
5283
5284           /* In the real compile, create the entry in the table, maintaining
5285           alphabetical order. Duplicate names for different numbers are
5286           permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
5287           number are always OK. (An existing number can be re-used if (?|
5288           appears in the pattern.) In either event, a duplicate name results in
5289           a duplicate entry in the table, even if the number is the same. This
5290           is because the number of names, and hence the table size, is computed
5291           in the pre-compile, and it affects various numbers and pointers which
5292           would all have to be modified, and the compiled code moved down, if
5293           duplicates with the same number were omitted from the table. This
5294           doesn't seem worth the hassle. However, *different* names for the
5295           same number are not permitted. */
5296
5297           else
5298             {
5299             BOOL dupname = FALSE;
5300             slot = cd->name_table;
5301
5302             for (i = 0; i < cd->names_found; i++)
5303               {
5304               int crc = memcmp(name, slot+2, namelen);
5305               if (crc == 0)
5306                 {
5307                 if (slot[2+namelen] == 0)
5308                   {
5309                   if (GET2(slot, 0) != cd->bracount + 1 &&
5310                       (options & PCRE_DUPNAMES) == 0)
5311                     {
5312                     *errorcodeptr = ERR43;
5313                     goto FAILED;
5314                     }
5315                   else dupname = TRUE;
5316                   }
5317                 else crc = -1;      /* Current name is a substring */
5318                 }
5319
5320               /* Make space in the table and break the loop for an earlier
5321               name. For a duplicate or later name, carry on. We do this for
5322               duplicates so that in the simple case (when (?| is not used) they
5323               are in order of their numbers. */
5324
5325               if (crc < 0)
5326                 {
5327                 memmove(slot + cd->name_entry_size, slot,
5328                   (cd->names_found - i) * cd->name_entry_size);
5329                 break;
5330                 }
5331
5332               /* Continue the loop for a later or duplicate name */
5333
5334               slot += cd->name_entry_size;
5335               }
5336
5337             /* For non-duplicate names, check for a duplicate number before
5338             adding the new name. */
5339
5340             if (!dupname)
5341               {
5342               uschar *cslot = cd->name_table;
5343               for (i = 0; i < cd->names_found; i++)
5344                 {
5345                 if (cslot != slot)
5346                   {
5347                   if (GET2(cslot, 0) == cd->bracount + 1)
5348                     {
5349                     *errorcodeptr = ERR65;
5350                     goto FAILED;
5351                     }
5352                   }
5353                 else i--;
5354                 cslot += cd->name_entry_size;
5355                 }
5356               }
5357
5358             PUT2(slot, 0, cd->bracount + 1);
5359             memcpy(slot + 2, name, namelen);
5360             slot[2+namelen] = 0;
5361             }
5362           }
5363
5364         /* In both pre-compile and compile, count the number of names we've
5365         encountered. */
5366
5367         cd->names_found++;
5368         ptr++;                    /* Move past > or ' */
5369         goto NUMBERED_GROUP;
5370
5371
5372         /* ------------------------------------------------------------ */
5373         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
5374         terminator = CHAR_RIGHT_PARENTHESIS;
5375         is_recurse = TRUE;
5376         /* Fall through */
5377
5378         /* We come here from the Python syntax above that handles both
5379         references (?P=name) and recursion (?P>name), as well as falling
5380         through from the Perl recursion syntax (?&name). We also come here from
5381         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5382         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5383
5384         NAMED_REF_OR_RECURSE:
5385         name = ++ptr;
5386         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5387         namelen = (int)(ptr - name);
5388
5389         /* In the pre-compile phase, do a syntax check. We used to just set
5390         a dummy reference number, because it was not used in the first pass.
5391         However, with the change of recursive back references to be atomic,
5392         we have to look for the number so that this state can be identified, as
5393         otherwise the incorrect length is computed. If it's not a backwards
5394         reference, the dummy number will do. */
5395
5396         if (lengthptr != NULL)
5397           {
5398           const uschar *temp;
5399
5400           if (namelen == 0)
5401             {
5402             *errorcodeptr = ERR62;
5403             goto FAILED;
5404             }
5405           if (*ptr != terminator)
5406             {
5407             *errorcodeptr = ERR42;
5408             goto FAILED;
5409             }
5410           if (namelen > MAX_NAME_SIZE)
5411             {
5412             *errorcodeptr = ERR48;
5413             goto FAILED;
5414             }
5415
5416           /* The name table does not exist in the first pass, so we cannot
5417           do a simple search as in the code below. Instead, we have to scan the
5418           pattern to find the number. It is important that we scan it only as
5419           far as we have got because the syntax of named subpatterns has not
5420           been checked for the rest of the pattern, and find_parens() assumes
5421           correct syntax. In any case, it's a waste of resources to scan
5422           further. We stop the scan at the current point by temporarily
5423           adjusting the value of cd->end_pattern. */
5424
5425           temp = cd->end_pattern;
5426           cd->end_pattern = ptr;
5427           recno = find_parens(cd, name, namelen,
5428             (options & PCRE_EXTENDED) != 0, utf8);
5429           cd->end_pattern = temp;
5430           if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
5431           }
5432
5433         /* In the real compile, seek the name in the table. We check the name
5434         first, and then check that we have reached the end of the name in the
5435         table. That way, if the name is longer than any in the table,
5436         the comparison will fail without reading beyond the table entry. */
5437
5438         else
5439           {
5440           slot = cd->name_table;
5441           for (i = 0; i < cd->names_found; i++)
5442             {
5443             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5444                 slot[2+namelen] == 0)
5445               break;
5446             slot += cd->name_entry_size;
5447             }
5448
5449           if (i < cd->names_found)         /* Back reference */
5450             {
5451             recno = GET2(slot, 0);
5452             }
5453           else if ((recno =                /* Forward back reference */
5454                     find_parens(cd, name, namelen,
5455                       (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
5456             {
5457             *errorcodeptr = ERR15;
5458             goto FAILED;
5459             }
5460           }
5461
5462         /* In both phases, we can now go to the code that handles numerical
5463         recursion or backreferences. */
5464
5465         if (is_recurse) goto HANDLE_RECURSION;
5466           else goto HANDLE_REFERENCE;
5467
5468
5469         /* ------------------------------------------------------------ */
5470         case CHAR_R:              /* Recursion */
5471         ptr++;                    /* Same as (?0)      */
5472         /* Fall through */
5473
5474
5475         /* ------------------------------------------------------------ */
5476         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
5477         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5478         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5479           {
5480           const uschar *called;
5481           terminator = CHAR_RIGHT_PARENTHESIS;
5482
5483           /* Come here from the \g<...> and \g'...' code (Oniguruma
5484           compatibility). However, the syntax has been checked to ensure that
5485           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5486           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5487           ever be taken. */
5488
5489           HANDLE_NUMERICAL_RECURSION:
5490
5491           if ((refsign = *ptr) == CHAR_PLUS)
5492             {
5493             ptr++;
5494             if ((digitab[*ptr] & ctype_digit) == 0)
5495               {
5496               *errorcodeptr = ERR63;
5497               goto FAILED;
5498               }
5499             }
5500           else if (refsign == CHAR_MINUS)
5501             {
5502             if ((digitab[ptr[1]] & ctype_digit) == 0)
5503               goto OTHER_CHAR_AFTER_QUERY;
5504             ptr++;
5505             }
5506
5507           recno = 0;
5508           while((digitab[*ptr] & ctype_digit) != 0)
5509             recno = recno * 10 + *ptr++ - CHAR_0;
5510
5511           if (*ptr != terminator)
5512             {
5513             *errorcodeptr = ERR29;
5514             goto FAILED;
5515             }
5516
5517           if (refsign == CHAR_MINUS)
5518             {
5519             if (recno == 0)
5520               {
5521               *errorcodeptr = ERR58;
5522               goto FAILED;
5523               }
5524             recno = cd->bracount - recno + 1;
5525             if (recno <= 0)
5526               {
5527               *errorcodeptr = ERR15;
5528               goto FAILED;
5529               }
5530             }
5531           else if (refsign == CHAR_PLUS)
5532             {
5533             if (recno == 0)
5534               {
5535               *errorcodeptr = ERR58;
5536               goto FAILED;
5537               }
5538             recno += cd->bracount;
5539             }
5540
5541           /* Come here from code above that handles a named recursion */
5542
5543           HANDLE_RECURSION:
5544
5545           previous = code;
5546           called = cd->start_code;
5547
5548           /* When we are actually compiling, find the bracket that is being
5549           referenced. Temporarily end the regex in case it doesn't exist before
5550           this point. If we end up with a forward reference, first check that
5551           the bracket does occur later so we can give the error (and position)
5552           now. Then remember this forward reference in the workspace so it can
5553           be filled in at the end. */
5554
5555           if (lengthptr == NULL)
5556             {
5557             *code = OP_END;
5558             if (recno != 0)
5559               called = _pcre_find_bracket(cd->start_code, utf8, recno);
5560
5561             /* Forward reference */
5562
5563             if (called == NULL)
5564               {
5565               if (find_parens(cd, NULL, recno,
5566                     (options & PCRE_EXTENDED) != 0, utf8) < 0)
5567                 {
5568                 *errorcodeptr = ERR15;
5569                 goto FAILED;
5570                 }
5571
5572               /* Fudge the value of "called" so that when it is inserted as an
5573               offset below, what is actually inserted is the reference number
5574               of the group. */
5575
5576               called = cd->start_code + recno;
5577               PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));
5578               }
5579
5580             /* If not a forward reference, and the subpattern is still open,
5581             this is a recursive call. We check to see if this is a left
5582             recursion that could loop for ever, and diagnose that case. */
5583
5584             else if (GET(called, 1) == 0 &&
5585                      could_be_empty(called, code, bcptr, utf8, cd))
5586               {
5587               *errorcodeptr = ERR40;
5588               goto FAILED;
5589               }
5590             }
5591
5592           /* Insert the recursion/subroutine item, automatically wrapped inside
5593           "once" brackets. Set up a "previous group" length so that a
5594           subsequent quantifier will work. */
5595
5596           *code = OP_ONCE;
5597           PUT(code, 1, 2 + 2*LINK_SIZE);
5598           code += 1 + LINK_SIZE;
5599
5600           *code = OP_RECURSE;
5601           PUT(code, 1, (int)(called - cd->start_code));
5602           code += 1 + LINK_SIZE;
5603
5604           *code = OP_KET;
5605           PUT(code, 1, 2 + 2*LINK_SIZE);
5606           code += 1 + LINK_SIZE;
5607
5608           length_prevgroup = 3 + 3*LINK_SIZE;
5609           }
5610
5611         /* Can't determine a first byte now */
5612
5613         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5614         continue;
5615
5616
5617         /* ------------------------------------------------------------ */
5618         default:              /* Other characters: check option setting */
5619         OTHER_CHAR_AFTER_QUERY:
5620         set = unset = 0;
5621         optset = &set;
5622
5623         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5624           {
5625           switch (*ptr++)
5626             {
5627             case CHAR_MINUS: optset = &unset; break;
5628
5629             case CHAR_J:    /* Record that it changed in the external options */
5630             *optset |= PCRE_DUPNAMES;
5631             cd->external_flags |= PCRE_JCHANGED;
5632             break;
5633
5634             case CHAR_i: *optset |= PCRE_CASELESS; break;
5635             case CHAR_m: *optset |= PCRE_MULTILINE; break;
5636             case CHAR_s: *optset |= PCRE_DOTALL; break;
5637             case CHAR_x: *optset |= PCRE_EXTENDED; break;
5638             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5639             case CHAR_X: *optset |= PCRE_EXTRA; break;
5640
5641             default:  *errorcodeptr = ERR12;
5642                       ptr--;    /* Correct the offset */
5643                       goto FAILED;
5644             }
5645           }
5646
5647         /* Set up the changed option bits, but don't change anything yet. */
5648
5649         newoptions = (options | set) & (~unset);
5650
5651         /* If the options ended with ')' this is not the start of a nested
5652         group with option changes, so the options change at this level. If this
5653         item is right at the start of the pattern, the options can be
5654         abstracted and made external in the pre-compile phase, and ignored in
5655         the compile phase. This can be helpful when matching -- for instance in
5656         caseless checking of required bytes.
5657
5658         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5659         definitely *not* at the start of the pattern because something has been
5660         compiled. In the pre-compile phase, however, the code pointer can have
5661         that value after the start, because it gets reset as code is discarded
5662         during the pre-compile. However, this can happen only at top level - if
5663         we are within parentheses, the starting BRA will still be present. At
5664         any parenthesis level, the length value can be used to test if anything
5665         has been compiled at that level. Thus, a test for both these conditions
5666         is necessary to ensure we correctly detect the start of the pattern in
5667         both phases.
5668
5669         If we are not at the pattern start, compile code to change the ims
5670         options if this setting actually changes any of them, and reset the
5671         greedy defaults and the case value for firstbyte and reqbyte. */
5672
5673         if (*ptr == CHAR_RIGHT_PARENTHESIS)
5674           {
5675           if (code == cd->start_code + 1 + LINK_SIZE &&
5676                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5677             {
5678             cd->external_options = newoptions;
5679             }
5680           else
5681             {
5682             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5683               {
5684               *code++ = OP_OPT;
5685               *code++ = newoptions & PCRE_IMS;
5686               }
5687             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5688             greedy_non_default = greedy_default ^ 1;
5689             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5690             }
5691
5692           /* Change options at this level, and pass them back for use
5693           in subsequent branches. When not at the start of the pattern, this
5694           information is also necessary so that a resetting item can be
5695           compiled at the end of a group (if we are in a group). */
5696
5697           *optionsptr = options = newoptions;
5698           previous = NULL;       /* This item can't be repeated */
5699           continue;              /* It is complete */
5700           }
5701
5702         /* If the options ended with ':' we are heading into a nested group
5703         with possible change of options. Such groups are non-capturing and are
5704         not assertions of any kind. All we need to do is skip over the ':';
5705         the newoptions value is handled below. */
5706
5707         bravalue = OP_BRA;
5708         ptr++;
5709         }     /* End of switch for character following (? */
5710       }       /* End of (? handling */
5711
5712     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
5713     is set, all unadorned brackets become non-capturing and behave like (?:...)
5714     brackets. */
5715
5716     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5717       {
5718       bravalue = OP_BRA;
5719       }
5720
5721     /* Else we have a capturing group. */
5722
5723     else
5724       {
5725       NUMBERED_GROUP:
5726       cd->bracount += 1;
5727       PUT2(code, 1+LINK_SIZE, cd->bracount);
5728       skipbytes = 2;
5729       }
5730
5731     /* Process nested bracketed regex. Assertions may not be repeated, but
5732     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5733     non-register variable in order to be able to pass its address because some
5734     compilers complain otherwise. Pass in a new setting for the ims options if
5735     they have changed. */
5736
5737     previous = (bravalue >= OP_ONCE)? code : NULL;
5738     *code = bravalue;
5739     tempcode = code;
5740     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
5741     length_prevgroup = 0;              /* Initialize for pre-compile phase */
5742
5743     if (!compile_regex(
5744          newoptions,                   /* The complete new option state */
5745          options & PCRE_IMS,           /* The previous ims option state */
5746          &tempcode,                    /* Where to put code (updated) */
5747          &ptr,                         /* Input pointer (updated) */
5748          errorcodeptr,                 /* Where to put an error message */
5749          (bravalue == OP_ASSERTBACK ||
5750           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5751          reset_bracount,               /* True if (?| group */
5752          skipbytes,                    /* Skip over bracket number */
5753          &subfirstbyte,                /* For possible first char */
5754          &subreqbyte,                  /* For possible last char */
5755          bcptr,                        /* Current branch chain */
5756          cd,                           /* Tables block */
5757          (lengthptr == NULL)? NULL :   /* Actual compile phase */
5758            &length_prevgroup           /* Pre-compile phase */
5759          ))
5760       goto FAILED;
5761
5762     /* At the end of compiling, code is still pointing to the start of the
5763     group, while tempcode has been updated to point past the end of the group
5764     and any option resetting that may follow it. The pattern pointer (ptr)
5765     is on the bracket. */
5766
5767     /* If this is a conditional bracket, check that there are no more than
5768     two branches in the group, or just one if it's a DEFINE group. We do this
5769     in the real compile phase, not in the pre-pass, where the whole group may
5770     not be available. */
5771
5772     if (bravalue == OP_COND && lengthptr == NULL)
5773       {
5774       uschar *tc = code;
5775       int condcount = 0;
5776
5777       do {
5778          condcount++;
5779          tc += GET(tc,1);
5780          }
5781       while (*tc != OP_KET);
5782
5783       /* A DEFINE group is never obeyed inline (the "condition" is always
5784       false). It must have only one branch. */
5785
5786       if (code[LINK_SIZE+1] == OP_DEF)
5787         {
5788         if (condcount > 1)
5789           {
5790           *errorcodeptr = ERR54;
5791           goto FAILED;
5792           }
5793         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
5794         }
5795
5796       /* A "normal" conditional group. If there is just one branch, we must not
5797       make use of its firstbyte or reqbyte, because this is equivalent to an
5798       empty second branch. */
5799
5800       else
5801         {
5802         if (condcount > 2)
5803           {
5804           *errorcodeptr = ERR27;
5805           goto FAILED;
5806           }
5807         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5808         }
5809       }
5810
5811     /* Error if hit end of pattern */
5812
5813     if (*ptr != CHAR_RIGHT_PARENTHESIS)
5814       {
5815       *errorcodeptr = ERR14;
5816       goto FAILED;
5817       }
5818
5819     /* In the pre-compile phase, update the length by the length of the group,
5820     less the brackets at either end. Then reduce the compiled code to just a
5821     set of non-capturing brackets so that it doesn't use much memory if it is
5822     duplicated by a quantifier.*/
5823
5824     if (lengthptr != NULL)
5825       {
5826       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5827         {
5828         *errorcodeptr = ERR20;
5829         goto FAILED;
5830         }
5831       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5832       *code++ = OP_BRA;
5833       PUTINC(code, 0, 1 + LINK_SIZE);
5834       *code++ = OP_KET;
5835       PUTINC(code, 0, 1 + LINK_SIZE);
5836       break;    /* No need to waste time with special character handling */
5837       }
5838
5839     /* Otherwise update the main code pointer to the end of the group. */
5840
5841     code = tempcode;
5842
5843     /* For a DEFINE group, required and first character settings are not
5844     relevant. */
5845
5846     if (bravalue == OP_DEF) break;
5847
5848     /* Handle updating of the required and first characters for other types of
5849     group. Update for normal brackets of all kinds, and conditions with two
5850     branches (see code above). If the bracket is followed by a quantifier with
5851     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5852     zerofirstbyte outside the main loop so that they can be accessed for the
5853     back off. */
5854
5855     zeroreqbyte = reqbyte;
5856     zerofirstbyte = firstbyte;
5857     groupsetfirstbyte = FALSE;
5858
5859     if (bravalue >= OP_ONCE)
5860       {
5861       /* If we have not yet set a firstbyte in this branch, take it from the
5862       subpattern, remembering that it was set here so that a repeat of more
5863       than one can replicate it as reqbyte if necessary. If the subpattern has
5864       no firstbyte, set "none" for the whole branch. In both cases, a zero
5865       repeat forces firstbyte to "none". */
5866
5867       if (firstbyte == REQ_UNSET)
5868         {
5869         if (subfirstbyte >= 0)
5870           {
5871           firstbyte = subfirstbyte;
5872           groupsetfirstbyte = TRUE;
5873           }
5874         else firstbyte = REQ_NONE;
5875         zerofirstbyte = REQ_NONE;
5876         }
5877
5878       /* If firstbyte was previously set, convert the subpattern's firstbyte
5879       into reqbyte if there wasn't one, using the vary flag that was in
5880       existence beforehand. */
5881
5882       else if (subfirstbyte >= 0 && subreqbyte < 0)
5883         subreqbyte = subfirstbyte | tempreqvary;
5884
5885       /* If the subpattern set a required byte (or set a first byte that isn't
5886       really the first byte - see above), set it. */
5887
5888       if (subreqbyte >= 0) reqbyte = subreqbyte;
5889       }
5890
5891     /* For a forward assertion, we take the reqbyte, if set. This can be
5892     helpful if the pattern that follows the assertion doesn't set a different
5893     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5894     for an assertion, however because it leads to incorrect effect for patterns
5895     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5896     of a firstbyte. This is overcome by a scan at the end if there's no
5897     firstbyte, looking for an asserted first char. */
5898
5899     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5900     break;     /* End of processing '(' */
5901
5902
5903     /* ===================================================================*/
5904     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5905     are arranged to be the negation of the corresponding OP_values in the
5906     default case when PCRE_UCP is not set. For the back references, the values
5907     are ESC_REF plus the reference number. Only back references and those types
5908     that consume a character may be repeated. We can test for values between
5909     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
5910     ever created. */
5911
5912     case CHAR_BACKSLASH:
5913     tempptr = ptr;
5914     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5915     if (*errorcodeptr != 0) goto FAILED;
5916
5917     if (c < 0)
5918       {
5919       if (-c == ESC_Q)            /* Handle start of quoted string */
5920         {
5921         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5922           ptr += 2;               /* avoid empty string */
5923             else inescq = TRUE;
5924         continue;
5925         }
5926
5927       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5928
5929       /* For metasequences that actually match a character, we disable the
5930       setting of a first character if it hasn't already been set. */
5931
5932       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5933         firstbyte = REQ_NONE;
5934
5935       /* Set values to reset to if this is followed by a zero repeat. */
5936
5937       zerofirstbyte = firstbyte;
5938       zeroreqbyte = reqbyte;
5939
5940       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5941       is a subroutine call by number (Oniguruma syntax). In fact, the value
5942       -ESC_g is returned only for these cases. So we don't need to check for <
5943       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5944       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5945       that is a synonym for a named back reference). */
5946
5947       if (-c == ESC_g)
5948         {
5949         const uschar *p;
5950         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5951         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5952           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5953
5954         /* These two statements stop the compiler from warning about possibly
5955         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5956         fact, because we actually check for a number below, the paths that
5957         would actually be in error are never taken. */
5958
5959         skipbytes = 0;
5960         reset_bracount = FALSE;
5961
5962         /* Test for a name */
5963
5964         if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5965           {
5966           BOOL isnumber = TRUE;
5967           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5968             {
5969             if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5970             if ((cd->ctypes[*p] & ctype_word) == 0) break;
5971             }
5972           if (*p != terminator)
5973             {
5974             *errorcodeptr = ERR57;
5975             break;
5976             }
5977           if (isnumber)
5978             {
5979             ptr++;
5980             goto HANDLE_NUMERICAL_RECURSION;
5981             }
5982           is_recurse = TRUE;
5983           goto NAMED_REF_OR_RECURSE;
5984           }
5985
5986         /* Test a signed number in angle brackets or quotes. */
5987
5988         p = ptr + 2;
5989         while ((digitab[*p] & ctype_digit) != 0) p++;
5990         if (*p != terminator)
5991           {
5992           *errorcodeptr = ERR57;
5993           break;
5994           }
5995         ptr++;
5996         goto HANDLE_NUMERICAL_RECURSION;
5997         }
5998
5999       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6000       We also support \k{name} (.NET syntax) */
6001
6002       if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
6003           ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
6004         {
6005         is_recurse = FALSE;
6006         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6007           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6008           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6009         goto NAMED_REF_OR_RECURSE;
6010         }
6011
6012       /* Back references are handled specially; must disable firstbyte if
6013       not set to cope with cases like (?=(\w+))\1: which would otherwise set
6014       ':' later. */
6015
6016       if (-c >= ESC_REF)
6017         {
6018         open_capitem *oc;
6019         recno = -c - ESC_REF;
6020
6021         HANDLE_REFERENCE:    /* Come here from named backref handling */
6022         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
6023         previous = code;
6024         *code++ = OP_REF;
6025         PUT2INC(code, 0, recno);
6026         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6027         if (recno > cd->top_backref) cd->top_backref = recno;
6028
6029         /* Check to see if this back reference is recursive, that is, it
6030         is inside the group that it references. A flag is set so that the
6031         group can be made atomic. */
6032
6033         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6034           {
6035           if (oc->number == recno)
6036             {
6037             oc->flag = TRUE;
6038             break;
6039             }
6040           }
6041         }
6042
6043       /* So are Unicode property matches, if supported. */
6044
6045 #ifdef SUPPORT_UCP
6046       else if (-c == ESC_P || -c == ESC_p)
6047         {
6048         BOOL negated;
6049         int pdata;
6050         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6051         if (ptype < 0) goto FAILED;
6052         previous = code;
6053         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
6054         *code++ = ptype;
6055         *code++ = pdata;
6056         }
6057 #else
6058
6059       /* If Unicode properties are not supported, \X, \P, and \p are not
6060       allowed. */
6061
6062       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
6063         {
6064         *errorcodeptr = ERR45;
6065         goto FAILED;
6066         }
6067 #endif
6068
6069       /* For the rest (including \X when Unicode properties are supported), we
6070       can obtain the OP value by negating the escape value in the default
6071       situation when PCRE_UCP is not set. When it *is* set, we substitute
6072       Unicode property tests. */
6073
6074       else
6075         {
6076 #ifdef SUPPORT_UCP
6077         if (-c >= ESC_DU && -c <= ESC_wu)
6078           {
6079           nestptr = ptr + 1;                   /* Where to resume */
6080           ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
6081           }
6082         else
6083 #endif
6084           {
6085           previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6086           *code++ = -c;
6087           }
6088         }
6089       continue;
6090       }
6091
6092     /* We have a data character whose value is in c. In UTF-8 mode it may have
6093     a value > 127. We set its representation in the length/buffer, and then
6094     handle it as a data character. */
6095
6096 #ifdef SUPPORT_UTF8
6097     if (utf8 && c > 127)
6098       mclength = _pcre_ord2utf8(c, mcbuffer);
6099     else
6100 #endif
6101
6102      {
6103      mcbuffer[0] = c;
6104      mclength = 1;
6105      }
6106     goto ONE_CHAR;
6107
6108
6109     /* ===================================================================*/
6110     /* Handle a literal character. It is guaranteed not to be whitespace or #
6111     when the extended flag is set. If we are in UTF-8 mode, it may be a
6112     multi-byte literal character. */
6113
6114     default:
6115     NORMAL_CHAR:
6116     mclength = 1;
6117     mcbuffer[0] = c;
6118
6119 #ifdef SUPPORT_UTF8
6120     if (utf8 && c >= 0xc0)
6121       {
6122       while ((ptr[1] & 0xc0) == 0x80)
6123         mcbuffer[mclength++] = *(++ptr);
6124       }
6125 #endif
6126
6127     /* At this point we have the character's bytes in mcbuffer, and the length
6128     in mclength. When not in UTF-8 mode, the length is always 1. */
6129
6130     ONE_CHAR:
6131     previous = code;
6132     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
6133     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6134
6135     /* Remember if \r or \n were seen */
6136
6137     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
6138       cd->external_flags |= PCRE_HASCRORLF;
6139
6140     /* Set the first and required bytes appropriately. If no previous first
6141     byte, set it from this character, but revert to none on a zero repeat.
6142     Otherwise, leave the firstbyte value alone, and don't change it on a zero
6143     repeat. */
6144
6145     if (firstbyte == REQ_UNSET)
6146       {
6147       zerofirstbyte = REQ_NONE;
6148       zeroreqbyte = reqbyte;
6149
6150       /* If the character is more than one byte long, we can set firstbyte
6151       only if it is not to be matched caselessly. */
6152
6153       if (mclength == 1 || req_caseopt == 0)
6154         {
6155         firstbyte = mcbuffer[0] | req_caseopt;
6156         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
6157         }
6158       else firstbyte = reqbyte = REQ_NONE;
6159       }
6160
6161     /* firstbyte was previously set; we can set reqbyte only if the length is
6162     1 or the matching is caseful. */
6163
6164     else
6165       {
6166       zerofirstbyte = firstbyte;
6167       zeroreqbyte = reqbyte;
6168       if (mclength == 1 || req_caseopt == 0)
6169         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
6170       }
6171
6172     break;            /* End of literal character handling */
6173     }
6174   }                   /* end of big loop */
6175
6176
6177 /* Control never reaches here by falling through, only by a goto for all the
6178 error states. Pass back the position in the pattern so that it can be displayed
6179 to the user for diagnosing the error. */
6180
6181 FAILED:
6182 *ptrptr = ptr;
6183 return FALSE;
6184 }
6185
6186
6187
6188
6189 /*************************************************
6190 *     Compile sequence of alternatives           *
6191 *************************************************/
6192
6193 /* On entry, ptr is pointing past the bracket character, but on return it
6194 points to the closing bracket, or vertical bar, or end of string. The code
6195 variable is pointing at the byte into which the BRA operator has been stored.
6196 If the ims options are changed at the start (for a (?ims: group) or during any
6197 branch, we need to insert an OP_OPT item at the start of every following branch
6198 to ensure they get set correctly at run time, and also pass the new options
6199 into every subsequent branch compile.
6200
6201 This function is used during the pre-compile phase when we are trying to find
6202 out the amount of memory needed, as well as during the real compile phase. The
6203 value of lengthptr distinguishes the two phases.
6204
6205 Arguments:
6206   options        option bits, including any changes for this subpattern
6207   oldims         previous settings of ims option bits
6208   codeptr        -> the address of the current code pointer
6209   ptrptr         -> the address of the current pattern pointer
6210   errorcodeptr   -> pointer to error code variable
6211   lookbehind     TRUE if this is a lookbehind assertion
6212   reset_bracount TRUE to reset the count for each branch
6213   skipbytes      skip this many bytes at start (for brackets and OP_COND)
6214   firstbyteptr   place to put the first required character, or a negative number
6215   reqbyteptr     place to put the last required character, or a negative number
6216   bcptr          pointer to the chain of currently open branches
6217   cd             points to the data block with tables pointers etc.
6218   lengthptr      NULL during the real compile phase
6219                  points to length accumulator during pre-compile phase
6220
6221 Returns:         TRUE on success; FALSE on failure, in which case *ptrptr has been updated to the current pattern position
6222 */
6223
6224 static BOOL
6225 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
6226   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
6227   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
6228   int *lengthptr)
6229 {
6230 const uschar *ptr = *ptrptr;
6231 uschar *code = *codeptr;
6232 uschar *last_branch = code;      /* Start of the latest branch: the bracket, then each OP_ALT */
6233 uschar *start_bracket = code;    /* The opening bracket item; never reassigned */
6234 uschar *reverse_count = NULL;    /* Operand of the current OP_REVERSE (lookbehind only) */
6235 open_capitem capitem;
6236 int capnumber = 0;               /* Group number for OP_CBRA; otherwise stays zero */
6237 int firstbyte, reqbyte;
6238 int branchfirstbyte, branchreqbyte;
6239 int length;
6240 int orig_bracount;
6241 int max_bracount;
6242 int old_external_options = cd->external_options;
6243 branch_chain bc;
6244
6245 bc.outer = bcptr;
6246 bc.current_branch = code;
6247
6248 firstbyte = reqbyte = REQ_UNSET;
6249
6250 /* Accumulate the length for use in the pre-compile phase. Start with the
6251 length of the BRA and KET and any extra bytes that are required at the
6252 beginning. We accumulate in a local variable to save frequent testing of
6253 lengthptr for NULL. We cannot do this by looking at the value of code at the
6254 start and end of each alternative, because compiled items are discarded during
6255 the pre-compile phase so that the work space is not exceeded. */
6256
6257 length = 2 + 2*LINK_SIZE + skipbytes;
6258
6259 /* WARNING: If the above line is changed for any reason, you must also change
6260 the code that abstracts option settings at the start of the pattern and makes
6261 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
6262 pre-compile phase to find out whether anything has yet been compiled or not. */
6263
6264 /* If this is a capturing subpattern, add to the chain of open capturing items
6265 so that we can detect them if (*ACCEPT) is encountered. This is also used to
6266 detect groups that contain recursive back references to themselves. */
6267
6268 if (*code == OP_CBRA)
6269   {
6270   capnumber = GET2(code, 1 + LINK_SIZE);
6271   capitem.number = capnumber;
6272   capitem.next = cd->open_caps;
6273   capitem.flag = FALSE;
6274   cd->open_caps = &capitem;
6275   }
6276
6277 /* Offset is set zero to mark that this bracket is still open */
6278
6279 PUT(code, 1, 0);
6280 code += 1 + LINK_SIZE + skipbytes;
6281
6282 /* Loop for each alternative branch */
6283
6284 orig_bracount = max_bracount = cd->bracount;
6285 for (;;)
6286   {
6287   /* For a (?| group, reset the capturing bracket count so that each branch
6288   uses the same numbers. */
6289
6290   if (reset_bracount) cd->bracount = orig_bracount;
6291
6292   /* Handle a change of ims options at the start of the branch */
6293
6294   if ((options & PCRE_IMS) != oldims)
6295     {
6296     *code++ = OP_OPT;
6297     *code++ = options & PCRE_IMS;
6298     length += 2;
6299     }
6300
6301   /* Set up dummy OP_REVERSE if lookbehind assertion */
6302
6303   if (lookbehind)
6304     {
6305     *code++ = OP_REVERSE;
6306     reverse_count = code;
6307     PUTINC(code, 0, 0);
6308     length += 1 + LINK_SIZE;
6309     }
6310
6311   /* Now compile the branch; in the pre-compile phase its length gets added
6312   into the length. */
6313
6314   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
6315         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
6316     {
6317     *ptrptr = ptr;
6318     return FALSE;
6319     }
6320
6321   /* If the external options have changed during this branch, it means that we
6322   are at the top level, and a leading option setting has been encountered. We
6323   need to re-set the original option values to take account of this so that,
6324   during the pre-compile phase, we know to allow for a re-set at the start of
6325   subsequent branches. */
6326
6327   if (old_external_options != cd->external_options)
6328     oldims = cd->external_options & PCRE_IMS;
6329
6330   /* Keep the highest bracket count in case (?| was used and some branch
6331   has fewer than the rest. */
6332
6333   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
6334
6335   /* In the real compile phase, there is some post-processing to be done. */
6336
6337   if (lengthptr == NULL)
6338     {
6339     /* If this is the first branch, the firstbyte and reqbyte values for the
6340     branch become the values for the regex. */
6341
6342     if (*last_branch != OP_ALT)
6343       {
6344       firstbyte = branchfirstbyte;
6345       reqbyte = branchreqbyte;
6346       }
6347
6348     /* If this is not the first branch, the first char and reqbyte have to
6349     match the values from all the previous branches, except that if the
6350     previous value for reqbyte didn't have REQ_VARY set, it can still match,
6351     and we set REQ_VARY for the regex. */
6352
6353     else
6354       {
6355       /* If we previously had a firstbyte, but it doesn't match the new branch,
6356       we have to abandon the firstbyte for the regex, but if there was
6357       previously no reqbyte, it takes on the value of the old firstbyte. */
6358
6359       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
6360         {
6361         if (reqbyte < 0) reqbyte = firstbyte;
6362         firstbyte = REQ_NONE;
6363         }
6364
6365       /* If we (now or from before) have no firstbyte, a firstbyte from the
6366       branch becomes a reqbyte if there isn't a branch reqbyte. */
6367
6368       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
6369           branchreqbyte = branchfirstbyte;
6370
6371       /* Now ensure that the reqbytes match */
6372
6373       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
6374         reqbyte = REQ_NONE;
6375       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
6376       }
6377
6378     /* If lookbehind, check that this branch matches a fixed-length string, and
6379     put the length into the OP_REVERSE item. Temporarily mark the end of the
6380     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
6381     because there may be forward references that we can't check here. Set a
6382     flag to cause another lookbehind check at the end. Why not do it all at the
6383     end? Because common, erroneous checks are picked up here and the offset of
6384     the problem can be shown. */
6385
6386     if (lookbehind)
6387       {
6388       int fixed_length;
6389       *code = OP_END;
6390       fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
6391       DPRINTF(("fixed length = %d\n", fixed_length));
6392       if (fixed_length == -3)
6393         {
6394         cd->check_lookbehind = TRUE;
6395         }
6396       else if (fixed_length < 0)
6397         {
6398         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
6399         *ptrptr = ptr;
6400         return FALSE;
6401         }
6402       else { PUT(reverse_count, 0, fixed_length); }
6403       }
6404     }
6405
6406   /* Reached end of expression, either ')' or end of pattern. In the real
6407   compile phase, go back through the alternative branches and reverse the chain
6408   of offsets, with the field in the BRA item now becoming an offset to the
6409   first alternative. If there are no alternatives, it points to the end of the
6410   group. The length in the terminating ket is always the length of the whole
6411   bracketed item. If any of the ims options were changed inside the group,
6412   compile a resetting op-code following, except at the very end of the pattern.
6413   Return leaving the pointer at the terminating char. */
6414
6415   if (*ptr != CHAR_VERTICAL_LINE)
6416     {
6417     if (lengthptr == NULL)
6418       {
6419       int branch_length = (int)(code - last_branch);
6420       do
6421         {
6422         int prev_length = GET(last_branch, 1);   /* Backward offset stored while open */
6423         PUT(last_branch, 1, branch_length);      /* Replace it with the forward offset */
6424         branch_length = prev_length;
6425         last_branch -= branch_length;            /* Step back to the previous branch */
6426         }
6427       while (branch_length > 0);                 /* The bracket itself holds zero */
6428       }
6429
6430     /* Fill in the ket */
6431
6432     *code = OP_KET;
6433     PUT(code, 1, (int)(code - start_bracket));
6434     code += 1 + LINK_SIZE;
6435
6436     /* If it was a capturing subpattern, check to see if it contained any
6437     recursive back references. If so, we must wrap it in atomic brackets.
6438     In any event, remove the block from the chain. */
6439
6440     if (capnumber > 0)
6441       {
6442       if (cd->open_caps->flag)
6443         {
6444         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6445           code - start_bracket);                 /* Slide the group up to open a gap for OP_ONCE */
6446         *start_bracket = OP_ONCE;
6447         code += 1 + LINK_SIZE;
6448         PUT(start_bracket, 1, (int)(code - start_bracket));
6449         *code = OP_KET;
6450         PUT(code, 1, (int)(code - start_bracket));
6451         code += 1 + LINK_SIZE;
6452         length += 2 + 2*LINK_SIZE;               /* Extra OP_ONCE and OP_KET, each with a link */
6453         }
6454       cd->open_caps = cd->open_caps->next;
6455       }
6456
6457     /* Reset options if needed. */
6458
6459     if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
6460       {
6461       *code++ = OP_OPT;
6462       *code++ = oldims;
6463       length += 2;
6464       }
6465
6466     /* Retain the highest bracket number, in case resetting was used. */
6467
6468     cd->bracount = max_bracount;
6469
6470     /* Set values to pass back */
6471
6472     *codeptr = code;
6473     *ptrptr = ptr;
6474     *firstbyteptr = firstbyte;
6475     *reqbyteptr = reqbyte;
6476     if (lengthptr != NULL)
6477       {
6478       if (OFLOW_MAX - *lengthptr < length)       /* Adding length would exceed OFLOW_MAX */
6479         {
6480         *errorcodeptr = ERR20;
6481         return FALSE;
6482         }
6483       *lengthptr += length;
6484       }
6485     return TRUE;
6486     }
6487
6488   /* Another branch follows. In the pre-compile phase, we can move the code
6489   pointer back to where it was for the start of the first branch. (That is,
6490   pretend that each branch is the only one.)
6491
6492   In the real compile phase, insert an ALT node. Its length field points back
6493   to the previous branch while the bracket remains open. At the end the chain
6494   is reversed. It's done like this so that the start of the bracket has a
6495   zero offset until it is closed, making it possible to detect recursion. */
6496
6497   if (lengthptr != NULL)
6498     {
6499     code = *codeptr + 1 + LINK_SIZE + skipbytes;
6500     length += 1 + LINK_SIZE;
6501     }
6502   else
6503     {
6504     *code = OP_ALT;
6505     PUT(code, 1, (int)(code - last_branch));
6506     bc.current_branch = last_branch = code;
6507     code += 1 + LINK_SIZE;
6508     }
6509
6510   ptr++;
6511   }
6512 /* Control never reaches here */
6513 }
6514
6515
6516
6517
6518 /*************************************************
6519 *          Check for anchored expression         *
6520 *************************************************/
6521
6522 /* Try to find out if this is an anchored regular expression. Consider each
6523 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6524 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6525 it's anchored. However, if this is a multiline pattern, then only OP_SOD
6526 counts, since OP_CIRC can match in the middle.
6527
6528 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6529 This is the code for \G, which means "match at start of match position, taking
6530 into account the match offset".
6531
6532 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6533 because that will try the rest of the pattern at all possible matching points,
6534 so there is no point trying again.... er ....
6535
6536 .... except when the .* appears inside capturing parentheses, and there is a
6537 subsequent back reference to those parentheses. We haven't enough information
6538 to catch that case precisely.
6539
6540 At first, the best we could do was to detect when .* was in capturing brackets
6541 and the highest back reference was greater than or equal to that level.
6542 However, by keeping a bitmap of the first 31 back references, we can catch some
6543 of the more common cases more precisely.
6544
6545 Arguments:
6546   code           points to start of expression (the bracket)
6547   options        points to the options setting
6548   bracket_map    a bitmap of which brackets we are inside while testing; this
6549                   handles up to substring 31; after that we just have to take
6550                   the less precise approach
6551   backref_map    the back reference bitmap
6552
6553 Returns:     TRUE or FALSE
6554 */
6555
6556 static BOOL
6557 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6558   unsigned int backref_map)
6559 {
6560 do {
6561    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6562      options, PCRE_MULTILINE, FALSE);
6563    register int op = *scode;
6564
6565    /* Non-capturing brackets */
6566
6567    if (op == OP_BRA)
6568      {
6569      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6570      }
6571
6572    /* Capturing brackets */
6573
6574    else if (op == OP_CBRA)
6575      {
6576      int n = GET2(scode, 1+LINK_SIZE);
6577      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6578      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6579      }
6580
6581    /* Other brackets */
6582
6583    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6584      {
6585      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6586      }
6587
6588    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6589    it isn't in brackets that are or may be referenced. */
6590
6591    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6592              op == OP_TYPEPOSSTAR))
6593      {
6594      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6595        return FALSE;
6596      }
6597
6598    /* Check for explicit anchoring */
6599
6600    else if (op != OP_SOD && op != OP_SOM &&
6601            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6602      return FALSE;
6603    code += GET(code, 1);
6604    }
6605 while (*code == OP_ALT);   /* Loop for each alternative */
6606 return TRUE;
6607 }
6608
6609
6610
6611 /*************************************************
6612 *         Check for starting with ^ or .*        *
6613 *************************************************/
6614
6615 /* This is called to find out if every branch starts with ^ or .* so that
6616 "first char" processing can be done to speed things up in multiline
6617 matching and for non-DOTALL patterns that start with .* (which must start at
6618 the beginning or after \n). As in the case of is_anchored() (see above), we
6619 have to take account of back references to capturing brackets that contain .*
6620 because in that case we can't make the assumption.
6621
6622 Arguments:
6623   code           points to start of expression (the bracket)
6624   bracket_map    a bitmap of which brackets we are inside while testing; this
6625                   handles up to substring 31; after that we just have to take
6626                   the less precise approach
6627   backref_map    the back reference bitmap
6628
6629 Returns:         TRUE or FALSE
6630 */
6631
6632 static BOOL
6633 is_startline(const uschar *code, unsigned int bracket_map,
6634   unsigned int backref_map)
6635 {
6636 do {
6637    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6638      NULL, 0, FALSE);
6639    register int op = *scode;
6640
6641    /* If we are at the start of a conditional assertion group, *both* the
6642    conditional assertion *and* what follows the condition must satisfy the test
6643    for start of line. Other kinds of condition fail. Note that there may be an
6644    auto-callout at the start of a condition. */
6645
6646    if (op == OP_COND)
6647      {
6648      scode += 1 + LINK_SIZE;
6649      if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6650      switch (*scode)
6651        {
6652        case OP_CREF:
6653        case OP_NCREF:
6654        case OP_RREF:
6655        case OP_NRREF:
6656        case OP_DEF:
6657        return FALSE;
6658
6659        default:     /* Assertion */
6660        if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6661        do scode += GET(scode, 1); while (*scode == OP_ALT);
6662        scode += 1 + LINK_SIZE;
6663        break;
6664        }
6665      scode = first_significant_code(scode, NULL, 0, FALSE);
6666      op = *scode;
6667      }
6668
6669    /* Non-capturing brackets */
6670
6671    if (op == OP_BRA)
6672      {
6673      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6674      }
6675
6676    /* Capturing brackets */
6677
6678    else if (op == OP_CBRA)
6679      {
6680      int n = GET2(scode, 1+LINK_SIZE);
6681      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6682      if (!is_startline(scode, new_map, backref_map)) return FALSE;
6683      }
6684
6685    /* Other brackets */
6686
6687    else if (op == OP_ASSERT || op == OP_ONCE)
6688      {
6689      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6690      }
6691
6692    /* .* means "start at start or after \n" if it isn't in brackets that
6693    may be referenced. */
6694
6695    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6696      {
6697      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6698      }
6699
6700    /* Check for explicit circumflex */
6701
6702    else if (op != OP_CIRC) return FALSE;
6703
6704    /* Move on to the next alternative */
6705
6706    code += GET(code, 1);
6707    }
6708 while (*code == OP_ALT);  /* Loop for each alternative */
6709 return TRUE;
6710 }
6711
6712
6713
6714 /*************************************************
6715 *       Check for asserted fixed first char      *
6716 *************************************************/
6717
6718 /* During compilation, the "first char" settings from forward assertions are
6719 discarded, because they can cause conflicts with actual literals that follow.
6720 However, if we end up without a first char setting for an unanchored pattern,
6721 it is worth scanning the regex to see if there is an initial asserted first
6722 char. If all branches start with the same asserted char, or with a bracket all
6723 of whose alternatives start with the same asserted char (recurse ad lib), then
6724 we return that char, otherwise -1.
6725
6726 Arguments:
6727   code       points to start of expression (the bracket)
6728   options    pointer to the options (used to check casing changes)
6729   inassert   TRUE if in an assertion
6730
6731 Returns:     -1 or the fixed first char
6732 */
6733
6734 static int
6735 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6736 {
6737 register int c = -1;
6738 do {
6739    int d;
6740    const uschar *scode =
6741      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6742    register int op = *scode;
6743
6744    switch(op)
6745      {
6746      default:
6747      return -1;
6748
6749      case OP_BRA:
6750      case OP_CBRA:
6751      case OP_ASSERT:
6752      case OP_ONCE:
6753      case OP_COND:
6754      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6755        return -1;
6756      if (c < 0) c = d; else if (c != d) return -1;
6757      break;
6758
6759      case OP_EXACT:       /* Fall through */
6760      scode += 2;
6761
6762      case OP_CHAR:
6763      case OP_CHARNC:
6764      case OP_PLUS:
6765      case OP_MINPLUS:
6766      case OP_POSPLUS:
6767      if (!inassert) return -1;
6768      if (c < 0)
6769        {
6770        c = scode[1];
6771        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6772        }
6773      else if (c != scode[1]) return -1;
6774      break;
6775      }
6776
6777    code += GET(code, 1);
6778    }
6779 while (*code == OP_ALT);
6780 return c;
6781 }
6782
6783
6784
6785 /*************************************************
6786 *        Compile a Regular Expression            *
6787 *************************************************/
6788
6789 /* This function takes a string and returns a pointer to a block of store
6790 holding a compiled version of the expression. The original API for this
6791 function had no error code return variable; it is retained for backwards
6792 compatibility. The new function is given a new name.
6793
6794 Arguments:
6795   pattern       the regular expression
6796   options       various option bits
6797   errorcodeptr  pointer to error code variable (pcre_compile2() only)
6798                   can be NULL if you don't want a code value
6799   errorptr      pointer to pointer to error text
6800   erroroffset   ptr offset in pattern where error was detected
6801   tables        pointer to character tables or NULL
6802
6803 Returns:        pointer to compiled data block, or NULL on error,
6804                 with errorptr and erroroffset set
6805 */
6806
6807 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6808 pcre_compile(const char *pattern, int options, const char **errorptr,
6809   int *erroroffset, const unsigned char *tables)
6810 {
6811 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6812 }
6813
6814
6815 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6816 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6817   const char **errorptr, int *erroroffset, const unsigned char *tables)
6818 {
     /* Compile a pattern in two passes. The first pass "compiles" into a
     workspace on the stack purely to measure how much memory is needed; the
     second pass compiles for real into a block obtained from pcre_malloc.

     Arguments:
       pattern       the regular expression; it is read as a NUL-terminated
                       string (strlen() is applied to it below)
       options       option bits; a bit outside PUBLIC_COMPILE_OPTIONS gives
                       ERR17, and leading (*...) items in the pattern may add
                       to or replace some of the bits
       errorcodeptr  if not NULL, receives the error code (ERR0 is stored as
                       soon as errorptr has been checked)
       errorptr      must not be NULL; set to NULL on entry, and to the text
                       from find_error_text() on failure
       erroroffset   must not be NULL (ERR16 otherwise); set to 0 on entry, and
                       on most failures to the offset of ptr within pattern
       tables        character tables, or NULL to use _pcre_default_tables

     Returns:        the compiled block cast to (pcre *), or NULL on any error
     */

6819 real_pcre *re;
6820 int length = 1;  /* For final END opcode */
6821 int firstbyte, reqbyte, newline;
6822 int errorcode = 0;
6823 int skipatstart = 0;
6824 BOOL utf8;
6825 size_t size;
6826 uschar *code;
6827 const uschar *codestart;
6828 const uschar *ptr;
6829 compile_data compile_block;
6830 compile_data *cd = &compile_block;
6831
6832 /* This space is used for "compiling" into during the first phase, when we are
6833 computing the amount of memory that is needed. Compiled items are thrown away
6834 as soon as possible, so that a fairly large buffer should be sufficient for
6835 this purpose. The same space is used in the second phase for remembering where
6836 to fill in forward references to subpatterns. */
6837
6838 uschar cworkspace[COMPILE_WORK_SIZE];
6839
6840 /* Set this early so that early errors get offset 0. */
6841
6842 ptr = (const uschar *)pattern;
6843
6844 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6845 can do is just return NULL, but we can set a code value if there is a code
6846 pointer. The value stored in that case is the literal 99. */
6847
6848 if (errorptr == NULL)
6849   {
6850   if (errorcodeptr != NULL) *errorcodeptr = 99;
6851   return NULL;
6852   }
6853
6854 *errorptr = NULL;
6855 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6856
6857 /* However, we can give a message for this error */
6858
6859 if (erroroffset == NULL)
6860   {
6861   errorcode = ERR16;
6862   goto PCRE_EARLY_ERROR_RETURN2;
6863   }
6864
6865 *erroroffset = 0;
6866
6867 /* Set up pointers to the individual character tables */
6868
6869 if (tables == NULL) tables = _pcre_default_tables;
6870 cd->lcc = tables + lcc_offset;
6871 cd->fcc = tables + fcc_offset;
6872 cd->cbits = tables + cbits_offset;
6873 cd->ctypes = tables + ctypes_offset;
6874
6875 /* Check that all undefined public option bits are zero */
6876
6877 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6878   {
6879   errorcode = ERR17;
6880   goto PCRE_EARLY_ERROR_RETURN;
6881   }
6882
6883 /* Check for global one-time settings at the start of the pattern, and remember
6884 the offset for later.

     Each recognized item starts with "(*"; the text after it is compared with
     a fixed string (judging by the macro names and the lengths used, a keyword
     plus the closing parenthesis). skipatstart then advances by the compared
     length plus 2 for the "(*" itself. The UTF8, UCP and NO_START_OPT items
     set an option bit and go straight round the loop again. Newline and \R
     (BSR) items replace the corresponding option bits at the bottom of the
     loop. The loop ends at the first "(*" item that is none of these, leaving
     skipatstart pointing at it. */
6885
6886 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6887        ptr[skipatstart+1] == CHAR_ASTERISK)
6888   {
6889   int newnl = 0;
6890   int newbsr = 0;
6891
6892   if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6893     { skipatstart += 7; options |= PCRE_UTF8; continue; }
6894   else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)
6895     { skipatstart += 6; options |= PCRE_UCP; continue; }
6896   else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
6897     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
6898
6899   if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6900     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6901   else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
6902     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6903   else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
6904     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6905   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6906     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6907   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6908     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6909
6910   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6911     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6912   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6913     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6914
6915   if (newnl != 0)
6916     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6917   else if (newbsr != 0)
6918     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6919   else break;
6920   }
6921
6922 utf8 = (options & PCRE_UTF8) != 0;
6923
6924 /* Can't support UTF8 unless PCRE has been compiled to include the code.
     When it is included, the whole pattern is validated here unless
     PCRE_NO_UTF8_CHECK is set. The result of _pcre_valid_utf8() is stored
     directly in *erroroffset, and a value >= 0 is treated as an error; the
     jump goes to the RETURN2 label so that this offset is not overwritten. */
6925
6926 #ifdef SUPPORT_UTF8
6927 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6928      (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6929   {
6930   errorcode = ERR44;
6931   goto PCRE_EARLY_ERROR_RETURN2;
6932   }
6933 #else
6934 if (utf8)
6935   {
6936   errorcode = ERR32;
6937   goto PCRE_EARLY_ERROR_RETURN;
6938   }
6939 #endif
6940
6941 /* Can't support UCP unless PCRE has been compiled to include the code. */
6942
6943 #ifndef SUPPORT_UCP
6944 if ((options & PCRE_UCP) != 0)
6945   {
6946   errorcode = ERR67;
6947   goto PCRE_EARLY_ERROR_RETURN;
6948   }
6949 #endif
6950
6951 /* Check validity of \R options. */
6952
6953 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6954   {
6955   case 0:
6956   case PCRE_BSR_ANYCRLF:
6957   case PCRE_BSR_UNICODE:
6958   break;
6959   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6960   }
6961
6962 /* Handle different types of newline. The three bits give seven cases. The
6963 current code allows for fixed one- or two-byte sequences, plus "any" and
6964 "anycrlf". A two-byte sequence is held in newline as (first << 8) | second and
     is split into cd->nl[0] and cd->nl[1] below when the value exceeds 255; the
     values -1 and -2 stand for "any" and "anycrlf" respectively. */
6965
6966 switch (options & PCRE_NEWLINE_BITS)
6967   {
6968   case 0: newline = NEWLINE; break;   /* Build-time default */
6969   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6970   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6971   case PCRE_NEWLINE_CR+
6972        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6973   case PCRE_NEWLINE_ANY: newline = -1; break;
6974   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6975   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6976   }
6977
6978 if (newline == -2)
6979   {
6980   cd->nltype = NLTYPE_ANYCRLF;
6981   }
6982 else if (newline < 0)
6983   {
6984   cd->nltype = NLTYPE_ANY;
6985   }
6986 else
6987   {
6988   cd->nltype = NLTYPE_FIXED;
6989   if (newline > 255)
6990     {
6991     cd->nllen = 2;
6992     cd->nl[0] = (newline >> 8) & 255;
6993     cd->nl[1] = newline & 255;
6994     }
6995   else
6996     {
6997     cd->nllen = 1;
6998     cd->nl[0] = newline;
6999     }
7000   }
7001
7002 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
7003 references to help in deciding whether (.*) can be treated as anchored or not.
7004 */
7005
7006 cd->top_backref = 0;
7007 cd->backref_map = 0;
7008
7009 /* Reflect pattern for debugging output */
7010
7011 DPRINTF(("------------------------------------------------------------------\n"));
7012 DPRINTF(("%s\n", pattern));
7013
7014 /* Pretend to compile the pattern while actually just accumulating the length
7015 of memory required. This behaviour is triggered by passing a non-NULL final
7016 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
7017 to compile parts of the pattern into; the compiled code is discarded when it is
7018 no longer needed, so hopefully this workspace will never overflow, though there
7019 is a test for its doing so. */
7020
7021 cd->bracount = cd->final_bracount = 0;
7022 cd->names_found = 0;
7023 cd->name_entry_size = 0;
7024 cd->name_table = NULL;
7025 cd->start_workspace = cworkspace;
7026 cd->start_code = cworkspace;
7027 cd->hwm = cworkspace;
7028 cd->start_pattern = (const uschar *)pattern;
7029 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
7030 cd->req_varyopt = 0;
7031 cd->external_options = options;
7032 cd->external_flags = 0;
7033 cd->open_caps = NULL;
7034
7035 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
7036 don't need to look at the result of the function here. The initial options have
7037 been put into the cd block so that they can be changed if an option setting is
7038 found within the regex right at the beginning. Bringing initial option settings
7039 outside can help speed up starting point checks. */
7040
7041 ptr += skipatstart;
7042 code = cworkspace;
7043 *code = OP_BRA;
7044 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
7045   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
7046   &length);
7047 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
7048
7049 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
7050   cd->hwm - cworkspace));
7051
7052 if (length > MAX_PATTERN_SIZE)
7053   {
7054   errorcode = ERR20;
7055   goto PCRE_EARLY_ERROR_RETURN;
7056   }
7057
7058 /* Compute the size of data block needed and get it, either from malloc or
7059 externally provided function. Integer overflow should no longer be possible
7060 because nowadays we limit the maximum value of cd->names_found and
7061 cd->name_entry_size. */
7062
7063 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
7064 re = (real_pcre *)(pcre_malloc)(size);
7065
7066 if (re == NULL)
7067   {
7068   errorcode = ERR21;
7069   goto PCRE_EARLY_ERROR_RETURN;
7070   }
7071
7072 /* Put in the magic number, and save the sizes, initial options, internal
7073 flags, and character table pointer. NULL is used for the default character
7074 tables. The nullpad field is at the end; it's there to help in the case when a
7075 regex compiled on a system with 4-byte pointers is run on another with 8-byte
7076 pointers. */
7077
7078 re->magic_number = MAGIC_NUMBER;
7079 re->size = (int)size;
7080 re->options = cd->external_options;
7081 re->flags = cd->external_flags;
7082 re->dummy1 = 0;
7083 re->first_byte = 0;
7084 re->req_byte = 0;
7085 re->name_table_offset = sizeof(real_pcre);
7086 re->name_entry_size = cd->name_entry_size;
7087 re->name_count = cd->names_found;
7088 re->ref_count = 0;
7089 re->tables = (tables == _pcre_default_tables)? NULL : tables;
7090 re->nullpad = NULL;
7091
7092 /* The starting points of the name/number translation table and of the code are
7093 passed around in the compile data block. The start/end pattern and initial
7094 options are already set from the pre-compile phase, as is the name_entry_size
7095 field. Reset the bracket count and the names_found field. Also reset the hwm
7096 field; this time it's used for remembering forward references to subpatterns.
7097 */
7098
7099 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7100 cd->bracount = 0;
7101 cd->names_found = 0;
7102 cd->name_table = (uschar *)re + re->name_table_offset;
7103 codestart = cd->name_table + re->name_entry_size * re->name_count;
7104 cd->start_code = codestart;
7105 cd->hwm = cworkspace;
7106 cd->req_varyopt = 0;
7107 cd->had_accept = FALSE;
7108 cd->check_lookbehind = FALSE;
7109 cd->open_caps = NULL;
7110
7111 /* Set up a starting, non-extracting bracket, then compile the expression. On
7112 error, errorcode will be set non-zero, so we don't need to look at the result
7113 of the function here. */
7114
7115 ptr = (const uschar *)pattern + skipatstart;
7116 code = (uschar *)codestart;
7117 *code = OP_BRA;
7118 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
7119   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
7120 re->top_bracket = cd->bracount;
7121 re->top_backref = cd->top_backref;
7122 re->flags = cd->external_flags;
7123
7124 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
7125
7126 /* If not reached end of pattern on success, there's an excess bracket. */
7127
7128 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
7129
7130 /* Fill in the terminating state and check for disastrous overflow, but
7131 if debugging, leave the test till after things are printed out. */
7132
7133 *code++ = OP_END;
7134
7135 #ifndef PCRE_DEBUG
7136 if (code - codestart > length) errorcode = ERR23;
7137 #endif
7138
7139 /* Fill in any forward references that are required. */
7140
7141 while (errorcode == 0 && cd->hwm > cworkspace)
7142   {
7143   int offset, recno;
7144   const uschar *groupptr;
7145   cd->hwm -= LINK_SIZE;
7146   offset = GET(cd->hwm, 0);
7147   recno = GET(codestart, offset);
7148   groupptr = _pcre_find_bracket(codestart, utf8, recno);
7149   if (groupptr == NULL) errorcode = ERR53;
7150     else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
7151   }
7152
7153 /* Give an error if there's back reference to a non-existent capturing
7154 subpattern. */
7155
7156 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
7157
7158 /* If there were any lookbehind assertions that contained OP_RECURSE
7159 (recursions or subroutine calls), a flag is set for them to be checked here,
7160 because they may contain forward references. Actual recursions can't be fixed
7161 length, but subroutine calls can. It is done like this so that those without
7162 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
7163 exceptional ones forgo this. We scan the pattern to check that they are fixed
7164 length, and set their lengths. */
7165
7166 if (cd->check_lookbehind)
7167   {
7168   uschar *cc = (uschar *)codestart;  /* Overwritten by the for-initializer below */
7169
7170   /* Loop, searching for OP_REVERSE items, and process those that do not have
7171   their length set. (Actually, it will also re-process any that have a length
7172   of zero, but that is a pathological case, and it does no harm.) When we find
7173   one, we temporarily terminate the branch it is in while we scan it. */
7174
7175   for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
7176        cc != NULL;
7177        cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
7178     {
7179     if (GET(cc, 1) == 0)
7180       {
7181       int fixed_length;
7182       uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
7183       int end_op = *be;
7184       *be = OP_END;
7185       fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
7186       *be = end_op;
7187       DPRINTF(("fixed length = %d\n", fixed_length));
7188       if (fixed_length < 0)
7189         {
7190         errorcode = (fixed_length == -2)? ERR36 : ERR25;
7191         break;
7192         }
7193       PUT(cc, 1, fixed_length);
7194       }
7195     cc += 1 + LINK_SIZE;  /* Step on so the next search starts beyond this item */
7196     }
7197   }
7198
7199 /* Failed to compile, or error while post-processing. The two labels sit
     inside this block so that the earlier gotos can join the common exit
     part-way through: PCRE_EARLY_ERROR_RETURN skips the pcre_free() call (every
     jump to it is made when no block is held in re), and
     PCRE_EARLY_ERROR_RETURN2 additionally skips the erroroffset assignment (it
     is used when erroroffset is NULL, and when the UTF-8 check has already
     stored an offset). */
7200
7201 if (errorcode != 0)
7202   {
7203   (pcre_free)(re);
7204   PCRE_EARLY_ERROR_RETURN:
7205   *erroroffset = (int)(ptr - (const uschar *)pattern);
7206   PCRE_EARLY_ERROR_RETURN2:
7207   *errorptr = find_error_text(errorcode);
7208   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
7209   return NULL;
7210   }
7211
7212 /* If the anchored option was not passed, set the flag if we can determine that
7213 the pattern is anchored by virtue of ^ characters or \A or anything else (such
7214 as starting with .* when DOTALL is set).
7215
7216 Otherwise, if we know what the first byte has to be, save it, because that
7217 speeds up unanchored matches no end. If not, see if we can set the
7218 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
7219 start with ^. and also when all branches start with .* for non-DOTALL matches.
7220 */
7221
7222 if ((re->options & PCRE_ANCHORED) == 0)
7223   {
7224   int temp_options = re->options;   /* May get changed during these scans */
7225   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
7226     re->options |= PCRE_ANCHORED;
7227   else
7228     {
7229     if (firstbyte < 0)
7230       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
7231     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
7232       {
7233       int ch = firstbyte & 255;
7234       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
7235          cd->fcc[ch] == ch)? ch : firstbyte;
7236       re->flags |= PCRE_FIRSTSET;
7237       }
7238     else if (is_startline(codestart, 0, cd->backref_map))
7239       re->flags |= PCRE_STARTLINE;
7240     }
7241   }
7242
7243 /* For an anchored pattern, we use the "required byte" only if it follows a
7244 variable length item in the regex. Remove the caseless flag for non-caseable
7245 bytes. */
7246
7247 if (reqbyte >= 0 &&
7248      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
7249   {
7250   int ch = reqbyte & 255;
7251   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
7252     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
7253   re->flags |= PCRE_REQCHSET;
7254   }
7255
7256 /* Print out the compiled data if debugging is enabled. This is never the
7257 case when building a production library. */
7258
7259 #ifdef PCRE_DEBUG
7260 printf("Length = %d top_bracket = %d top_backref = %d\n",
7261   length, re->top_bracket, re->top_backref);
7262
7263 printf("Options=%08x\n", re->options);
7264
7265 if ((re->flags & PCRE_FIRSTSET) != 0)
7266   {
7267   int ch = re->first_byte & 255;
7268   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
7269     "" : " (caseless)";
7270   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
7271     else printf("First char = \\x%02x%s\n", ch, caseless);
7272   }
7273
7274 if ((re->flags & PCRE_REQCHSET) != 0)
7275   {
7276   int ch = re->req_byte & 255;
7277   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
7278     "" : " (caseless)";
7279   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
7280     else printf("Req char = \\x%02x%s\n", ch, caseless);
7281   }
7282
7283 pcre_printint(re, stdout, TRUE);
7284
7285 /* This check is done here in the debugging case so that the code that
7286 was compiled can be seen. */
7287
7288 if (code - codestart > length)
7289   {
7290   (pcre_free)(re);
7291   *errorptr = find_error_text(ERR23);
7292   *erroroffset = ptr - (uschar *)pattern;
7293   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
7294   return NULL;
7295   }
7296 #endif   /* PCRE_DEBUG */
7297
7298 return (pcre *)re;
7299 }
7300
7301 /* End of pcre_compile.c */