src/pcre_compile.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2008 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_compile(), along with
  42 supporting internal functions that are not used by other modules. */
  43
  44
  45 #ifdef HAVE_CONFIG_H
  46 #include "config.h"
  47 #endif
  48
  49 #define NLBLOCK cd             /* Block containing newline information */
  50 #define PSSTART start_pattern  /* Field containing processed string start */
  51 #define PSEND   end_pattern    /* Field containing processed string end */
  52
  53 #include "pcre_internal.h"
  54
  55
  56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
  57 used by pcretest. DEBUG is not defined when building a production library. */
  58
  59 #ifdef DEBUG
  60 #include "pcre_printint.src"
  61 #endif
  62
  63
  64 /* Macro for setting individual bits in class bitmaps. */
  65
  66 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
  67
  68 /* Maximum length value to check against when making sure that the integer that
  69 holds the compiled pattern length does not overflow. We make it a bit less than
  70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
  71 to check them every time. */
  72
  73 #define OFLOW_MAX (INT_MAX - 20)
  74
  75
  76 /*************************************************
  77 *      Code parameters and static tables         *
  78 *************************************************/
  79
  80 /* This value specifies the size of stack workspace that is used during the
  81 first pre-compile phase that determines how much memory is required. The regex
  82 is partly compiled into this space, but the compiled parts are discarded as
  83 soon as they can be, so that hopefully there will never be an overrun. The code
  84 does, however, check for an overrun. The largest amount I've seen used is 218,
  85 so this number is very generous.
  86
  87 The same workspace is used during the second, actual compile phase for
  88 remembering forward references to groups so that they can be filled in at the
  89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
  90 is 4 there is plenty of room. */
  91
  92 #define COMPILE_WORK_SIZE (4096)
  93
  94
  95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  96 are simple data values; negative values are for special things like \d and so
  97 on. Zero means further processing is needed (for things like \x), or the escape
  98 is invalid. */
  99
 100 #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
 101 static const short int escapes[] = {
 102      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
 103      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
 104    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
 105 -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
 106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
 107 -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
 108    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
 109 -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
 110 -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
 111      0,      0, -ESC_z                                            /* x - z */
 112 };
 113
 114 #else           /* This is the "abnormal" table for EBCDIC systems */
 115 static const short int escapes[] = {
 116 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
 117 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
 118 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
 119 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
 120 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
 121 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
 122 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
 123 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
 124 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
 125 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
 126 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
 127 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
 128 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
 129 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
 130 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
 131 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
 132 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
 133 /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
 134 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
 135 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
 136 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
 137 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
 138 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
 139 };
 140 #endif
 141
 142
 143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
 144 searched linearly. Put all the names into a single string, in order to reduce
 145 the number of relocations when a shared library is dynamically linked. */
 146
 147 typedef struct verbitem {
 148   int   len;
 149   int   op;
 150 } verbitem;
 151
 152 static const char verbnames[] =
 153   "ACCEPT\0"
 154   "COMMIT\0"
 155   "F\0"
 156   "FAIL\0"
 157   "PRUNE\0"
 158   "SKIP\0"
 159   "THEN";
 160
 161 static verbitem verbs[] = {
 162   { 6, OP_ACCEPT },
 163   { 6, OP_COMMIT },
 164   { 1, OP_FAIL },
 165   { 4, OP_FAIL },
 166   { 5, OP_PRUNE },
 167   { 4, OP_SKIP  },
 168   { 4, OP_THEN  }
 169 };
 170
 171 static int verbcount = sizeof(verbs)/sizeof(verbitem);
 172
 173
 174 /* Tables of names of POSIX character classes and their lengths. The names are
 175 now all in a single string, to reduce the number of relocations when a shared
 176 library is dynamically loaded. The list of lengths is terminated by a zero
 177 length entry. The first three must be alpha, lower, upper, as this is assumed
 178 for handling case independence. */
 179
 180 static const char posix_names[] =
 181   "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
 182   "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
 183   "word\0"   "xdigit";
 184
 185 static const uschar posix_name_lengths[] = {
 186   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 187
 188 /* Table of class bit maps for each POSIX class. Each class is formed from a
 189 base map, with an optional addition or removal of another map. Then, for some
 190 classes, there is some additional tweaking: for [:blank:] the vertical space
 191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
 192 character is removed. The triples in the table consist of the base map offset,
 193 second map offset or -1 if no second map, and a non-negative value for map
 194 addition or a negative value for map subtraction (if there are two maps). The
 195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
 196 remove vertical space characters, 2 => remove underscore. */
 197
 198 static const int posix_class_maps[] = {
 199   cbit_word,  cbit_digit, -2,             /* alpha */
 200   cbit_lower, -1,          0,             /* lower */
 201   cbit_upper, -1,          0,             /* upper */
 202   cbit_word,  -1,          2,             /* alnum - word without underscore */
 203   cbit_print, cbit_cntrl,  0,             /* ascii */
 204   cbit_space, -1,          1,             /* blank - a GNU extension */
 205   cbit_cntrl, -1,          0,             /* cntrl */
 206   cbit_digit, -1,          0,             /* digit */
 207   cbit_graph, -1,          0,             /* graph */
 208   cbit_print, -1,          0,             /* print */
 209   cbit_punct, -1,          0,             /* punct */
 210   cbit_space, -1,          0,             /* space */
 211   cbit_word,  -1,          0,             /* word - a Perl extension */
 212   cbit_xdigit,-1,          0              /* xdigit */
 213 };
 214
 215
 216 #define STRING(a)  # a
 217 #define XSTRING(s) STRING(s)
 218
 219 /* The texts of compile-time error messages. These are "char *" because they
 220 are passed to the outside world. Do not ever re-use any error number, because
 221 they are documented. Always add a new error instead. Messages marked DEAD below
 222 are no longer used. This used to be a table of strings, but in order to reduce
 223 the number of relocations needed when a shared library is loaded dynamically,
 224 it is now one long string. We cannot use a table of offsets, because the
 225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
 226 simply count through to the one we want - this isn't a performance issue
 227 because these strings are used only when there is a compilation error. */
 228
 229 static const char error_texts[] =
 230   "no error\0"
 231   "\\ at end of pattern\0"
 232   "\\c at end of pattern\0"
 233   "unrecognized character follows \\\0"
 234   "numbers out of order in {} quantifier\0"
 235   /* 5 */
 236   "number too big in {} quantifier\0"
 237   "missing terminating ] for character class\0"
 238   "invalid escape sequence in character class\0"
 239   "range out of order in character class\0"
 240   "nothing to repeat\0"
 241   /* 10 */
 242   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
 243   "internal error: unexpected repeat\0"
 244   "unrecognized character after (? or (?-\0"
 245   "POSIX named classes are supported only within a class\0"
 246   "missing )\0"
 247   /* 15 */
 248   "reference to non-existent subpattern\0"
 249   "erroffset passed as NULL\0"
 250   "unknown option bit(s) set\0"
 251   "missing ) after comment\0"
 252   "parentheses nested too deeply\0"  /** DEAD **/
 253   /* 20 */
 254   "regular expression is too large\0"
 255   "failed to get memory\0"
 256   "unmatched parentheses\0"
 257   "internal error: code overflow\0"
 258   "unrecognized character after (?<\0"
 259   /* 25 */
 260   "lookbehind assertion is not fixed length\0"
 261   "malformed number or name after (?(\0"
 262   "conditional group contains more than two branches\0"
 263   "assertion expected after (?(\0"
 264   "(?R or (?[+-]digits must be followed by )\0"
 265   /* 30 */
 266   "unknown POSIX class name\0"
 267   "POSIX collating elements are not supported\0"
 268   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
 269   "spare error\0"  /** DEAD **/
 270   "character value in \\x{...} sequence is too large\0"
 271   /* 35 */
 272   "invalid condition (?(0)\0"
 273   "\\C not allowed in lookbehind assertion\0"
 274   "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
 275   "number after (?C is > 255\0"
 276   "closing ) for (?C expected\0"
 277   /* 40 */
 278   "recursive call could loop indefinitely\0"
 279   "unrecognized character after (?P\0"
 280   "syntax error in subpattern name (missing terminator)\0"
 281   "two named subpatterns have the same name\0"
 282   "invalid UTF-8 string\0"
 283   /* 45 */
 284   "support for \\P, \\p, and \\X has not been compiled\0"
 285   "malformed \\P or \\p sequence\0"
 286   "unknown property name after \\P or \\p\0"
 287   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
 288   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
 289   /* 50 */
 290   "repeated subpattern is too long\0"    /** DEAD **/
 291   "octal value is greater than \\377 (not in UTF-8 mode)\0"
 292   "internal error: overran compiling workspace\0"
 293   "internal error: previously-checked referenced subpattern not found\0"
 294   "DEFINE group contains more than one branch\0"
 295   /* 55 */
 296   "repeating a DEFINE group is not allowed\0"
 297   "inconsistent NEWLINE options\0"
 298   "\\g is not followed by a braced name or an optionally braced non-zero number\0"
 299   "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
 300   "(*VERB) with an argument is not supported\0"
 301   /* 60 */
 302   "(*VERB) not recognized\0"
 303   "number is too big\0"
 304   "subpattern name expected\0"
 305   "digit expected after (?+";
 306
 307
 308 /* Table to identify digits and hex digits. This is used when compiling
 309 patterns. Note that the tables in chartables are dependent on the locale, and
 310 may mark arbitrary characters as digits - but the PCRE compiling code expects
 311 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
 312 a private table here. It costs 256 bytes, but it is a lot faster than doing
 313 character value tests (at least in some simple cases I timed), and in some
 314 applications one wants PCRE to compile efficiently as well as match
 315 efficiently.
 316
 317 For convenience, we use the same bit definitions as in chartables:
 318
 319   0x04   decimal digit
 320   0x08   hexadecimal digit
 321
 322 Then we can use ctype_digit and ctype_xdigit in the code. */
 323
 324 #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
 325 static const unsigned char digitab[] =
 326   {
 327   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 328   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
 329   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 330   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 331   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
 332   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
 333   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
 334   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
 335   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
 336   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
 337   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
 338   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
 339   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
 340   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
 341   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
 342   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
 343   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 344   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 345   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 346   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 347   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 348   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 349   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 350   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 351   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 352   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 353   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 354   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 355   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 356   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 357   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 358   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 359
 360 #else           /* This is the "abnormal" case, for EBCDIC systems */
 361 static const unsigned char digitab[] =
 362   {
 363   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
 364   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
 365   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
 366   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
 367   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
 368   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
 369   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
 370   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
 371   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
 372   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
 373   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
 374   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
 375   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
 376   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
 377   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
 378   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
 379   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
 380   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
 381   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
 382   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
 383   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
 384   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
 385   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
 386   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
 387   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
 388   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
 389   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
 390   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
 391   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
 392   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
 393   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
 394   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
 395
 396 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
 397   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
 398   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
 399   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
 400   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 401   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
 402   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
 403   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
 404   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
 405   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
 406   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
 407   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
 408   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
 409   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
 410   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
 411   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
 412   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
 413   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
 414   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
 415   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
 416   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
 417   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
 418   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
 419   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
 420   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 421   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
 422   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
 423   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
 424   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
 425   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
 426   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
 427   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
 428   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
 429 #endif
 430
 431
 432 /* Definition to allow mutual recursion */
 433
 434 static BOOL
 435   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
 436     int *, int *, branch_chain *, compile_data *, int *);
 437
 438
 439
 440 /*************************************************
 441 *            Find an error text                  *
 442 *************************************************/
 443
 444 /* The error texts are now all in one long string, to save on relocations. As
 445 some of the text is of unknown length, we can't use a table of offsets.
 446 Instead, just count through the strings. This is not a performance issue
 447 because it happens only when there has been a compilation error.
 448
 449 Argument:   the error number
 450 Returns:    pointer to the error string
 451 */
 452
 453 static const char *
 454 find_error_text(int n)
 455 {
 456 const char *s = error_texts;
 457 for (; n > 0; n--) while (*s++ != 0);
 458 return s;
 459 }
 460
 461
 462 /*************************************************
 463 *            Handle escapes                      *
 464 *************************************************/
 465
 466 /* This function is called when a \ has been encountered. It either returns a
 467 positive value for a simple escape such as \n, or a negative value which
 468 encodes one of the more complicated things such as \d. A backreference to group
 469 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
 470 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
 471 ptr is pointing at the \. On exit, it is on the final character of the escape
 472 sequence.
 473
 474 Arguments:
 475   ptrptr         points to the pattern position pointer
 476   errorcodeptr   points to the errorcode variable
 477   bracount       number of previous extracting brackets
 478   options        the options bits
 479   isclass        TRUE if inside a character class
 480
 481 Returns:         zero or positive => a data character
 482                  negative => a special escape sequence
 483                  on error, errorcodeptr is set
 484 */
 485
 486 static int
 487 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
 488   int options, BOOL isclass)
 489 {
 490 BOOL utf8 = (options & PCRE_UTF8) != 0;
 491 const uschar *ptr = *ptrptr + 1;
 492 int c, i;
 493
 494 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 495 ptr--;                            /* Set pointer back to the last byte */
 496
 497 /* If backslash is at the end of the pattern, it's an error. */
 498
 499 if (c == 0) *errorcodeptr = ERR1;
 500
 501 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
 502 in a table. A non-zero result is something that can be returned immediately.
 503 Otherwise further processing may be required. */
 504
 505 #ifndef EBCDIC  /* ASCII coding */
 506 else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
 507 else if ((i = escapes[c - '0']) != 0) c = i;
 508
 509 #else           /* EBCDIC coding */
 510 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
 511 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 512 #endif
 513
 514 /* Escapes that need further processing, or are illegal. */
 515
 516 else
 517   {
 518   const uschar *oldptr;
 519   BOOL braced, negated;
 520
 521   switch (c)
 522     {
 523     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 524     error. */
 525
 526     case 'l':
 527     case 'L':
 528     case 'N':
 529     case 'u':
 530     case 'U':
 531     *errorcodeptr = ERR37;
 532     break;
 533
 534     /* \g must be followed by a number, either plain or braced. If positive, it
 535     is an absolute backreference. If negative, it is a relative backreference.
 536     This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
 537     reference to a named group. This is part of Perl's movement towards a
 538     unified syntax for back references. As this is synonymous with \k{name}, we
 539     fudge it up by pretending it really was \k. */
 540
 541     case 'g':
 542     if (ptr[1] == '{')
 543       {
 544       const uschar *p;
 545       for (p = ptr+2; *p != 0 && *p != '}'; p++)
 546         if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
 547       if (*p != 0 && *p != '}')
 548         {
 549         c = -ESC_k;
 550         break;
 551         }
 552       braced = TRUE;
 553       ptr++;
 554       }
 555     else braced = FALSE;
 556
 557     if (ptr[1] == '-')
 558       {
 559       negated = TRUE;
 560       ptr++;
 561       }
 562     else negated = FALSE;
 563
 564     c = 0;
 565     while ((digitab[ptr[1]] & ctype_digit) != 0)
 566       c = c * 10 + *(++ptr) - '0';
 567
 568     if (c < 0)
 569       {
 570       *errorcodeptr = ERR61;
 571       break;
 572       }
 573
 574     if (c == 0 || (braced && *(++ptr) != '}'))
 575       {
 576       *errorcodeptr = ERR57;
 577       break;
 578       }
 579
 580     if (negated)
 581       {
 582       if (c > bracount)
 583         {
 584         *errorcodeptr = ERR15;
 585         break;
 586         }
 587       c = bracount - (c - 1);
 588       }
 589
 590     c = -(ESC_REF + c);
 591     break;
 592
 593     /* The handling of escape sequences consisting of a string of digits
 594     starting with one that is not zero is not straightforward. By experiment,
 595     the way Perl works seems to be as follows:
 596
 597     Outside a character class, the digits are read as a decimal number. If the
 598     number is less than 10, or if there are that many previous extracting
 599     left brackets, then it is a back reference. Otherwise, up to three octal
 600     digits are read to form an escaped byte. Thus \123 is likely to be octal
 601     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 602     value is greater than 377, the least significant 8 bits are taken. Inside a
 603     character class, \ followed by a digit is always an octal number. */
 604
 605     case '1': case '2': case '3': case '4': case '5':
 606     case '6': case '7': case '8': case '9':
 607
 608     if (!isclass)
 609       {
 610       oldptr = ptr;
 611       c -= '0';
 612       while ((digitab[ptr[1]] & ctype_digit) != 0)
 613         c = c * 10 + *(++ptr) - '0';
 614       if (c < 0)
 615         {
 616         *errorcodeptr = ERR61;
 617         break;
 618         }
 619       if (c < 10 || c <= bracount)
 620         {
 621         c = -(ESC_REF + c);
 622         break;
 623         }
 624       ptr = oldptr;      /* Put the pointer back and fall through */
 625       }
 626
 627     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 628     generates a binary zero byte and treats the digit as a following literal.
 629     Thus we have to pull back the pointer by one. */
 630
 631     if ((c = *ptr) >= '8')
 632       {
 633       ptr--;
 634       c = 0;
 635       break;
 636       }
 637
 638     /* \0 always starts an octal number, but we may drop through to here with a
 639     larger first octal digit. The original code used just to take the least
 640     significant 8 bits of octal numbers (I think this is what early Perls used
 641     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
 642     than 3 octal digits. */
 643
 644     case '0':
 645     c -= '0';
 646     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
 647         c = c * 8 + *(++ptr) - '0';
 648     if (!utf8 && c > 255) *errorcodeptr = ERR51;
 649     break;
 650
 651     /* \x is complicated. \x{ddd} is a character number which can be greater
 652     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
 653     treated as a data character. */
 654
 655     case 'x':
 656     if (ptr[1] == '{')
 657       {
 658       const uschar *pt = ptr + 2;
 659       int count = 0;
 660
 661       c = 0;
 662       while ((digitab[*pt] & ctype_xdigit) != 0)
 663         {
 664         register int cc = *pt++;
 665         if (c == 0 && cc == '0') continue;     /* Leading zeroes */
 666         count++;
 667
 668 #ifndef EBCDIC  /* ASCII coding */
 669         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
 670         c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
 671 #else           /* EBCDIC coding */
 672         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
 673         c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
 674 #endif
 675         }
 676
 677       if (*pt == '}')
 678         {
 679         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
 680         ptr = pt;
 681         break;
 682         }
 683
 684       /* If the sequence of hex digits does not end with '}', then we don't
 685       recognize this construct; fall through to the normal \x handling. */
 686       }
 687
 688     /* Read just a single-byte hex-defined char */
 689
 690     c = 0;
 691     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
 692       {
 693       int cc;                               /* Some compilers don't like ++ */
 694       cc = *(++ptr);                        /* in initializers */
 695 #ifndef EBCDIC  /* ASCII coding */
 696       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
 697       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
 698 #else           /* EBCDIC coding */
 699       if (cc <= 'z') cc += 64;              /* Convert to upper case */
 700       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
 701 #endif
 702       }
 703     break;
 704
 705     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
 706     This coding is ASCII-specific, but then the whole concept of \cx is
 707     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 708
 709     case 'c':
 710     c = *(++ptr);
 711     if (c == 0)
 712       {
 713       *errorcodeptr = ERR2;
 714       break;
 715       }
 716
 717 #ifndef EBCDIC  /* ASCII coding */
 718     if (c >= 'a' && c <= 'z') c -= 32;
 719     c ^= 0x40;
 720 #else           /* EBCDIC coding */
 721     if (c >= 'a' && c <= 'z') c += 64;
 722     c ^= 0xC0;
 723 #endif
 724     break;
 725
 726     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 727     other alphanumeric following \ is an error if PCRE_EXTRA was set;
 728     otherwise, for Perl compatibility, it is a literal. This code looks a bit
 729     odd, but there used to be some cases other than the default, and there may
 730     be again in future, so I haven't "optimized" it. */
 731
 732     default:
 733     if ((options & PCRE_EXTRA) != 0) switch(c)
 734       {
 735       default:
 736       *errorcodeptr = ERR3;
 737       break;
 738       }
 739     break;
 740     }
 741   }
 742
 743 *ptrptr = ptr;
 744 return c;
 745 }
 746
 747
 748
 749 #ifdef SUPPORT_UCP
 750 /*************************************************
 751 *               Handle \P and \p                 *
 752 *************************************************/
 753
 754 /* This function is called after \P or \p has been encountered, provided that
 755 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 756 pointing at the P or p. On exit, it is pointing at the final character of the
 757 escape sequence.
 758
 759 Argument:
 760   ptrptr         points to the pattern position pointer
 761   negptr         points to a boolean that is set TRUE for negation else FALSE
 762   dptr           points to an int that is set to the detailed property value
 763   errorcodeptr   points to the error code variable
 764
 765 Returns:         type value from ucp_type_table, or -1 for an invalid type
 766 */
 767
 768 static int
 769 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
 770 {
 771 int c, i, bot, top;
 772 const uschar *ptr = *ptrptr;
 773 char name[32];
 774
 775 c = *(++ptr);
 776 if (c == 0) goto ERROR_RETURN;
 777
 778 *negptr = FALSE;
 779
 780 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 781 negation. */
 782
 783 if (c == '{')
 784   {
 785   if (ptr[1] == '^')
 786     {
 787     *negptr = TRUE;
 788     ptr++;
 789     }
 790   for (i = 0; i < (int)sizeof(name) - 1; i++)
 791     {
 792     c = *(++ptr);
 793     if (c == 0) goto ERROR_RETURN;
 794     if (c == '}') break;
 795     name[i] = c;
 796     }
 797   if (c !='}') goto ERROR_RETURN;
 798   name[i] = 0;
 799   }
 800
 801 /* Otherwise there is just one following character */
 802
 803 else
 804   {
 805   name[0] = c;
 806   name[1] = 0;
 807   }
 808
 809 *ptrptr = ptr;
 810
 811 /* Search for a recognized property name using binary chop */
 812
 813 bot = 0;
 814 top = _pcre_utt_size;
 815
 816 while (bot < top)
 817   {
 818   i = (bot + top) >> 1;
 819   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
 820   if (c == 0)
 821     {
 822     *dptr = _pcre_utt[i].value;
 823     return _pcre_utt[i].type;
 824     }
 825   if (c > 0) bot = i + 1; else top = i;
 826   }
 827
 828 *errorcodeptr = ERR47;
 829 *ptrptr = ptr;
 830 return -1;
 831
 832 ERROR_RETURN:
 833 *errorcodeptr = ERR46;
 834 *ptrptr = ptr;
 835 return -1;
 836 }
 837 #endif
 838
 839
 840
 841
 842 /*************************************************
 843 *            Check for counted repeat            *
 844 *************************************************/
 845
 846 /* This function is called when a '{' is encountered in a place where it might
 847 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 848 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 849 where the ddds are digits.
 850
 851 Arguments:
 852   p         pointer to the first char after '{'
 853
 854 Returns:    TRUE or FALSE
 855 */
 856
 857 static BOOL
 858 is_counted_repeat(const uschar *p)
 859 {
 860 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 861 while ((digitab[*p] & ctype_digit) != 0) p++;
 862 if (*p == '}') return TRUE;
 863
 864 if (*p++ != ',') return FALSE;
 865 if (*p == '}') return TRUE;
 866
 867 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 868 while ((digitab[*p] & ctype_digit) != 0) p++;
 869
 870 return (*p == '}');
 871 }
 872
 873
 874
 875 /*************************************************
 876 *         Read repeat counts                     *
 877 *************************************************/
 878
 879 /* Read an item of the form {n,m} and return the values. This is called only
 880 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 881 so the syntax is guaranteed to be correct, but we need to check the values.
 882
 883 Arguments:
 884   p              pointer to first char after '{'
 885   minp           pointer to int for min
 886   maxp           pointer to int for max
 887                  returned as -1 if no max
 888   errorcodeptr   points to error code variable
 889
 890 Returns:         pointer to '}' on success;
 891                  current ptr on error, with errorcodeptr set non-zero
 892 */
 893
 894 static const uschar *
 895 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
 896 {
 897 int min = 0;
 898 int max = -1;
 899
 900 /* Read the minimum value and do a paranoid check: a negative value indicates
 901 an integer overflow. */
 902
 903 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
 904 if (min < 0 || min > 65535)
 905   {
 906   *errorcodeptr = ERR5;
 907   return p;
 908   }
 909
 910 /* Read the maximum value if there is one, and again do a paranoid on its size.
 911 Also, max must not be less than min. */
 912
 913 if (*p == '}') max = min; else
 914   {
 915   if (*(++p) != '}')
 916     {
 917     max = 0;
 918     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
 919     if (max < 0 || max > 65535)
 920       {
 921       *errorcodeptr = ERR5;
 922       return p;
 923       }
 924     if (max < min)
 925       {
 926       *errorcodeptr = ERR4;
 927       return p;
 928       }
 929     }
 930   }
 931
 932 /* Fill in the required variables, and pass back the pointer to the terminating
 933 '}'. */
 934
 935 *minp = min;
 936 *maxp = max;
 937 return p;
 938 }
 939
 940
 941
 942 /*************************************************
 943 *       Find forward referenced subpattern       *
 944 *************************************************/
 945
 946 /* This function scans along a pattern's text looking for capturing
 947 subpatterns, and counting them. If it finds a named pattern that matches the
 948 name it is given, it returns its number. Alternatively, if the name is NULL, it
 949 returns when it reaches a given numbered subpattern. This is used for forward
 950 references to subpatterns. We know that if (?P< is encountered, the name will
 951 be terminated by '>' because that is checked in the first pass.
 952
 953 Arguments:
 954   ptr          current position in the pattern
 955   count        current count of capturing parens so far encountered
 956   name         name to seek, or NULL if seeking a numbered subpattern
 957   lorn         name length, or subpattern number if name is NULL
 958   xmode        TRUE if we are in /x mode
 959
 960 Returns:       the number of the named subpattern, or -1 if not found
 961 */
 962
 963 static int
 964 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
 965   BOOL xmode)
 966 {
 967 const uschar *thisname;
 968
 969 for (; *ptr != 0; ptr++)
 970   {
 971   int term;
 972
 973   /* Skip over backslashed characters and also entire \Q...\E */
 974
 975   if (*ptr == '\\')
 976     {
 977     if (*(++ptr) == 0) return -1;
 978     if (*ptr == 'Q') for (;;)
 979       {
 980       while (*(++ptr) != 0 && *ptr != '\\');
 981       if (*ptr == 0) return -1;
 982       if (*(++ptr) == 'E') break;
 983       }
 984     continue;
 985     }
 986
 987   /* Skip over character classes */
 988
 989   if (*ptr == '[')
 990     {
 991     while (*(++ptr) != ']')
 992       {
 993       if (*ptr == 0) return -1;
 994       if (*ptr == '\\')
 995         {
 996         if (*(++ptr) == 0) return -1;
 997         if (*ptr == 'Q') for (;;)
 998           {
 999           while (*(++ptr) != 0 && *ptr != '\\');
1000           if (*ptr == 0) return -1;
1001           if (*(++ptr) == 'E') break;
1002           }
1003         continue;
1004         }
1005       }
1006     continue;
1007     }
1008
1009   /* Skip comments in /x mode */
1010
1011   if (xmode && *ptr == '#')
1012     {
1013     while (*(++ptr) != 0 && *ptr != '\n');
1014     if (*ptr == 0) return -1;
1015     continue;
1016     }
1017
1018   /* An opening parens must now be a real metacharacter */
1019
1020   if (*ptr != '(') continue;
1021   if (ptr[1] != '?' && ptr[1] != '*')
1022     {
1023     count++;
1024     if (name == NULL && count == lorn) return count;
1025     continue;
1026     }
1027
1028   ptr += 2;
1029   if (*ptr == 'P') ptr++;                      /* Allow optional P */
1030
1031   /* We have to disambiguate (?<! and (?<= from (?<name> */
1032
1033   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034        *ptr != '\'')
1035     continue;
1036
1037   count++;
1038
1039   if (name == NULL && count == lorn) return count;
1040   term = *ptr++;
1041   if (term == '<') term = '>';
1042   thisname = ptr;
1043   while (*ptr != term) ptr++;
1044   if (name != NULL && lorn == ptr - thisname &&
1045       strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046     return count;
1047   }
1048
1049 return -1;
1050 }
1051
1052
1053
1054 /*************************************************
1055 *      Find first significant op code            *
1056 *************************************************/
1057
1058 /* This is called by several functions that scan a compiled expression looking
1059 for a fixed first character, or an anchoring op code etc. It skips over things
1060 that do not influence this. For some calls, a change of option is important.
1061 For some calls, it makes sense to skip negative forward and all backward
1062 assertions, and also the \b assertion; for others it does not.
1063
1064 Arguments:
1065   code         pointer to the start of the group
1066   options      pointer to external options
1067   optbit       the option bit whose changing is significant, or
1068                  zero if none are
1069   skipassert   TRUE if certain assertions are to be skipped
1070
1071 Returns:       pointer to the first significant opcode
1072 */
1073
1074 static const uschar*
1075 first_significant_code(const uschar *code, int *options, int optbit,
1076   BOOL skipassert)
1077 {
1078 for (;;)
1079   {
1080   switch ((int)*code)
1081     {
1082     case OP_OPT:
1083     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084       *options = (int)code[1];
1085     code += 2;
1086     break;
1087
1088     case OP_ASSERT_NOT:
1089     case OP_ASSERTBACK:
1090     case OP_ASSERTBACK_NOT:
1091     if (!skipassert) return code;
1092     do code += GET(code, 1); while (*code == OP_ALT);
1093     code += _pcre_OP_lengths[*code];
1094     break;
1095
1096     case OP_WORD_BOUNDARY:
1097     case OP_NOT_WORD_BOUNDARY:
1098     if (!skipassert) return code;
1099     /* Fall through */
1100
1101     case OP_CALLOUT:
1102     case OP_CREF:
1103     case OP_RREF:
1104     case OP_DEF:
1105     code += _pcre_OP_lengths[*code];
1106     break;
1107
1108     default:
1109     return code;
1110     }
1111   }
1112 /* Control never reaches here */
1113 }
1114
1115
1116
1117
1118 /*************************************************
1119 *        Find the fixed length of a pattern      *
1120 *************************************************/
1121
1122 /* Scan a pattern and compute the fixed length of subject that will match it,
1123 if the length is fixed. This is needed for dealing with backward assertions.
1124 In UTF8 mode, the result is in characters rather than bytes.
1125
1126 Arguments:
1127   code     points to the start of the pattern (the bracket)
1128   options  the compiling options
1129
1130 Returns:   the fixed length, or -1 if there is no fixed length,
1131              or -2 if \C was encountered
1132 */
1133
1134 static int
1135 find_fixedlength(uschar *code, int options)
1136 {
1137 int length = -1;
1138
1139 register int branchlength = 0;
1140 register uschar *cc = code + 1 + LINK_SIZE;
1141
1142 /* Scan along the opcodes for this branch. If we get to the end of the
1143 branch, check the length against that of the other branches. */
1144
1145 for (;;)
1146   {
1147   int d;
1148   register int op = *cc;
1149   switch (op)
1150     {
1151     case OP_CBRA:
1152     case OP_BRA:
1153     case OP_ONCE:
1154     case OP_COND:
1155     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156     if (d < 0) return d;
1157     branchlength += d;
1158     do cc += GET(cc, 1); while (*cc == OP_ALT);
1159     cc += 1 + LINK_SIZE;
1160     break;
1161
1162     /* Reached end of a branch; if it's a ket it is the end of a nested
1163     call. If it's ALT it is an alternation in a nested call. If it is
1164     END it's the end of the outer call. All can be handled by the same code. */
1165
1166     case OP_ALT:
1167     case OP_KET:
1168     case OP_KETRMAX:
1169     case OP_KETRMIN:
1170     case OP_END:
1171     if (length < 0) length = branchlength;
1172       else if (length != branchlength) return -1;
1173     if (*cc != OP_ALT) return length;
1174     cc += 1 + LINK_SIZE;
1175     branchlength = 0;
1176     break;
1177
1178     /* Skip over assertive subpatterns */
1179
1180     case OP_ASSERT:
1181     case OP_ASSERT_NOT:
1182     case OP_ASSERTBACK:
1183     case OP_ASSERTBACK_NOT:
1184     do cc += GET(cc, 1); while (*cc == OP_ALT);
1185     /* Fall through */
1186
1187     /* Skip over things that don't match chars */
1188
1189     case OP_REVERSE:
1190     case OP_CREF:
1191     case OP_RREF:
1192     case OP_DEF:
1193     case OP_OPT:
1194     case OP_CALLOUT:
1195     case OP_SOD:
1196     case OP_SOM:
1197     case OP_EOD:
1198     case OP_EODN:
1199     case OP_CIRC:
1200     case OP_DOLL:
1201     case OP_NOT_WORD_BOUNDARY:
1202     case OP_WORD_BOUNDARY:
1203     cc += _pcre_OP_lengths[*cc];
1204     break;
1205
1206     /* Handle literal characters */
1207
1208     case OP_CHAR:
1209     case OP_CHARNC:
1210     case OP_NOT:
1211     branchlength++;
1212     cc += 2;
1213 #ifdef SUPPORT_UTF8
1214     if ((options & PCRE_UTF8) != 0)
1215       {
1216       while ((*cc & 0xc0) == 0x80) cc++;
1217       }
1218 #endif
1219     break;
1220
1221     /* Handle exact repetitions. The count is already in characters, but we
1222     need to skip over a multibyte character in UTF8 mode.  */
1223
1224     case OP_EXACT:
1225     branchlength += GET2(cc,1);
1226     cc += 4;
1227 #ifdef SUPPORT_UTF8
1228     if ((options & PCRE_UTF8) != 0)
1229       {
1230       while((*cc & 0x80) == 0x80) cc++;
1231       }
1232 #endif
1233     break;
1234
1235     case OP_TYPEEXACT:
1236     branchlength += GET2(cc,1);
1237     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238     cc += 4;
1239     break;
1240
1241     /* Handle single-char matchers */
1242
1243     case OP_PROP:
1244     case OP_NOTPROP:
1245     cc += 2;
1246     /* Fall through */
1247
1248     case OP_NOT_DIGIT:
1249     case OP_DIGIT:
1250     case OP_NOT_WHITESPACE:
1251     case OP_WHITESPACE:
1252     case OP_NOT_WORDCHAR:
1253     case OP_WORDCHAR:
1254     case OP_ANY:
1255     branchlength++;
1256     cc++;
1257     break;
1258
1259     /* The single-byte matcher isn't allowed */
1260
1261     case OP_ANYBYTE:
1262     return -2;
1263
1264     /* Check a class for variable quantification */
1265
1266 #ifdef SUPPORT_UTF8
1267     case OP_XCLASS:
1268     cc += GET(cc, 1) - 33;
1269     /* Fall through */
1270 #endif
1271
1272     case OP_CLASS:
1273     case OP_NCLASS:
1274     cc += 33;
1275
1276     switch (*cc)
1277       {
1278       case OP_CRSTAR:
1279       case OP_CRMINSTAR:
1280       case OP_CRQUERY:
1281       case OP_CRMINQUERY:
1282       return -1;
1283
1284       case OP_CRRANGE:
1285       case OP_CRMINRANGE:
1286       if (GET2(cc,1) != GET2(cc,3)) return -1;
1287       branchlength += GET2(cc,1);
1288       cc += 5;
1289       break;
1290
1291       default:
1292       branchlength++;
1293       }
1294     break;
1295
1296     /* Anything else is variable length */
1297
1298     default:
1299     return -1;
1300     }
1301   }
1302 /* Control never gets here */
1303 }
1304
1305
1306
1307
1308 /*************************************************
1309 *    Scan compiled regex for numbered bracket    *
1310 *************************************************/
1311
1312 /* This little function scans through a compiled pattern until it finds a
1313 capturing bracket with the given number.
1314
1315 Arguments:
1316   code        points to start of expression
1317   utf8        TRUE in UTF-8 mode
1318   number      the required bracket number
1319
1320 Returns:      pointer to the opcode for the bracket, or NULL if not found
1321 */
1322
1323 static const uschar *
1324 find_bracket(const uschar *code, BOOL utf8, int number)
1325 {
1326 for (;;)
1327   {
1328   register int c = *code;
1329   if (c == OP_END) return NULL;
1330
1331   /* XCLASS is used for classes that cannot be represented just by a bit
1332   map. This includes negated single high-valued characters. The length in
1333   the table is zero; the actual length is stored in the compiled code. */
1334
1335   if (c == OP_XCLASS) code += GET(code, 1);
1336
1337   /* Handle capturing bracket */
1338
1339   else if (c == OP_CBRA)
1340     {
1341     int n = GET2(code, 1+LINK_SIZE);
1342     if (n == number) return (uschar *)code;
1343     code += _pcre_OP_lengths[c];
1344     }
1345
1346   /* Otherwise, we can get the item's length from the table, except that for
1347   repeated character types, we have to test for \p and \P, which have an extra
1348   two bytes of parameters. */
1349
1350   else
1351     {
1352     switch(c)
1353       {
1354       case OP_TYPESTAR:
1355       case OP_TYPEMINSTAR:
1356       case OP_TYPEPLUS:
1357       case OP_TYPEMINPLUS:
1358       case OP_TYPEQUERY:
1359       case OP_TYPEMINQUERY:
1360       case OP_TYPEPOSSTAR:
1361       case OP_TYPEPOSPLUS:
1362       case OP_TYPEPOSQUERY:
1363       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364       break;
1365
1366       case OP_TYPEUPTO:
1367       case OP_TYPEMINUPTO:
1368       case OP_TYPEEXACT:
1369       case OP_TYPEPOSUPTO:
1370       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371       break;
1372       }
1373
1374     /* Add in the fixed length from the table */
1375
1376     code += _pcre_OP_lengths[c];
1377
1378   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379   a multi-byte character. The length in the table is a minimum, so we have to
1380   arrange to skip the extra bytes. */
1381
1382 #ifdef SUPPORT_UTF8
1383     if (utf8) switch(c)
1384       {
1385       case OP_CHAR:
1386       case OP_CHARNC:
1387       case OP_EXACT:
1388       case OP_UPTO:
1389       case OP_MINUPTO:
1390       case OP_POSUPTO:
1391       case OP_STAR:
1392       case OP_MINSTAR:
1393       case OP_POSSTAR:
1394       case OP_PLUS:
1395       case OP_MINPLUS:
1396       case OP_POSPLUS:
1397       case OP_QUERY:
1398       case OP_MINQUERY:
1399       case OP_POSQUERY:
1400       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401       break;
1402       }
1403 #endif
1404     }
1405   }
1406 }
1407
1408
1409
1410 /*************************************************
1411 *   Scan compiled regex for recursion reference  *
1412 *************************************************/
1413
1414 /* This little function scans through a compiled pattern until it finds an
1415 instance of OP_RECURSE.
1416
1417 Arguments:
1418   code        points to start of expression
1419   utf8        TRUE in UTF-8 mode
1420
1421 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1422 */
1423
1424 static const uschar *
1425 find_recurse(const uschar *code, BOOL utf8)
1426 {
1427 for (;;)
1428   {
1429   register int c = *code;
1430   if (c == OP_END) return NULL;
1431   if (c == OP_RECURSE) return code;
1432
1433   /* XCLASS is used for classes that cannot be represented just by a bit
1434   map. This includes negated single high-valued characters. The length in
1435   the table is zero; the actual length is stored in the compiled code. */
1436
1437   if (c == OP_XCLASS) code += GET(code, 1);
1438
1439   /* Otherwise, we can get the item's length from the table, except that for
1440   repeated character types, we have to test for \p and \P, which have an extra
1441   two bytes of parameters. */
1442
1443   else
1444     {
1445     switch(c)
1446       {
1447       case OP_TYPESTAR:
1448       case OP_TYPEMINSTAR:
1449       case OP_TYPEPLUS:
1450       case OP_TYPEMINPLUS:
1451       case OP_TYPEQUERY:
1452       case OP_TYPEMINQUERY:
1453       case OP_TYPEPOSSTAR:
1454       case OP_TYPEPOSPLUS:
1455       case OP_TYPEPOSQUERY:
1456       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457       break;
1458
1459       case OP_TYPEPOSUPTO:
1460       case OP_TYPEUPTO:
1461       case OP_TYPEMINUPTO:
1462       case OP_TYPEEXACT:
1463       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464       break;
1465       }
1466
1467     /* Add in the fixed length from the table */
1468
1469     code += _pcre_OP_lengths[c];
1470
1471     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472     by a multi-byte character. The length in the table is a minimum, so we have
1473     to arrange to skip the extra bytes. */
1474
1475 #ifdef SUPPORT_UTF8
1476     if (utf8) switch(c)
1477       {
1478       case OP_CHAR:
1479       case OP_CHARNC:
1480       case OP_EXACT:
1481       case OP_UPTO:
1482       case OP_MINUPTO:
1483       case OP_POSUPTO:
1484       case OP_STAR:
1485       case OP_MINSTAR:
1486       case OP_POSSTAR:
1487       case OP_PLUS:
1488       case OP_MINPLUS:
1489       case OP_POSPLUS:
1490       case OP_QUERY:
1491       case OP_MINQUERY:
1492       case OP_POSQUERY:
1493       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494       break;
1495       }
1496 #endif
1497     }
1498   }
1499 }
1500
1501
1502
1503 /*************************************************
1504 *    Scan compiled branch for non-emptiness      *
1505 *************************************************/
1506
1507 /* This function scans through a branch of a compiled pattern to see whether it
1508 can match the empty string or not. It is called from could_be_empty()
1509 below and from compile_branch() when checking for an unlimited repeat of a
1510 group that can match nothing. Note that first_significant_code() skips over
1511 backward and negative forward assertions when its final argument is TRUE. If we
1512 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1513 bracket whose current branch will already have been scanned.
1514
1515 Arguments:
1516   code        points to start of search
1517   endcode     points to where to stop
1518   utf8        TRUE if in UTF8 mode
1519
1520 Returns:      TRUE if what is matched could be empty
1521 */
1522
1523 static BOOL
1524 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1525 {
1526 register int c;
1527 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1528      code < endcode;
1529      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1530   {
1531   const uschar *ccode;
1532
1533   c = *code;
1534
1535   /* Skip over forward assertions; the other assertions are skipped by
1536   first_significant_code() with a TRUE final argument. */
1537
1538   if (c == OP_ASSERT)
1539     {
1540     do code += GET(code, 1); while (*code == OP_ALT);
1541     c = *code;
1542     continue;
1543     }
1544
1545   /* Groups with zero repeats can of course be empty; skip them. */
1546
1547   if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1548     {
1549     code += _pcre_OP_lengths[c];
1550     do code += GET(code, 1); while (*code == OP_ALT);
1551     c = *code;
1552     continue;
1553     }
1554
1555   /* For other groups, scan the branches. */
1556
1557   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1558     {
1559     BOOL empty_branch;
1560     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1561
1562     /* Scan a closed bracket */
1563
1564     empty_branch = FALSE;
1565     do
1566       {
1567       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1568         empty_branch = TRUE;
1569       code += GET(code, 1);
1570       }
1571     while (*code == OP_ALT);
1572     if (!empty_branch) return FALSE;   /* All branches are non-empty */
1573     c = *code;
1574     continue;
1575     }
1576
1577   /* Handle the other opcodes */
1578
1579   switch (c)
1580     {
1581     /* Check for quantifiers after a class. XCLASS is used for classes that
1582     cannot be represented just by a bit map. This includes negated single
1583     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1584     actual length is stored in the compiled code, so we must update "code"
1585     here. */
1586
1587 #ifdef SUPPORT_UTF8
1588     case OP_XCLASS:
1589     ccode = code += GET(code, 1);
1590     goto CHECK_CLASS_REPEAT;
1591 #endif
1592
1593     case OP_CLASS:
1594     case OP_NCLASS:
1595     ccode = code + 33;
1596
1597 #ifdef SUPPORT_UTF8
1598     CHECK_CLASS_REPEAT:
1599 #endif
1600
1601     switch (*ccode)
1602       {
1603       case OP_CRSTAR:            /* These could be empty; continue */
1604       case OP_CRMINSTAR:
1605       case OP_CRQUERY:
1606       case OP_CRMINQUERY:
1607       break;
1608
1609       default:                   /* Non-repeat => class must match */
1610       case OP_CRPLUS:            /* These repeats aren't empty */
1611       case OP_CRMINPLUS:
1612       return FALSE;
1613
1614       case OP_CRRANGE:
1615       case OP_CRMINRANGE:
1616       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1617       break;
1618       }
1619     break;
1620
1621     /* Opcodes that must match a character */
1622
1623     case OP_PROP:
1624     case OP_NOTPROP:
1625     case OP_EXTUNI:
1626     case OP_NOT_DIGIT:
1627     case OP_DIGIT:
1628     case OP_NOT_WHITESPACE:
1629     case OP_WHITESPACE:
1630     case OP_NOT_WORDCHAR:
1631     case OP_WORDCHAR:
1632     case OP_ANY:
1633     case OP_ANYBYTE:
1634     case OP_CHAR:
1635     case OP_CHARNC:
1636     case OP_NOT:
1637     case OP_PLUS:
1638     case OP_MINPLUS:
1639     case OP_POSPLUS:
1640     case OP_EXACT:
1641     case OP_NOTPLUS:
1642     case OP_NOTMINPLUS:
1643     case OP_NOTPOSPLUS:
1644     case OP_NOTEXACT:
1645     case OP_TYPEPLUS:
1646     case OP_TYPEMINPLUS:
1647     case OP_TYPEPOSPLUS:
1648     case OP_TYPEEXACT:
1649     return FALSE;
1650
1651     /* These are going to continue, as they may be empty, but we have to
1652     fudge the length for the \p and \P cases. */
1653
1654     case OP_TYPESTAR:
1655     case OP_TYPEMINSTAR:
1656     case OP_TYPEPOSSTAR:
1657     case OP_TYPEQUERY:
1658     case OP_TYPEMINQUERY:
1659     case OP_TYPEPOSQUERY:
1660     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1661     break;
1662
1663     /* Same for these */
1664
1665     case OP_TYPEUPTO:
1666     case OP_TYPEMINUPTO:
1667     case OP_TYPEPOSUPTO:
1668     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1669     break;
1670
1671     /* End of branch */
1672
1673     case OP_KET:
1674     case OP_KETRMAX:
1675     case OP_KETRMIN:
1676     case OP_ALT:
1677     return TRUE;
1678
1679     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1680     MINUPTO, and POSUPTO may be followed by a multibyte character */
1681
1682 #ifdef SUPPORT_UTF8
1683     case OP_STAR:
1684     case OP_MINSTAR:
1685     case OP_POSSTAR:
1686     case OP_QUERY:
1687     case OP_MINQUERY:
1688     case OP_POSQUERY:
1689     case OP_UPTO:
1690     case OP_MINUPTO:
1691     case OP_POSUPTO:
1692     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1693     break;
1694 #endif
1695     }
1696   }
1697
1698 return TRUE;
1699 }
1700
1701
1702
1703 /*************************************************
1704 *    Scan compiled regex for non-emptiness       *
1705 *************************************************/
1706
1707 /* This function is called to check for left recursive calls. We want to check
1708 the current branch of the current pattern to see if it could match the empty
1709 string. If it could, we must look outwards for branches at other levels,
1710 stopping when we pass beyond the bracket which is the subject of the recursion.
1711
1712 Arguments:
1713   code        points to start of the recursion
1714   endcode     points to where to stop (current RECURSE item)
1715   bcptr       points to the chain of current (unclosed) branch starts
1716   utf8        TRUE if in UTF-8 mode
1717
1718 Returns:      TRUE if what is matched could be empty
1719 */
1720
1721 static BOOL
1722 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1723   BOOL utf8)
1724 {
1725 while (bcptr != NULL && bcptr->current >= code)
1726   {
1727   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1728   bcptr = bcptr->outer;
1729   }
1730 return TRUE;
1731 }
1732
1733
1734
1735 /*************************************************
1736 *           Check for POSIX class syntax         *
1737 *************************************************/
1738
1739 /* This function is called when the sequence "[:" or "[." or "[=" is
1740 encountered in a character class. It checks whether this is followed by a
1741 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1742 reach an unescaped ']' without the special preceding character, return FALSE.
1743
1744 Originally, this function only recognized a sequence of letters between the
1745 terminators, but it seems that Perl recognizes any sequence of characters,
1746 though of course unknown POSIX names are subsequently rejected. Perl gives an
1747 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1748 didn't consider this to be a POSIX class. Likewise for [:1234:].
1749
1750 The problem in trying to be exactly like Perl is in the handling of escapes. We
1751 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1752 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1753 below handles the special case of \], but does not try to do any other escape
1754 processing. This makes it different from Perl for cases such as [:l\ower:]
1755 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
 1756 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1757 I think.
1758
1759 Arguments:
1760   ptr      pointer to the initial [
1761   endptr   where to return the end pointer
1762
1763 Returns:   TRUE or FALSE
1764 */
1765
1766 static BOOL
1767 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1768 {
1769 int terminator;          /* Don't combine these lines; the Solaris cc */
1770 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1771 for (++ptr; *ptr != 0; ptr++)
1772   {
1773   if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1774     {
1775     if (*ptr == ']') return FALSE;
1776     if (*ptr == terminator && ptr[1] == ']')
1777       {
1778       *endptr = ptr;
1779       return TRUE;
1780       }
1781     }
1782   }
1783 return FALSE;
1784 }
1785
1786
1787
1788
1789 /*************************************************
1790 *          Check POSIX class name                *
1791 *************************************************/
1792
1793 /* This function is called to check the name given in a POSIX-style class entry
1794 such as [:alnum:].
1795
1796 Arguments:
1797   ptr        points to the first letter
1798   len        the length of the name
1799
1800 Returns:     a value representing the name, or -1 if unknown
1801 */
1802
1803 static int
1804 check_posix_name(const uschar *ptr, int len)
1805 {
1806 const char *pn = posix_names;
1807 register int yield = 0;
1808 while (posix_name_lengths[yield] != 0)
1809   {
1810   if (len == posix_name_lengths[yield] &&
1811     strncmp((const char *)ptr, pn, len) == 0) return yield;
1812   pn += posix_name_lengths[yield] + 1;
1813   yield++;
1814   }
1815 return -1;
1816 }
1817
1818
1819 /*************************************************
1820 *    Adjust OP_RECURSE items in repeated group   *
1821 *************************************************/
1822
1823 /* OP_RECURSE items contain an offset from the start of the regex to the group
1824 that is referenced. This means that groups can be replicated for fixed
1825 repetition simply by copying (because the recursion is allowed to refer to
1826 earlier groups that are outside the current group). However, when a group is
1827 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1828 it, after it has been compiled. This means that any OP_RECURSE items within it
1829 that refer to the group itself or any contained groups have to have their
1830 offsets adjusted. That is one of the jobs of this function. Before it is called,
1831 the partially compiled regex must be temporarily terminated with OP_END.
1832
1833 This function has been extended with the possibility of forward references for
1834 recursions and subroutine calls. It must also check the list of such references
1835 for the group we are dealing with. If it finds that one of the recursions in
1836 the current group is on this list, it adjusts the offset in the list, not the
1837 value in the reference (which is a group number).
1838
1839 Arguments:
1840   group      points to the start of the group
1841   adjust     the amount by which the group is to be moved
1842   utf8       TRUE in UTF-8 mode
1843   cd         contains pointers to tables etc.
1844   save_hwm   the hwm forward reference pointer at the start of the group
1845
1846 Returns:     nothing
1847 */
1848
1849 static void
1850 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1851   uschar *save_hwm)
1852 {
1853 uschar *ptr = group;
1854
1855 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1856   {
1857   int offset;
1858   uschar *hc;
1859
1860   /* See if this recursion is on the forward reference list. If so, adjust the
1861   reference. */
1862
1863   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1864     {
1865     offset = GET(hc, 0);
1866     if (cd->start_code + offset == ptr + 1)
1867       {
1868       PUT(hc, 0, offset + adjust);
1869       break;
1870       }
1871     }
1872
1873   /* Otherwise, adjust the recursion offset if it's after the start of this
1874   group. */
1875
1876   if (hc >= cd->hwm)
1877     {
1878     offset = GET(ptr, 1);
1879     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1880     }
1881
1882   ptr += 1 + LINK_SIZE;
1883   }
1884 }
1885
1886
1887
1888 /*************************************************
1889 *        Insert an automatic callout point       *
1890 *************************************************/
1891
1892 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1893 callout points before each pattern item.
1894
1895 Arguments:
1896   code           current code pointer
1897   ptr            current pattern pointer
1898   cd             pointers to tables etc
1899
1900 Returns:         new code pointer
1901 */
1902
1903 static uschar *
1904 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1905 {
1906 *code++ = OP_CALLOUT;
1907 *code++ = 255;
1908 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1909 PUT(code, LINK_SIZE, 0);                /* Default length */
1910 return code + 2*LINK_SIZE;
1911 }
1912
1913
1914
1915 /*************************************************
1916 *         Complete a callout item                *
1917 *************************************************/
1918
1919 /* A callout item contains the length of the next item in the pattern, which
1920 we can't fill in till after we have reached the relevant point. This is used
1921 for both automatic and manual callouts.
1922
1923 Arguments:
1924   previous_callout   points to previous callout item
1925   ptr                current pattern pointer
1926   cd                 pointers to tables etc
1927
1928 Returns:             nothing
1929 */
1930
1931 static void
1932 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1933 {
1934 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1935 PUT(previous_callout, 2 + LINK_SIZE, length);
1936 }
1937
1938
1939
1940 #ifdef SUPPORT_UCP
1941 /*************************************************
1942 *           Get othercase range                  *
1943 *************************************************/
1944
1945 /* This function is passed the start and end of a class range, in UTF-8 mode
1946 with UCP support. It searches up the characters, looking for internal ranges of
1947 characters in the "other" case. Each call returns the next one, updating the
1948 start address.
1949
1950 Arguments:
1951   cptr        points to starting character value; updated
1952   d           end value
1953   ocptr       where to put start of othercase range
1954   odptr       where to put end of othercase range
1955
1956 Yield:        TRUE when range returned; FALSE when no more
1957 */
1958
1959 static BOOL
1960 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1961   unsigned int *odptr)
1962 {
1963 unsigned int c, othercase, next;
1964
1965 for (c = *cptr; c <= d; c++)
1966   { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1967
1968 if (c > d) return FALSE;
1969
1970 *ocptr = othercase;
1971 next = othercase + 1;
1972
1973 for (++c; c <= d; c++)
1974   {
1975   if (_pcre_ucp_othercase(c) != next) break;
1976   next++;
1977   }
1978
1979 *odptr = next - 1;
1980 *cptr = c;
1981
1982 return TRUE;
1983 }
1984 #endif  /* SUPPORT_UCP */
1985
1986
1987
1988 /*************************************************
1989 *     Check if auto-possessifying is possible    *
1990 *************************************************/
1991
1992 /* This function is called for unlimited repeats of certain items, to see
1993 whether the next thing could possibly match the repeated item. If not, it makes
1994 sense to automatically possessify the repeated item.
1995
1996 Arguments:
1997   op_code       the repeated op code
1998   item          data for this item, depends on the opcode
1999   utf8          TRUE in UTF-8 mode
2000   utf8_char     used for utf8 character bytes, NULL if not relevant
2001   ptr           next character in pattern
2002   options       options bits
2003   cd            contains pointers to tables etc.
2004
2005 Returns:        TRUE if possessifying is wanted
2006 */
2007
2008 static BOOL
2009 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2010   const uschar *ptr, int options, compile_data *cd)
2011 {
2012 int next;
2013
2014 /* Skip whitespace and comments in extended mode */
2015
2016 if ((options & PCRE_EXTENDED) != 0)
2017   {
2018   for (;;)
2019     {
2020     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2021     if (*ptr == '#')
2022       {
2023       while (*(++ptr) != 0)
2024         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2025       }
2026     else break;
2027     }
2028   }
2029
2030 /* If the next item is one that we can handle, get its value. A non-negative
2031 value is a character, a negative value is an escape value. */
2032
2033 if (*ptr == '\\')
2034   {
2035   int temperrorcode = 0;
2036   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2037   if (temperrorcode != 0) return FALSE;
2038   ptr++;    /* Point after the escape sequence */
2039   }
2040
2041 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2042   {
2043 #ifdef SUPPORT_UTF8
2044   if (utf8) { GETCHARINC(next, ptr); } else
2045 #endif
2046   next = *ptr++;
2047   }
2048
2049 else return FALSE;
2050
2051 /* Skip whitespace and comments in extended mode */
2052
2053 if ((options & PCRE_EXTENDED) != 0)
2054   {
2055   for (;;)
2056     {
2057     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2058     if (*ptr == '#')
2059       {
2060       while (*(++ptr) != 0)
2061         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2062       }
2063     else break;
2064     }
2065   }
2066
2067 /* If the next thing is itself optional, we have to give up. */
2068
2069 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2070   return FALSE;
2071
2072 /* Now compare the next item with the previous opcode. If the previous is a
2073 positive single character match, "item" either contains the character or, if
2074 "item" is greater than 127 in utf8 mode, the character's bytes are in
2075 utf8_char. */
2076
2077
2078 /* Handle cases when the next item is a character. */
2079
2080 if (next >= 0) switch(op_code)
2081   {
2082   case OP_CHAR:
2083 #ifdef SUPPORT_UTF8
2084   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2085 #endif
2086   return item != next;
2087
2088   /* For CHARNC (caseless character) we must check the other case. If we have
2089   Unicode property support, we can use it to test the other case of
2090   high-valued characters. */
2091
2092   case OP_CHARNC:
2093 #ifdef SUPPORT_UTF8
2094   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2095 #endif
2096   if (item == next) return FALSE;
2097 #ifdef SUPPORT_UTF8
2098   if (utf8)
2099     {
2100     unsigned int othercase;
2101     if (next < 128) othercase = cd->fcc[next]; else
2102 #ifdef SUPPORT_UCP
2103     othercase = _pcre_ucp_othercase((unsigned int)next);
2104 #else
2105     othercase = NOTACHAR;
2106 #endif
2107     return (unsigned int)item != othercase;
2108     }
2109   else
2110 #endif  /* SUPPORT_UTF8 */
2111   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2112
2113   /* For OP_NOT, "item" must be a single-byte character. */
2114
2115   case OP_NOT:
2116   if (next < 0) return FALSE;  /* Not a character */
2117   if (item == next) return TRUE;
2118   if ((options & PCRE_CASELESS) == 0) return FALSE;
2119 #ifdef SUPPORT_UTF8
2120   if (utf8)
2121     {
2122     unsigned int othercase;
2123     if (next < 128) othercase = cd->fcc[next]; else
2124 #ifdef SUPPORT_UCP
2125     othercase = _pcre_ucp_othercase(next);
2126 #else
2127     othercase = NOTACHAR;
2128 #endif
2129     return (unsigned int)item == othercase;
2130     }
2131   else
2132 #endif  /* SUPPORT_UTF8 */
2133   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2134
2135   case OP_DIGIT:
2136   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2137
2138   case OP_NOT_DIGIT:
2139   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2140
2141   case OP_WHITESPACE:
2142   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2143
2144   case OP_NOT_WHITESPACE:
2145   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2146
2147   case OP_WORDCHAR:
2148   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2149
2150   case OP_NOT_WORDCHAR:
2151   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2152
2153   case OP_HSPACE:
2154   case OP_NOT_HSPACE:
2155   switch(next)
2156     {
2157     case 0x09:
2158     case 0x20:
2159     case 0xa0:
2160     case 0x1680:
2161     case 0x180e:
2162     case 0x2000:
2163     case 0x2001:
2164     case 0x2002:
2165     case 0x2003:
2166     case 0x2004:
2167     case 0x2005:
2168     case 0x2006:
2169     case 0x2007:
2170     case 0x2008:
2171     case 0x2009:
2172     case 0x200A:
2173     case 0x202f:
2174     case 0x205f:
2175     case 0x3000:
2176     return op_code != OP_HSPACE;
2177     default:
2178     return op_code == OP_HSPACE;
2179     }
2180
2181   case OP_VSPACE:
2182   case OP_NOT_VSPACE:
2183   switch(next)
2184     {
2185     case 0x0a:
2186     case 0x0b:
2187     case 0x0c:
2188     case 0x0d:
2189     case 0x85:
2190     case 0x2028:
2191     case 0x2029:
2192     return op_code != OP_VSPACE;
2193     default:
2194     return op_code == OP_VSPACE;
2195     }
2196
2197   default:
2198   return FALSE;
2199   }
2200
2201
2202 /* Handle the case when the next item is \d, \s, etc. */
2203
2204 switch(op_code)
2205   {
2206   case OP_CHAR:
2207   case OP_CHARNC:
2208 #ifdef SUPPORT_UTF8
2209   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2210 #endif
2211   switch(-next)
2212     {
2213     case ESC_d:
2214     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2215
2216     case ESC_D:
2217     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2218
2219     case ESC_s:
2220     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2221
2222     case ESC_S:
2223     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2224
2225     case ESC_w:
2226     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2227
2228     case ESC_W:
2229     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2230
2231     case ESC_h:
2232     case ESC_H:
2233     switch(item)
2234       {
2235       case 0x09:
2236       case 0x20:
2237       case 0xa0:
2238       case 0x1680:
2239       case 0x180e:
2240       case 0x2000:
2241       case 0x2001:
2242       case 0x2002:
2243       case 0x2003:
2244       case 0x2004:
2245       case 0x2005:
2246       case 0x2006:
2247       case 0x2007:
2248       case 0x2008:
2249       case 0x2009:
2250       case 0x200A:
2251       case 0x202f:
2252       case 0x205f:
2253       case 0x3000:
2254       return -next != ESC_h;
2255       default:
2256       return -next == ESC_h;
2257       }
2258
2259     case ESC_v:
2260     case ESC_V:
2261     switch(item)
2262       {
2263       case 0x0a:
2264       case 0x0b:
2265       case 0x0c:
2266       case 0x0d:
2267       case 0x85:
2268       case 0x2028:
2269       case 0x2029:
2270       return -next != ESC_v;
2271       default:
2272       return -next == ESC_v;
2273       }
2274
2275     default:
2276     return FALSE;
2277     }
2278
2279   case OP_DIGIT:
2280   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2281          next == -ESC_h || next == -ESC_v;
2282
2283   case OP_NOT_DIGIT:
2284   return next == -ESC_d;
2285
2286   case OP_WHITESPACE:
2287   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2288
2289   case OP_NOT_WHITESPACE:
2290   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2291
2292   case OP_HSPACE:
2293   return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2294
2295   case OP_NOT_HSPACE:
2296   return next == -ESC_h;
2297
2298   /* Can't have \S in here because VT matches \S (Perl anomaly) */
2299   case OP_VSPACE:
2300   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2301
2302   case OP_NOT_VSPACE:
2303   return next == -ESC_v;
2304
2305   case OP_WORDCHAR:
2306   return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2307
2308   case OP_NOT_WORDCHAR:
2309   return next == -ESC_w || next == -ESC_d;
2310
2311   default:
2312   return FALSE;
2313   }
2314
2315 /* Control does not reach here */
2316 }
2317
2318
2319
2320 /*************************************************
2321 *           Compile one branch                   *
2322 *************************************************/
2323
2324 /* Scan the pattern, compiling it into a vector. If the options are
2325 changed during the branch, the pointer is used to change the external options
2326 bits. This function is used during the pre-compile phase when we are trying
2327 to find out the amount of memory needed, as well as during the real compile
2328 phase. The value of lengthptr distinguishes the two phases.
2329
2330 Arguments:
2331   optionsptr     pointer to the option bits
2332   codeptr        points to the pointer to the current code point
2333   ptrptr         points to the current pattern pointer
2334   errorcodeptr   points to error code variable
2335   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2336   reqbyteptr     set to the last literal character required, else < 0
2337   bcptr          points to current branch chain
2338   cd             contains pointers to tables etc.
2339   lengthptr      NULL during the real compile phase
2340                  points to length accumulator during pre-compile phase
2341
2342 Returns:         TRUE on success
2343                  FALSE, with *errorcodeptr set non-zero on error
2344 */
2345
2346 static BOOL
2347 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2348   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2349   compile_data *cd, int *lengthptr)
2350 {
2351 int repeat_type, op_type;
2352 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2353 int bravalue = 0;
2354 int greedy_default, greedy_non_default;
2355 int firstbyte, reqbyte;
2356 int zeroreqbyte, zerofirstbyte;
2357 int req_caseopt, reqvary, tempreqvary;
2358 int options = *optionsptr;
2359 int after_manual_callout = 0;
2360 int length_prevgroup = 0;
2361 register int c;
2362 register uschar *code = *codeptr;
2363 uschar *last_code = code;
2364 uschar *orig_code = code;
2365 uschar *tempcode;
2366 BOOL inescq = FALSE;
2367 BOOL groupsetfirstbyte = FALSE;
2368 const uschar *ptr = *ptrptr;
2369 const uschar *tempptr;
2370 uschar *previous = NULL;
2371 uschar *previous_callout = NULL;
2372 uschar *save_hwm = NULL;
2373 uschar classbits[32];
2374
2375 #ifdef SUPPORT_UTF8
2376 BOOL class_utf8;
2377 BOOL utf8 = (options & PCRE_UTF8) != 0;
2378 uschar *class_utf8data;
2379 uschar *class_utf8data_base;
2380 uschar utf8_char[6];
2381 #else
2382 BOOL utf8 = FALSE;
2383 uschar *utf8_char = NULL;
2384 #endif
2385
2386 #ifdef DEBUG
2387 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2388 #endif
2389
2390 /* Set up the default and non-default settings for greediness */
2391
2392 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2393 greedy_non_default = greedy_default ^ 1;
2394
2395 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2396 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2397 matches a non-fixed char first char; reqbyte just remains unset if we never
2398 find one.
2399
2400 When we hit a repeat whose minimum is zero, we may have to adjust these values
2401 to take the zero repeat into account. This is implemented by setting them to
2402 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2403 item types that can be repeated set these backoff variables appropriately. */
2404
2405 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2406
2407 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2408 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2409 value > 255. It is added into the firstbyte or reqbyte variables to record the
2410 case status of the value. This is used only for ASCII characters. */
2411
2412 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2413
2414 /* Switch on next character until the end of the branch */
2415
2416 for (;; ptr++)
2417   {
2418   BOOL negate_class;
2419   BOOL should_flip_negation;
2420   BOOL possessive_quantifier;
2421   BOOL is_quantifier;
2422   BOOL is_recurse;
2423   BOOL reset_bracount;
2424   int class_charcount;
2425   int class_lastchar;
2426   int newoptions;
2427   int recno;
2428   int refsign;
2429   int skipbytes;
2430   int subreqbyte;
2431   int subfirstbyte;
2432   int terminator;
2433   int mclength;
2434   uschar mcbuffer[8];
2435
2436   /* Get next byte in the pattern */
2437
2438   c = *ptr;
2439
2440   /* If we are in the pre-compile phase, accumulate the length used for the
2441   previous cycle of this loop. */
2442
2443   if (lengthptr != NULL)
2444     {
2445 #ifdef DEBUG
2446     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2447 #endif
2448     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2449       {
2450       *errorcodeptr = ERR52;
2451       goto FAILED;
2452       }
2453
2454     /* There is at least one situation where code goes backwards: this is the
2455     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2456     the class is simply eliminated. However, it is created first, so we have to
2457     allow memory for it. Therefore, don't ever reduce the length at this point.
2458     */
2459
2460     if (code < last_code) code = last_code;
2461
2462     /* Paranoid check for integer overflow */
2463
2464     if (OFLOW_MAX - *lengthptr < code - last_code)
2465       {
2466       *errorcodeptr = ERR20;
2467       goto FAILED;
2468       }
2469
2470     *lengthptr += code - last_code;
2471     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2472
2473     /* If "previous" is set and it is not at the start of the work space, move
2474     it back to there, in order to avoid filling up the work space. Otherwise,
2475     if "previous" is NULL, reset the current code pointer to the start. */
2476
2477     if (previous != NULL)
2478       {
2479       if (previous > orig_code)
2480         {
2481         memmove(orig_code, previous, code - previous);
2482         code -= previous - orig_code;
2483         previous = orig_code;
2484         }
2485       }
2486     else code = orig_code;
2487
2488     /* Remember where this code item starts so we can pick up the length
2489     next time round. */
2490
2491     last_code = code;
2492     }
2493
2494   /* In the real compile phase, just check the workspace used by the forward
2495   reference list. */
2496
2497   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2498     {
2499     *errorcodeptr = ERR52;
2500     goto FAILED;
2501     }
2502
2503   /* If in \Q...\E, check for the end; if not, we have a literal */
2504
2505   if (inescq && c != 0)
2506     {
2507     if (c == '\\' && ptr[1] == 'E')
2508       {
2509       inescq = FALSE;
2510       ptr++;
2511       continue;
2512       }
2513     else
2514       {
2515       if (previous_callout != NULL)
2516         {
2517         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2518           complete_callout(previous_callout, ptr, cd);
2519         previous_callout = NULL;
2520         }
2521       if ((options & PCRE_AUTO_CALLOUT) != 0)
2522         {
2523         previous_callout = code;
2524         code = auto_callout(code, ptr, cd);
2525         }
2526       goto NORMAL_CHAR;
2527       }
2528     }
2529
2530   /* Fill in length of a previous callout, except when the next thing is
2531   a quantifier. */
2532
2533   is_quantifier = c == '*' || c == '+' || c == '?' ||
2534     (c == '{' && is_counted_repeat(ptr+1));
2535
2536   if (!is_quantifier && previous_callout != NULL &&
2537        after_manual_callout-- <= 0)
2538     {
2539     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2540       complete_callout(previous_callout, ptr, cd);
2541     previous_callout = NULL;
2542     }
2543
2544   /* In extended mode, skip white space and comments */
2545
2546   if ((options & PCRE_EXTENDED) != 0)
2547     {
2548     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2549     if (c == '#')
2550       {
2551       while (*(++ptr) != 0)
2552         {
2553         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2554         }
2555       if (*ptr != 0) continue;
2556
2557       /* Else fall through to handle end of string */
2558       c = 0;
2559       }
2560     }
2561
2562   /* No auto callout for quantifiers. */
2563
2564   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2565     {
2566     previous_callout = code;
2567     code = auto_callout(code, ptr, cd);
2568     }
2569
2570   switch(c)
2571     {
2572     /* ===================================================================*/
2573     case 0:                        /* The branch terminates at string end */
2574     case '|':                      /* or | or ) */
2575     case ')':
2576     *firstbyteptr = firstbyte;
2577     *reqbyteptr = reqbyte;
2578     *codeptr = code;
2579     *ptrptr = ptr;
2580     if (lengthptr != NULL)
2581       {
2582       if (OFLOW_MAX - *lengthptr < code - last_code)
2583         {
2584         *errorcodeptr = ERR20;
2585         goto FAILED;
2586         }
2587       *lengthptr += code - last_code;   /* To include callout length */
2588       DPRINTF((">> end branch\n"));
2589       }
2590     return TRUE;
2591
2592
2593     /* ===================================================================*/
2594     /* Handle single-character metacharacters. In multiline mode, ^ disables
2595     the setting of any following char as a first character. */
2596
2597     case '^':
2598     if ((options & PCRE_MULTILINE) != 0)
2599       {
2600       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2601       }
2602     previous = NULL;
2603     *code++ = OP_CIRC;
2604     break;
2605
2606     case '$':
2607     previous = NULL;
2608     *code++ = OP_DOLL;
2609     break;
2610
2611     /* There can never be a first char if '.' is first, whatever happens about
2612     repeats. The value of reqbyte doesn't change either. */
2613
2614     case '.':
2615     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2616     zerofirstbyte = firstbyte;
2617     zeroreqbyte = reqbyte;
2618     previous = code;
2619     *code++ = OP_ANY;
2620     break;
2621
2622
2623     /* ===================================================================*/
2624     /* Character classes. If the included characters are all < 256, we build a
2625     32-byte bitmap of the permitted characters, except in the special case
2626     where there is only one such character. For negated classes, we build the
2627     map as usual, then invert it at the end. However, we use a different opcode
2628     so that data characters > 255 can be handled correctly.
2629
2630     If the class contains characters outside the 0-255 range, a different
2631     opcode is compiled. It may optionally have a bit map for characters < 256,
2632     but those above are explicitly listed afterwards. A flag byte tells
2633     whether the bitmap is present, and whether this is a negated class or not.
2634     */
2635
2636     case '[':
2637     previous = code;
2638
2639     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2640     they are encountered at the top level, so we'll do that too. */
2641
2642     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2643         check_posix_syntax(ptr, &tempptr))
2644       {
2645       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2646       goto FAILED;
2647       }
2648
2649     /* If the first character is '^', set the negation flag and skip it. Also,
2650     if the first few characters (either before or after ^) are \Q\E or \E we
2651     skip them too. This makes for compatibility with Perl. */
2652
2653     negate_class = FALSE;
2654     for (;;)
2655       {
2656       c = *(++ptr);
2657       if (c == '\\')
2658         {
2659         if (ptr[1] == 'E') ptr++;
2660           else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2661             else break;
2662         }
2663       else if (!negate_class && c == '^')
2664         negate_class = TRUE;
2665       else break;
2666       }
2667
2668     /* If a class contains a negative special such as \S, we need to flip the
2669     negation flag at the end, so that support for characters > 255 works
2670     correctly (they are all included in the class). */
2671
2672     should_flip_negation = FALSE;
2673
2674     /* Keep a count of chars with values < 256 so that we can optimize the case
2675     of just a single character (as long as it's < 256). However, For higher
2676     valued UTF-8 characters, we don't yet do any optimization. */
2677
2678     class_charcount = 0;
2679     class_lastchar = -1;
2680
2681     /* Initialize the 32-char bit map to all zeros. We build the map in a
2682     temporary bit of memory, in case the class contains only 1 character (less
2683     than 256), because in that case the compiled code doesn't use the bit map.
2684     */
2685
2686     memset(classbits, 0, 32 * sizeof(uschar));
2687
2688 #ifdef SUPPORT_UTF8
2689     class_utf8 = FALSE;                       /* No chars >= 256 */
2690     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2691     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2692 #endif
2693
2694     /* Process characters until ] is reached. By writing this as a "do" it
2695     means that an initial ] is taken as a data character. At the start of the
2696     loop, c contains the first byte of the character. */
2697
2698     if (c != 0) do
2699       {
2700       const uschar *oldptr;
2701
2702 #ifdef SUPPORT_UTF8
2703       if (utf8 && c > 127)
2704         {                           /* Braces are required because the */
2705         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2706         }
2707
2708       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2709       data and reset the pointer. This is so that very large classes that
2710       contain a zillion UTF-8 characters no longer overwrite the work space
2711       (which is on the stack). */
2712
2713       if (lengthptr != NULL)
2714         {
2715         *lengthptr += class_utf8data - class_utf8data_base;
2716         class_utf8data = class_utf8data_base;
2717         }
2718
2719 #endif
2720
2721       /* Inside \Q...\E everything is literal except \E */
2722
2723       if (inescq)
2724         {
2725         if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2726           {
2727           inescq = FALSE;                   /* Reset literal state */
2728           ptr++;                            /* Skip the 'E' */
2729           continue;                         /* Carry on with next */
2730           }
2731         goto CHECK_RANGE;                   /* Could be range if \E follows */
2732         }
2733
2734       /* Handle POSIX class names. Perl allows a negation extension of the
2735       form [:^name:]. A square bracket that doesn't match the syntax is
2736       treated as a literal. We also recognize the POSIX constructions
2737       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2738       5.6 and 5.8 do. */
2739
2740       if (c == '[' &&
2741           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2742           check_posix_syntax(ptr, &tempptr))
2743         {
2744         BOOL local_negate = FALSE;
2745         int posix_class, taboffset, tabopt;
2746         register const uschar *cbits = cd->cbits;
2747         uschar pbits[32];
2748
2749         if (ptr[1] != ':')
2750           {
2751           *errorcodeptr = ERR31;
2752           goto FAILED;
2753           }
2754
2755         ptr += 2;
2756         if (*ptr == '^')
2757           {
2758           local_negate = TRUE;
2759           should_flip_negation = TRUE;  /* Note negative special */
2760           ptr++;
2761           }
2762
2763         posix_class = check_posix_name(ptr, tempptr - ptr);
2764         if (posix_class < 0)
2765           {
2766           *errorcodeptr = ERR30;
2767           goto FAILED;
2768           }
2769
2770         /* If matching is caseless, upper and lower are converted to
2771         alpha. This relies on the fact that the class table starts with
2772         alpha, lower, upper as the first 3 entries. */
2773
2774         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2775           posix_class = 0;
2776
2777         /* We build the bit map for the POSIX class in a chunk of local store
2778         because we may be adding and subtracting from it, and we don't want to
2779         subtract bits that may be in the main map already. At the end we or the
2780         result into the bit map that is being built. */
2781
2782         posix_class *= 3;
2783
2784         /* Copy in the first table (always present) */
2785
2786         memcpy(pbits, cbits + posix_class_maps[posix_class],
2787           32 * sizeof(uschar));
2788
2789         /* If there is a second table, add or remove it as required. */
2790
2791         taboffset = posix_class_maps[posix_class + 1];
2792         tabopt = posix_class_maps[posix_class + 2];
2793
2794         if (taboffset >= 0)
2795           {
2796           if (tabopt >= 0)
2797             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2798           else
2799             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2800           }
2801
2802         /* Now see if we need to remove any special characters. An option
2803         value of 1 removes vertical space and 2 removes underscore. */
2804
2805         if (tabopt < 0) tabopt = -tabopt;
2806         if (tabopt == 1) pbits[1] &= ~0x3c;
2807           else if (tabopt == 2) pbits[11] &= 0x7f;
2808
2809         /* Add the POSIX table or its complement into the main table that is
2810         being built and we are done. */
2811
2812         if (local_negate)
2813           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2814         else
2815           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2816
2817         ptr = tempptr + 1;
2818         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2819         continue;    /* End of POSIX syntax handling */
2820         }
2821
2822       /* Backslash may introduce a single character, or it may introduce one
2823       of the specials, which just set a flag. The sequence \b is a special
2824       case. Inside a class (and only there) it is treated as backspace.
2825       Elsewhere it marks a word boundary. Other escapes have preset maps ready
2826       to 'or' into the one we are building. We assume they have more than one
2827       character in them, so set class_charcount bigger than one. */
2828
2829       if (c == '\\')
2830         {
2831         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2832         if (*errorcodeptr != 0) goto FAILED;
2833
2834         if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2835         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2836         else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2837         else if (-c == ESC_Q)            /* Handle start of quoted string */
2838           {
2839           if (ptr[1] == '\\' && ptr[2] == 'E')
2840             {
2841             ptr += 2; /* avoid empty string */
2842             }
2843           else inescq = TRUE;
2844           continue;
2845           }
2846         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2847
2848         if (c < 0)
2849           {
2850           register const uschar *cbits = cd->cbits;
2851           class_charcount += 2;     /* Greater than 1 is what matters */
2852
2853           /* Save time by not doing this in the pre-compile phase. */
2854
2855           if (lengthptr == NULL) switch (-c)
2856             {
2857             case ESC_d:
2858             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2859             continue;
2860
2861             case ESC_D:
2862             should_flip_negation = TRUE;
2863             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2864             continue;
2865
2866             case ESC_w:
2867             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2868             continue;
2869
2870             case ESC_W:
2871             should_flip_negation = TRUE;
2872             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2873             continue;
2874
2875             case ESC_s:
2876             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2877             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2878             continue;
2879
2880             case ESC_S:
2881             should_flip_negation = TRUE;
2882             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2883             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2884             continue;
2885
2886             default:    /* Not recognized; fall through */
2887             break;      /* Need "default" setting to stop compiler warning. */
2888             }
2889
2890           /* In the pre-compile phase, just do the recognition. */
2891
2892           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2893                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2894
2895           /* We need to deal with \H, \h, \V, and \v in both phases because
2896           they use extra memory. */
2897
2898           if (-c == ESC_h)
2899             {
2900             SETBIT(classbits, 0x09); /* VT */
2901             SETBIT(classbits, 0x20); /* SPACE */
2902             SETBIT(classbits, 0xa0); /* NSBP */
2903 #ifdef SUPPORT_UTF8
2904             if (utf8)
2905               {
2906               class_utf8 = TRUE;
2907               *class_utf8data++ = XCL_SINGLE;
2908               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2909               *class_utf8data++ = XCL_SINGLE;
2910               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2911               *class_utf8data++ = XCL_RANGE;
2912               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2913               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2914               *class_utf8data++ = XCL_SINGLE;
2915               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2916               *class_utf8data++ = XCL_SINGLE;
2917               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2918               *class_utf8data++ = XCL_SINGLE;
2919               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2920               }
2921 #endif
2922             continue;
2923             }
2924
2925           if (-c == ESC_H)
2926             {
2927             for (c = 0; c < 32; c++)
2928               {
2929               int x = 0xff;
2930               switch (c)
2931                 {
2932                 case 0x09/8: x ^= 1 << (0x09%8); break;
2933                 case 0x20/8: x ^= 1 << (0x20%8); break;
2934                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2935                 default: break;
2936                 }
2937               classbits[c] |= x;
2938               }
2939
2940 #ifdef SUPPORT_UTF8
2941             if (utf8)
2942               {
2943               class_utf8 = TRUE;
2944               *class_utf8data++ = XCL_RANGE;
2945               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2946               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2947               *class_utf8data++ = XCL_RANGE;
2948               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2949               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2950               *class_utf8data++ = XCL_RANGE;
2951               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2952               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2953               *class_utf8data++ = XCL_RANGE;
2954               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2955               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2956               *class_utf8data++ = XCL_RANGE;
2957               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2958               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2959               *class_utf8data++ = XCL_RANGE;
2960               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2961               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2962               *class_utf8data++ = XCL_RANGE;
2963               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2964               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2965               }
2966 #endif
2967             continue;
2968             }
2969
2970           if (-c == ESC_v)
2971             {
2972             SETBIT(classbits, 0x0a); /* LF */
2973             SETBIT(classbits, 0x0b); /* VT */
2974             SETBIT(classbits, 0x0c); /* FF */
2975             SETBIT(classbits, 0x0d); /* CR */
2976             SETBIT(classbits, 0x85); /* NEL */
2977 #ifdef SUPPORT_UTF8
2978             if (utf8)
2979               {
2980               class_utf8 = TRUE;
2981               *class_utf8data++ = XCL_RANGE;
2982               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2983               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2984               }
2985 #endif
2986             continue;
2987             }
2988
2989           if (-c == ESC_V)
2990             {
2991             for (c = 0; c < 32; c++)
2992               {
2993               int x = 0xff;
2994               switch (c)
2995                 {
2996                 case 0x0a/8: x ^= 1 << (0x0a%8);
2997                              x ^= 1 << (0x0b%8);
2998                              x ^= 1 << (0x0c%8);
2999                              x ^= 1 << (0x0d%8);
3000                              break;
3001                 case 0x85/8: x ^= 1 << (0x85%8); break;
3002                 default: break;
3003                 }
3004               classbits[c] |= x;
3005               }
3006
3007 #ifdef SUPPORT_UTF8
3008             if (utf8)
3009               {
3010               class_utf8 = TRUE;
3011               *class_utf8data++ = XCL_RANGE;
3012               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3013               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3014               *class_utf8data++ = XCL_RANGE;
3015               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3016               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3017               }
3018 #endif
3019             continue;
3020             }
3021
3022           /* We need to deal with \P and \p in both phases. */
3023
3024 #ifdef SUPPORT_UCP
3025           if (-c == ESC_p || -c == ESC_P)
3026             {
3027             BOOL negated;
3028             int pdata;
3029             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3030             if (ptype < 0) goto FAILED;
3031             class_utf8 = TRUE;
3032             *class_utf8data++ = ((-c == ESC_p) != negated)?
3033               XCL_PROP : XCL_NOTPROP;
3034             *class_utf8data++ = ptype;
3035             *class_utf8data++ = pdata;
3036             class_charcount -= 2;   /* Not a < 256 character */
3037             continue;
3038             }
3039 #endif
3040           /* Unrecognized escapes are faulted if PCRE is running in its
3041           strict mode. By default, for compatibility with Perl, they are
3042           treated as literals. */
3043
3044           if ((options & PCRE_EXTRA) != 0)
3045             {
3046             *errorcodeptr = ERR7;
3047             goto FAILED;
3048             }
3049
3050           class_charcount -= 2;  /* Undo the default count from above */
3051           c = *ptr;              /* Get the final character and fall through */
3052           }
3053
3054         /* Fall through if we have a single character (c >= 0). This may be
3055         greater than 256 in UTF-8 mode. */
3056
3057         }   /* End of backslash handling */
3058
3059       /* A single character may be followed by '-' to form a range. However,
3060       Perl does not permit ']' to be the end of the range. A '-' character
3061       at the end is treated as a literal. Perl ignores orphaned \E sequences
3062       entirely. The code for handling \Q and \E is messy. */
3063
3064       CHECK_RANGE:
3065       while (ptr[1] == '\\' && ptr[2] == 'E')
3066         {
3067         inescq = FALSE;
3068         ptr += 2;
3069         }
3070
3071       oldptr = ptr;
3072
3073       /* Remember \r or \n */
3074
3075       if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3076
3077       /* Check for range */
3078
3079       if (!inescq && ptr[1] == '-')
3080         {
3081         int d;
3082         ptr += 2;
3083         while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3084
3085         /* If we hit \Q (not followed by \E) at this point, go into escaped
3086         mode. */
3087
3088         while (*ptr == '\\' && ptr[1] == 'Q')
3089           {
3090           ptr += 2;
3091           if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3092           inescq = TRUE;
3093           break;
3094           }
3095
3096         if (*ptr == 0 || (!inescq && *ptr == ']'))
3097           {
3098           ptr = oldptr;
3099           goto LONE_SINGLE_CHARACTER;
3100           }
3101
3102 #ifdef SUPPORT_UTF8
3103         if (utf8)
3104           {                           /* Braces are required because the */
3105           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3106           }
3107         else
3108 #endif
3109         d = *ptr;  /* Not UTF-8 mode */
3110
3111         /* The second part of a range can be a single-character escape, but
3112         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3113         in such circumstances. */
3114
3115         if (!inescq && d == '\\')
3116           {
3117           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3118           if (*errorcodeptr != 0) goto FAILED;
3119
3120           /* \b is backspace; \X is literal X; \R is literal R; any other
3121           special means the '-' was literal */
3122
3123           if (d < 0)
3124             {
3125             if (d == -ESC_b) d = '\b';
3126             else if (d == -ESC_X) d = 'X';
3127             else if (d == -ESC_R) d = 'R'; else
3128               {
3129               ptr = oldptr;
3130               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3131               }
3132             }
3133           }
3134
3135         /* Check that the two values are in the correct order. Optimize
3136         one-character ranges */
3137
3138         if (d < c)
3139           {
3140           *errorcodeptr = ERR8;
3141           goto FAILED;
3142           }
3143
3144         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3145
3146         /* Remember \r or \n */
3147
3148         if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3149
3150         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3151         matching, we have to use an XCLASS with extra data items. Caseless
3152         matching for characters > 127 is available only if UCP support is
3153         available. */
3154
3155 #ifdef SUPPORT_UTF8
3156         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3157           {
3158           class_utf8 = TRUE;
3159
3160           /* With UCP support, we can find the other case equivalents of
3161           the relevant characters. There may be several ranges. Optimize how
3162           they fit with the basic range. */
3163
3164 #ifdef SUPPORT_UCP
3165           if ((options & PCRE_CASELESS) != 0)
3166             {
3167             unsigned int occ, ocd;
3168             unsigned int cc = c;
3169             unsigned int origd = d;
3170             while (get_othercase_range(&cc, origd, &occ, &ocd))
3171               {
3172               if (occ >= (unsigned int)c &&
3173                   ocd <= (unsigned int)d)
3174                 continue;                          /* Skip embedded ranges */
3175
3176               if (occ < (unsigned int)c  &&
3177                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3178                 {                                  /* if there is overlap,   */
3179                 c = occ;                           /* noting that if occ < c */
3180                 continue;                          /* we can't have ocd > d  */
3181                 }                                  /* because a subrange is  */
3182               if (ocd > (unsigned int)d &&
3183                   occ <= (unsigned int)d + 1)      /* always shorter than    */
3184                 {                                  /* the basic range.       */
3185                 d = ocd;
3186                 continue;
3187                 }
3188
3189               if (occ == ocd)
3190                 {
3191                 *class_utf8data++ = XCL_SINGLE;
3192                 }
3193               else
3194                 {
3195                 *class_utf8data++ = XCL_RANGE;
3196                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3197                 }
3198               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3199               }
3200             }
3201 #endif  /* SUPPORT_UCP */
3202
3203           /* Now record the original range, possibly modified for UCP caseless
3204           overlapping ranges. */
3205
3206           *class_utf8data++ = XCL_RANGE;
3207           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3208           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3209
3210           /* With UCP support, we are done. Without UCP support, there is no
3211           caseless matching for UTF-8 characters > 127; we can use the bit map
3212           for the smaller ones. */
3213
3214 #ifdef SUPPORT_UCP
3215           continue;    /* With next character in the class */
3216 #else
3217           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3218
3219           /* Adjust upper limit and fall through to set up the map */
3220
3221           d = 127;
3222
3223 #endif  /* SUPPORT_UCP */
3224           }
3225 #endif  /* SUPPORT_UTF8 */
3226
3227         /* We use the bit map for all cases when not in UTF-8 mode; else
3228         ranges that lie entirely within 0-127 when there is UCP support; else
3229         for partial ranges without UCP support. */
3230
3231         class_charcount += d - c + 1;
3232         class_lastchar = d;
3233
3234         /* We can save a bit of time by skipping this in the pre-compile. */
3235
3236         if (lengthptr == NULL) for (; c <= d; c++)
3237           {
3238           classbits[c/8] |= (1 << (c&7));
3239           if ((options & PCRE_CASELESS) != 0)
3240             {
3241             int uc = cd->fcc[c];           /* flip case */
3242             classbits[uc/8] |= (1 << (uc&7));
3243             }
3244           }
3245
3246         continue;   /* Go get the next char in the class */
3247         }
3248
3249       /* Handle a lone single character - we can get here for a normal
3250       non-escape char, or after \ that introduces a single character or for an
3251       apparent range that isn't. */
3252
3253       LONE_SINGLE_CHARACTER:
3254
3255       /* Handle a character that cannot go in the bit map */
3256
3257 #ifdef SUPPORT_UTF8
3258       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3259         {
3260         class_utf8 = TRUE;
3261         *class_utf8data++ = XCL_SINGLE;
3262         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3263
3264 #ifdef SUPPORT_UCP
3265         if ((options & PCRE_CASELESS) != 0)
3266           {
3267           unsigned int othercase;
3268           if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3269             {
3270             *class_utf8data++ = XCL_SINGLE;
3271             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3272             }
3273           }
3274 #endif  /* SUPPORT_UCP */
3275
3276         }
3277       else
3278 #endif  /* SUPPORT_UTF8 */
3279
3280       /* Handle a single-byte character */
3281         {
3282         classbits[c/8] |= (1 << (c&7));
3283         if ((options & PCRE_CASELESS) != 0)
3284           {
3285           c = cd->fcc[c];   /* flip case */
3286           classbits[c/8] |= (1 << (c&7));
3287           }
3288         class_charcount++;
3289         class_lastchar = c;
3290         }
3291       }
3292
3293     /* Loop until ']' reached. This "while" is the end of the "do" above. */
3294
3295     while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3296
3297     if (c == 0)                          /* Missing terminating ']' */
3298       {
3299       *errorcodeptr = ERR6;
3300       goto FAILED;
3301       }
3302
3303
3304 /* This code has been disabled because it would mean that \s counts as
3305 an explicit \r or \n reference, and that's not really what is wanted. Now
3306 we set the flag only if there is a literal "\r" or "\n" in the class. */
3307
3308 #if 0
3309     /* Remember whether \r or \n are in this class */
3310
3311     if (negate_class)
3312       {
3313       if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3314       }
3315     else
3316       {
3317       if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3318       }
3319 #endif
3320
3321
3322     /* If class_charcount is 1, we saw precisely one character whose value is
3323     less than 256. As long as there were no characters >= 128 and there was no
3324     use of \p or \P, in other words, no use of any XCLASS features, we can
3325     optimize.
3326
3327     In UTF-8 mode, we can optimize the negative case only if there were no
3328     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3329     operate on single-bytes only. This is an historical hangover. Maybe one day
3330     we can tidy these opcodes to handle multi-byte characters.
3331
3332     The optimization throws away the bit map. We turn the item into a
3333     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3334     that OP_NOT does not support multibyte characters. In the positive case, it
3335     can cause firstbyte to be set. Otherwise, there can be no first char if
3336     this item is first, whatever repeat count may follow. In the case of
3337     reqbyte, save the previous value for reinstating. */
3338
3339 #ifdef SUPPORT_UTF8
3340     if (class_charcount == 1 && !class_utf8 &&
3341       (!utf8 || !negate_class || class_lastchar < 128))
3342 #else
3343     if (class_charcount == 1)
3344 #endif
3345       {
3346       zeroreqbyte = reqbyte;
3347
3348       /* The OP_NOT opcode works on one-byte characters only. */
3349
3350       if (negate_class)
3351         {
3352         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3353         zerofirstbyte = firstbyte;
3354         *code++ = OP_NOT;
3355         *code++ = class_lastchar;
3356         break;
3357         }
3358
3359       /* For a single, positive character, get the value into mcbuffer, and
3360       then we can handle this with the normal one-character code. */
3361
3362 #ifdef SUPPORT_UTF8
3363       if (utf8 && class_lastchar > 127)
3364         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3365       else
3366 #endif
3367         {
3368         mcbuffer[0] = class_lastchar;
3369         mclength = 1;
3370         }
3371       goto ONE_CHAR;
3372       }       /* End of 1-char optimization */
3373
3374     /* The general case - not the one-char optimization. If this is the first
3375     thing in the branch, there can be no first char setting, whatever the
3376     repeat count. Any reqbyte setting must remain unchanged after any kind of
3377     repeat. */
3378
3379     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3380     zerofirstbyte = firstbyte;
3381     zeroreqbyte = reqbyte;
3382
3383     /* If there are characters with values > 255, we have to compile an
3384     extended class, with its own opcode, unless there was a negated special
3385     such as \S in the class, because in that case all characters > 255 are in
3386     the class, so any that were explicitly given as well can be ignored. If
3387     (when there are explicit characters > 255 that must be listed) there are no
3388     characters < 256, we can omit the bitmap in the actual compiled code. */
3389
3390 #ifdef SUPPORT_UTF8
3391     if (class_utf8 && !should_flip_negation)
3392       {
3393       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3394       *code++ = OP_XCLASS;
3395       code += LINK_SIZE;
3396       *code = negate_class? XCL_NOT : 0;
3397
3398       /* If the map is required, move up the extra data to make room for it;
3399       otherwise just move the code pointer to the end of the extra data. */
3400
3401       if (class_charcount > 0)
3402         {
3403         *code++ |= XCL_MAP;
3404         memmove(code + 32, code, class_utf8data - code);
3405         memcpy(code, classbits, 32);
3406         code = class_utf8data + 32;
3407         }
3408       else code = class_utf8data;
3409
3410       /* Now fill in the complete length of the item */
3411
3412       PUT(previous, 1, code - previous);
3413       break;   /* End of class handling */
3414       }
3415 #endif
3416
3417     /* If there are no characters > 255, set the opcode to OP_CLASS or
3418     OP_NCLASS, depending on whether the whole class was negated and whether
3419     there were negative specials such as \S in the class. Then copy the 32-byte
3420     map into the code vector, negating it if necessary. */
3421
3422     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3423     if (negate_class)
3424       {
3425       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3426         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3427       }
3428     else
3429       {
3430       memcpy(code, classbits, 32);
3431       }
3432     code += 32;
3433     break;
3434
3435
3436     /* ===================================================================*/
3437     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3438     has been tested above. */
3439
3440     case '{':
3441     if (!is_quantifier) goto NORMAL_CHAR;
3442     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3443     if (*errorcodeptr != 0) goto FAILED;
3444     goto REPEAT;
3445
3446     case '*':
3447     repeat_min = 0;
3448     repeat_max = -1;
3449     goto REPEAT;
3450
3451     case '+':
3452     repeat_min = 1;
3453     repeat_max = -1;
3454     goto REPEAT;
3455
3456     case '?':
3457     repeat_min = 0;
3458     repeat_max = 1;
3459
3460     REPEAT:
3461     if (previous == NULL)
3462       {
3463       *errorcodeptr = ERR9;
3464       goto FAILED;
3465       }
3466
3467     if (repeat_min == 0)
3468       {
3469       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3470       reqbyte = zeroreqbyte;        /* Ditto */
3471       }
3472
3473     /* Remember whether this is a variable length repeat */
3474
3475     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3476
3477     op_type = 0;                    /* Default single-char op codes */
3478     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3479
3480     /* Save start of previous item, in case we have to move it up to make space
3481     for an inserted OP_ONCE for the additional '+' extension. */
3482
3483     tempcode = previous;
3484
3485     /* If the next character is '+', we have a possessive quantifier. This
3486     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3487     If the next character is '?' this is a minimizing repeat, by default,
3488     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3489     repeat type to the non-default. */
3490
3491     if (ptr[1] == '+')
3492       {
3493       repeat_type = 0;                  /* Force greedy */
3494       possessive_quantifier = TRUE;
3495       ptr++;
3496       }
3497     else if (ptr[1] == '?')
3498       {
3499       repeat_type = greedy_non_default;
3500       ptr++;
3501       }
3502     else repeat_type = greedy_default;
3503
3504     /* If previous was a character match, abolish the item and generate a
3505     repeat item instead. If a char item has a minimum of more than one, ensure
3506     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3507     the first thing in a branch because the x will have gone into firstbyte
3508     instead.  */
3509
3510     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3511       {
3512       /* Deal with UTF-8 characters that take up more than one byte. It's
3513       easier to write this out separately than try to macrify it. Use c to
3514       hold the length of the character in bytes, plus 0x80 to flag that it's a
3515       length rather than a small character. */
3516
3517 #ifdef SUPPORT_UTF8
3518       if (utf8 && (code[-1] & 0x80) != 0)
3519         {
3520         uschar *lastchar = code - 1;
3521         while((*lastchar & 0xc0) == 0x80) lastchar--;
3522         c = code - lastchar;            /* Length of UTF-8 character */
3523         memcpy(utf8_char, lastchar, c); /* Save the char */
3524         c |= 0x80;                      /* Flag c as a length */
3525         }
3526       else
3527 #endif
3528
3529       /* Handle the case of a single byte - either with no UTF8 support, or
3530       with UTF-8 disabled, or for a UTF-8 character < 128. */
3531
3532         {
3533         c = code[-1];
3534         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3535         }
3536
3537       /* If the repetition is unlimited, it pays to see if the next thing on
3538       the line is something that cannot possibly match this character. If so,
3539       automatically possessifying this item gains some performance in the case
3540       where the match fails. */
3541
3542       if (!possessive_quantifier &&
3543           repeat_max < 0 &&
3544           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3545             options, cd))
3546         {
3547         repeat_type = 0;    /* Force greedy */
3548         possessive_quantifier = TRUE;
3549         }
3550
3551       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3552       }
3553
3554     /* If previous was a single negated character ([^a] or similar), we use
3555     one of the special opcodes, replacing it. The code is shared with single-
3556     character repeats by setting op_type to add a suitable offset into
3557     repeat_type. We can also test for auto-possessification. OP_NOT is
3558     currently used only for single-byte chars. */
3559
3560     else if (*previous == OP_NOT)
3561       {
3562       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3563       c = previous[1];
3564       if (!possessive_quantifier &&
3565           repeat_max < 0 &&
3566           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3567         {
3568         repeat_type = 0;    /* Force greedy */
3569         possessive_quantifier = TRUE;
3570         }
3571       goto OUTPUT_SINGLE_REPEAT;
3572       }
3573
3574     /* If previous was a character type match (\d or similar), abolish it and
3575     create a suitable repeat item. The code is shared with single-character
3576     repeats by setting op_type to add a suitable offset into repeat_type. Note
3577     that the Unicode property types will be present only when SUPPORT_UCP is
3578     defined, but we don't wrap the little bits of code here because it just
3579     makes it horribly messy. */
3580
3581     else if (*previous < OP_EODN)
3582       {
3583       uschar *oldcode;
3584       int prop_type, prop_value;
3585       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3586       c = *previous;
3587
3588       if (!possessive_quantifier &&
3589           repeat_max < 0 &&
3590           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3591         {
3592         repeat_type = 0;    /* Force greedy */
3593         possessive_quantifier = TRUE;
3594         }
3595
3596       OUTPUT_SINGLE_REPEAT:
3597       if (*previous == OP_PROP || *previous == OP_NOTPROP)
3598         {
3599         prop_type = previous[1];
3600         prop_value = previous[2];
3601         }
3602       else prop_type = prop_value = -1;
3603
3604       oldcode = code;
3605       code = previous;                  /* Usually overwrite previous item */
3606
3607       /* If the maximum is zero then the minimum must also be zero; Perl allows
3608       this case, so we do too - by simply omitting the item altogether. */
3609
3610       if (repeat_max == 0) goto END_REPEAT;
3611
3612       /* All real repeats make it impossible to handle partial matching (maybe
3613       one day we will be able to remove this restriction). */
3614
3615       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3616
3617       /* Combine the op_type with the repeat_type */
3618
3619       repeat_type += op_type;
3620
3621       /* A minimum of zero is handled either as the special case * or ?, or as
3622       an UPTO, with the maximum given. */
3623
3624       if (repeat_min == 0)
3625         {
3626         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3627           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3628         else
3629           {
3630           *code++ = OP_UPTO + repeat_type;
3631           PUT2INC(code, 0, repeat_max);
3632           }
3633         }
3634
3635       /* A repeat minimum of 1 is optimized into some special cases. If the
3636       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3637       left in place and, if the maximum is greater than 1, we use OP_UPTO with
3638       one less than the maximum. */
3639
3640       else if (repeat_min == 1)
3641         {
3642         if (repeat_max == -1)
3643           *code++ = OP_PLUS + repeat_type;
3644         else
3645           {
3646           code = oldcode;                 /* leave previous item in place */
3647           if (repeat_max == 1) goto END_REPEAT;
3648           *code++ = OP_UPTO + repeat_type;
3649           PUT2INC(code, 0, repeat_max - 1);
3650           }
3651         }
3652
3653       /* The case {n,n} is just an EXACT, while the general case {n,m} is
3654       handled as an EXACT followed by an UPTO. */
3655
3656       else
3657         {
3658         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3659         PUT2INC(code, 0, repeat_min);
3660
3661         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3662         we have to insert the character for the previous code. For a repeated
3663         Unicode property match, there are two extra bytes that define the
3664         required property. In UTF-8 mode, long characters have their length in
3665         c, with the 0x80 bit as a flag. */
3666
3667         if (repeat_max < 0)
3668           {
3669 #ifdef SUPPORT_UTF8
3670           if (utf8 && c >= 128)
3671             {
3672             memcpy(code, utf8_char, c & 7);
3673             code += c & 7;
3674             }
3675           else
3676 #endif
3677             {
3678             *code++ = c;
3679             if (prop_type >= 0)
3680               {
3681               *code++ = prop_type;
3682               *code++ = prop_value;
3683               }
3684             }
3685           *code++ = OP_STAR + repeat_type;
3686           }
3687
3688         /* Else insert an UPTO if the max is greater than the min, again
3689         preceded by the character, for the previously inserted code. If the
3690         UPTO is just for 1 instance, we can use QUERY instead. */
3691
3692         else if (repeat_max != repeat_min)
3693           {
3694 #ifdef SUPPORT_UTF8
3695           if (utf8 && c >= 128)
3696             {
3697             memcpy(code, utf8_char, c & 7);
3698             code += c & 7;
3699             }
3700           else
3701 #endif
3702           *code++ = c;
3703           if (prop_type >= 0)
3704             {
3705             *code++ = prop_type;
3706             *code++ = prop_value;
3707             }
3708           repeat_max -= repeat_min;
3709
3710           if (repeat_max == 1)
3711             {
3712             *code++ = OP_QUERY + repeat_type;
3713             }
3714           else
3715             {
3716             *code++ = OP_UPTO + repeat_type;
3717             PUT2INC(code, 0, repeat_max);
3718             }
3719           }
3720         }
3721
3722       /* The character or character type itself comes last in all cases. */
3723
3724 #ifdef SUPPORT_UTF8
3725       if (utf8 && c >= 128)
3726         {
3727         memcpy(code, utf8_char, c & 7);
3728         code += c & 7;
3729         }
3730       else
3731 #endif
3732       *code++ = c;
3733
3734       /* For a repeated Unicode property match, there are two extra bytes that
3735       define the required property. */
3736
3737 #ifdef SUPPORT_UCP
3738       if (prop_type >= 0)
3739         {
3740         *code++ = prop_type;
3741         *code++ = prop_value;
3742         }
3743 #endif
3744       }
3745
3746     /* If previous was a character class or a back reference, we put the repeat
3747     stuff after it, but just skip the item if the repeat was {0,0}. */
3748
3749     else if (*previous == OP_CLASS ||
3750              *previous == OP_NCLASS ||
3751 #ifdef SUPPORT_UTF8
3752              *previous == OP_XCLASS ||
3753 #endif
3754              *previous == OP_REF)
3755       {
3756       if (repeat_max == 0)
3757         {
3758         code = previous;
3759         goto END_REPEAT;
3760         }
3761
3762       /* All real repeats make it impossible to handle partial matching (maybe
3763       one day we will be able to remove this restriction). */
3764
3765       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3766
3767       if (repeat_min == 0 && repeat_max == -1)
3768         *code++ = OP_CRSTAR + repeat_type;
3769       else if (repeat_min == 1 && repeat_max == -1)
3770         *code++ = OP_CRPLUS + repeat_type;
3771       else if (repeat_min == 0 && repeat_max == 1)
3772         *code++ = OP_CRQUERY + repeat_type;
3773       else
3774         {
3775         *code++ = OP_CRRANGE + repeat_type;
3776         PUT2INC(code, 0, repeat_min);
3777         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3778         PUT2INC(code, 0, repeat_max);
3779         }
3780       }
3781
3782     /* If previous was a bracket group, we may have to replicate it in certain
3783     cases. */
3784
3785     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3786              *previous == OP_ONCE || *previous == OP_COND)
3787       {
3788       register int i;
3789       int ketoffset = 0;
3790       int len = code - previous;
3791       uschar *bralink = NULL;
3792
3793       /* Repeating a DEFINE group is pointless */
3794
3795       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3796         {
3797         *errorcodeptr = ERR55;
3798         goto FAILED;
3799         }
3800
3801       /* If the maximum repeat count is unlimited, find the end of the bracket
3802       by scanning through from the start, and compute the offset back to it
3803       from the current code pointer. There may be an OP_OPT setting following
3804       the final KET, so we can't find the end just by going back from the code
3805       pointer. */
3806
3807       if (repeat_max == -1)
3808         {
3809         register uschar *ket = previous;
3810         do ket += GET(ket, 1); while (*ket != OP_KET);
3811         ketoffset = code - ket;
3812         }
3813
3814       /* The case of a zero minimum is special because of the need to stick
3815       OP_BRAZERO in front of it, and because the group appears once in the
3816       data, whereas in other cases it appears the minimum number of times. For
3817       this reason, it is simplest to treat this case separately, as otherwise
3818       the code gets far too messy. There are several special subcases when the
3819       minimum is zero. */
3820
3821       if (repeat_min == 0)
3822         {
3823         /* If the maximum is also zero, we just omit the group from the output
3824         altogether. */
3825
3826         if (repeat_max == 0)
3827           {
3828           code = previous;
3829           goto END_REPEAT;
3830           }
3831
3832         /* If the maximum is 1 or unlimited, we just have to stick in the
3833         BRAZERO and do no more at this point. However, we do need to adjust
3834         any OP_RECURSE calls inside the group that refer to the group itself or
3835         any internal or forward referenced group, because the offset is from
3836         the start of the whole regex. Temporarily terminate the pattern while
3837         doing this. */
3838
3839         if (repeat_max <= 1)
3840           {
3841           *code = OP_END;
3842           adjust_recurse(previous, 1, utf8, cd, save_hwm);
3843           memmove(previous+1, previous, len);
3844           code++;
3845           *previous++ = OP_BRAZERO + repeat_type;
3846           }
3847
3848         /* If the maximum is greater than 1 and limited, we have to replicate
3849         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3850         The first one has to be handled carefully because it's the original
3851         copy, which has to be moved up. The remainder can be handled by code
3852         that is common with the non-zero minimum case below. We have to
3853           adjust the value of repeat_max, since one less copy is required. Once
3854         again, we may have to adjust any OP_RECURSE calls inside the group. */
3855
3856         else
3857           {
3858           int offset;
3859           *code = OP_END;
3860           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3861           memmove(previous + 2 + LINK_SIZE, previous, len);
3862           code += 2 + LINK_SIZE;
3863           *previous++ = OP_BRAZERO + repeat_type;
3864           *previous++ = OP_BRA;
3865
3866           /* We chain together the bracket offset fields that have to be
3867           filled in later when the ends of the brackets are reached. */
3868
3869           offset = (bralink == NULL)? 0 : previous - bralink;
3870           bralink = previous;
3871           PUTINC(previous, 0, offset);
3872           }
3873
3874         repeat_max--;
3875         }
3876
3877       /* If the minimum is greater than zero, replicate the group as many
3878       times as necessary, and adjust the maximum to the number of subsequent
3879       copies that we need. If we set a first char from the group, and didn't
3880       set a required char, copy the latter from the former. If there are any
3881       forward reference subroutine calls in the group, there will be entries on
3882       the workspace list; replicate these with an appropriate increment. */
3883
3884       else
3885         {
3886         if (repeat_min > 1)
3887           {
3888           /* In the pre-compile phase, we don't actually do the replication. We
3889           just adjust the length as if we had. Do some paranoid checks for
3890           potential integer overflow. */
3891
3892           if (lengthptr != NULL)
3893             {
3894             int delta = (repeat_min - 1)*length_prevgroup;
3895             if ((double)(repeat_min - 1)*(double)length_prevgroup >
3896                                                             (double)INT_MAX ||
3897                 OFLOW_MAX - *lengthptr < delta)
3898               {
3899               *errorcodeptr = ERR20;
3900               goto FAILED;
3901               }
3902             *lengthptr += delta;
3903             }
3904
3905           /* This is compiling for real */
3906
3907           else
3908             {
3909             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3910             for (i = 1; i < repeat_min; i++)
3911               {
3912               uschar *hc;
3913               uschar *this_hwm = cd->hwm;
3914               memcpy(code, previous, len);
3915               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3916                 {
3917                 PUT(cd->hwm, 0, GET(hc, 0) + len);
3918                 cd->hwm += LINK_SIZE;
3919                 }
3920               save_hwm = this_hwm;
3921               code += len;
3922               }
3923             }
3924           }
3925
3926         if (repeat_max > 0) repeat_max -= repeat_min;
3927         }
3928
3929       /* This code is common to both the zero and non-zero minimum cases. If
3930       the maximum is limited, it replicates the group in a nested fashion,
3931       remembering the bracket starts on a stack. In the case of a zero minimum,
3932       the first one was set up above. In all cases the repeat_max now specifies
3933       the number of additional copies needed. Again, we must remember to
3934       replicate entries on the forward reference list. */
3935
3936       if (repeat_max >= 0)
3937         {
3938         /* In the pre-compile phase, we don't actually do the replication. We
3939         just adjust the length as if we had. For each repetition we must add 1
3940         to the length for BRAZERO and for all but the last repetition we must
3941         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3942         paranoid checks to avoid integer overflow. */
3943
3944         if (lengthptr != NULL && repeat_max > 0)
3945           {
3946           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3947                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3948           if ((double)repeat_max *
3949                 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3950                   > (double)INT_MAX ||
3951               OFLOW_MAX - *lengthptr < delta)
3952             {
3953             *errorcodeptr = ERR20;
3954             goto FAILED;
3955             }
3956           *lengthptr += delta;
3957           }
3958
3959         /* This is compiling for real */
3960
3961         else for (i = repeat_max - 1; i >= 0; i--)
3962           {
3963           uschar *hc;
3964           uschar *this_hwm = cd->hwm;
3965
3966           *code++ = OP_BRAZERO + repeat_type;
3967
3968           /* All but the final copy start a new nesting, maintaining the
3969           chain of brackets outstanding. */
3970
3971           if (i != 0)
3972             {
3973             int offset;
3974             *code++ = OP_BRA;
3975             offset = (bralink == NULL)? 0 : code - bralink;
3976             bralink = code;
3977             PUTINC(code, 0, offset);
3978             }
3979
3980           memcpy(code, previous, len);
3981           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3982             {
3983             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3984             cd->hwm += LINK_SIZE;
3985             }
3986           save_hwm = this_hwm;
3987           code += len;
3988           }
3989
3990         /* Now chain through the pending brackets, and fill in their length
3991         fields (which are holding the chain links pro tem). */
3992
3993         while (bralink != NULL)
3994           {
3995           int oldlinkoffset;
3996           int offset = code - bralink + 1;
3997           uschar *bra = code - offset;
3998           oldlinkoffset = GET(bra, 1);
3999           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4000           *code++ = OP_KET;
4001           PUTINC(code, 0, offset);
4002           PUT(bra, 1, offset);
4003           }
4004         }
4005
4006       /* If the maximum is unlimited, set a repeater in the final copy. We
4007       can't just offset backwards from the current code point, because we
4008       don't know if there's been an options resetting after the ket. The
4009       correct offset was computed above.
4010
4011       Then, when we are doing the actual compile phase, check to see whether
4012       this group is a non-atomic one that could match an empty string. If so,
4013       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4014       that runtime checking can be done. [This check is also applied to
4015       atomic groups at runtime, but in a different way.] */
4016
4017       else
4018         {
4019         uschar *ketcode = code - ketoffset;
4020         uschar *bracode = ketcode - GET(ketcode, 1);
4021         *ketcode = OP_KETRMAX + repeat_type;
4022         if (lengthptr == NULL && *bracode != OP_ONCE)
4023           {
4024           uschar *scode = bracode;
4025           do
4026             {
4027             if (could_be_empty_branch(scode, ketcode, utf8))
4028               {
4029               *bracode += OP_SBRA - OP_BRA;
4030               break;
4031               }
4032             scode += GET(scode, 1);
4033             }
4034           while (*scode == OP_ALT);
4035           }
4036         }
4037       }
4038
4039     /* Else there's some kind of shambles */
4040
4041     else
4042       {
4043       *errorcodeptr = ERR11;
4044       goto FAILED;
4045       }
4046
4047     /* If the character following a repeat is '+', or if certain optimization
4048     tests above succeeded, possessive_quantifier is TRUE. For some of the
4049     simpler opcodes, there is a special alternative opcode for this. For
4050     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4051     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4052     but the special opcodes can optimize it a bit. The repeated item starts at
4053     tempcode, not at previous, which might be the first part of a string whose
4054     (former) last char we repeated.
4055
4056     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4057     an 'upto' may follow. We skip over an 'exact' item, and then test the
4058     length of what remains before proceeding. */
4059
4060     if (possessive_quantifier)
4061       {
4062       int len;
4063       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4064           *tempcode == OP_NOTEXACT)
4065         tempcode += _pcre_OP_lengths[*tempcode] +
4066           ((*tempcode == OP_TYPEEXACT &&
4067              (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4068       len = code - tempcode;
4069       if (len > 0) switch (*tempcode)
4070         {
4071         case OP_STAR:  *tempcode = OP_POSSTAR; break;
4072         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4073         case OP_QUERY: *tempcode = OP_POSQUERY; break;
4074         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4075
4076         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4077         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4078         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4079         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4080
4081         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4082         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4083         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4084         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4085
4086         default:
4087         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4088         code += 1 + LINK_SIZE;
4089         len += 1 + LINK_SIZE;
4090         tempcode[0] = OP_ONCE;
4091         *code++ = OP_KET;
4092         PUTINC(code, 0, len);
4093         PUT(tempcode, 1, len);
4094         break;
4095         }
4096       }
4097
4098     /* In all cases we no longer have a previous item. We also set the
4099     "follows varying string" flag for subsequently encountered reqbytes if
4100     it isn't already set and we have just passed a varying length item. */
4101
4102     END_REPEAT:
4103     previous = NULL;
4104     cd->req_varyopt |= reqvary;
4105     break;
4106
4107
4108     /* ===================================================================*/
4109     /* Start of nested parenthesized sub-expression, or comment or lookahead or
4110     lookbehind or option setting or condition or all the other extended
4111     parenthesis forms.  */
4112
4113     case '(':
4114     newoptions = options;
4115     skipbytes = 0;
4116     bravalue = OP_CBRA;
4117     save_hwm = cd->hwm;
4118     reset_bracount = FALSE;
4119
4120     /* First deal with various "verbs" that can be introduced by '*'. */
4121
4122     if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4123       {
4124       int i, namelen;
4125       const char *vn = verbnames;
4126       const uschar *name = ++ptr;
4127       previous = NULL;
4128       while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4129       if (*ptr == ':')
4130         {
4131         *errorcodeptr = ERR59;   /* Not supported */
4132         goto FAILED;
4133         }
4134       if (*ptr != ')')
4135         {
4136         *errorcodeptr = ERR60;
4137         goto FAILED;
4138         }
4139       namelen = ptr - name;
4140       for (i = 0; i < verbcount; i++)
4141         {
4142         if (namelen == verbs[i].len &&
4143             strncmp((char *)name, vn, namelen) == 0)
4144           {
4145           *code = verbs[i].op;
4146           if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4147           break;
4148           }
4149         vn += verbs[i].len + 1;
4150         }
4151       if (i < verbcount) continue;
4152       *errorcodeptr = ERR60;
4153       goto FAILED;
4154       }
4155
4156     /* Deal with the extended parentheses; all are introduced by '?', and the
4157     appearance of any of them means that this is not a capturing group. */
4158
4159     else if (*ptr == '?')
4160       {
4161       int i, set, unset, namelen;
4162       int *optset;
4163       const uschar *name;
4164       uschar *slot;
4165
4166       switch (*(++ptr))
4167         {
4168         case '#':                 /* Comment; skip to ket */
4169         ptr++;
4170         while (*ptr != 0 && *ptr != ')') ptr++;
4171         if (*ptr == 0)
4172           {
4173           *errorcodeptr = ERR18;
4174           goto FAILED;
4175           }
4176         continue;
4177
4178
4179         /* ------------------------------------------------------------ */
4180         case '|':                 /* Reset capture count for each branch */
4181         reset_bracount = TRUE;
4182         /* Fall through */
4183
4184         /* ------------------------------------------------------------ */
4185         case ':':                 /* Non-capturing bracket */
4186         bravalue = OP_BRA;
4187         ptr++;
4188         break;
4189
4190
4191         /* ------------------------------------------------------------ */
4192         case '(':
4193         bravalue = OP_COND;       /* Conditional group */
4194
4195         /* A condition can be an assertion, a number (referring to a numbered
4196         group), a name (referring to a named group), or 'R', referring to
4197         recursion. R<digits> and R&name are also permitted for recursion tests.
4198
4199         There are several syntaxes for testing a named group: (?(name)) is used
4200         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4201
4202         There are two unfortunate ambiguities, caused by history. (a) 'R' can
4203         be the recursive thing or the name 'R' (and similarly for 'R' followed
4204         by digits), and (b) a number could be a name that consists of digits.
4205         In both cases, we look for a name first; if not found, we try the other
4206         cases. */
4207
4208         /* For conditions that are assertions, check the syntax, and then exit
4209         the switch. This will take control down to where bracketed groups,
4210         including assertions, are processed. */
4211
4212         if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4213           break;
4214
4215         /* Most other conditions use OP_CREF (a couple change to OP_RREF
4216         below), and all need to skip 3 bytes at the start of the group. */
4217
4218         code[1+LINK_SIZE] = OP_CREF;
4219         skipbytes = 3;
4220         refsign = -1;
4221
4222         /* Check for a test for recursion in a named group. */
4223
4224         if (ptr[1] == 'R' && ptr[2] == '&')
4225           {
4226           terminator = -1;
4227           ptr += 2;
4228           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4229           }
4230
4231         /* Check for a test for a named group's having been set, using the Perl
4232         syntax (?(<name>) or (?('name') */
4233
4234         else if (ptr[1] == '<')
4235           {
4236           terminator = '>';
4237           ptr++;
4238           }
4239         else if (ptr[1] == '\'')
4240           {
4241           terminator = '\'';
4242           ptr++;
4243           }
4244         else
4245           {
4246           terminator = 0;
4247           if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4248           }
4249
4250         /* We now expect to read a name; anything else is an error */
4251
4252         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4253           {
4254           ptr += 1;  /* To get the right offset */
4255           *errorcodeptr = ERR28;
4256           goto FAILED;
4257           }
4258
4259         /* Read the name, but also get it as a number if it's all digits */
4260
4261         recno = 0;
4262         name = ++ptr;
4263         while ((cd->ctypes[*ptr] & ctype_word) != 0)
4264           {
4265           if (recno >= 0)
4266             recno = ((digitab[*ptr] & ctype_digit) != 0)?
4267               recno * 10 + *ptr - '0' : -1;
4268           ptr++;
4269           }
4270         namelen = ptr - name;
4271
4272         if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4273           {
4274           ptr--;      /* Error offset */
4275           *errorcodeptr = ERR26;
4276           goto FAILED;
4277           }
4278
4279         /* Do no further checking in the pre-compile phase. */
4280
4281         if (lengthptr != NULL) break;
4282
4283         /* In the real compile we do the work of looking for the actual
4284         reference. If the string started with "+" or "-" we require the rest to
4285         be digits, in which case recno will be set. */
4286
4287         if (refsign > 0)
4288           {
4289           if (recno <= 0)
4290             {
4291             *errorcodeptr = ERR58;
4292             goto FAILED;
4293             }
4294           recno = (refsign == '-')?
4295             cd->bracount - recno + 1 : recno +cd->bracount;
4296           if (recno <= 0 || recno > cd->final_bracount)
4297             {
4298             *errorcodeptr = ERR15;
4299             goto FAILED;
4300             }
4301           PUT2(code, 2+LINK_SIZE, recno);
4302           break;
4303           }
4304
4305         /* Otherwise (did not start with "+" or "-"), start by looking for the
4306         name. */
4307
4308         slot = cd->name_table;
4309         for (i = 0; i < cd->names_found; i++)
4310           {
4311           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4312           slot += cd->name_entry_size;
4313           }
4314
4315         /* Found a previous named subpattern */
4316
4317         if (i < cd->names_found)
4318           {
4319           recno = GET2(slot, 0);
4320           PUT2(code, 2+LINK_SIZE, recno);
4321           }
4322
4323         /* Search the pattern for a forward reference */
4324
4325         else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4326                         (options & PCRE_EXTENDED) != 0)) > 0)
4327           {
4328           PUT2(code, 2+LINK_SIZE, i);
4329           }
4330
4331         /* If terminator == 0 it means that the name followed directly after
4332         the opening parenthesis [e.g. (?(abc)...] and in this case there are
4333         some further alternatives to try. For the cases where terminator != 0
4334         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4335         now checked all the possibilities, so give an error. */
4336
4337         else if (terminator != 0)
4338           {
4339           *errorcodeptr = ERR15;
4340           goto FAILED;
4341           }
4342
4343         /* Check for (?(R) for recursion. Allow digits after R to specify a
4344         specific group number. */
4345
4346         else if (*name == 'R')
4347           {
4348           recno = 0;
4349           for (i = 1; i < namelen; i++)
4350             {
4351             if ((digitab[name[i]] & ctype_digit) == 0)
4352               {
4353               *errorcodeptr = ERR15;
4354               goto FAILED;
4355               }
4356             recno = recno * 10 + name[i] - '0';
4357             }
4358           if (recno == 0) recno = RREF_ANY;
4359           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4360           PUT2(code, 2+LINK_SIZE, recno);
4361           }
4362
4363         /* Similarly, check for the (?(DEFINE) "condition", which is always
4364         false. */
4365
4366         else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4367           {
4368           code[1+LINK_SIZE] = OP_DEF;
4369           skipbytes = 1;
4370           }
4371
4372         /* Check for the "name" actually being a subpattern number. We are
4373         in the second pass here, so final_bracount is set. */
4374
4375         else if (recno > 0 && recno <= cd->final_bracount)
4376           {
4377           PUT2(code, 2+LINK_SIZE, recno);
4378           }
4379
4380         /* Either an unidentified subpattern, or a reference to (?(0) */
4381
4382         else
4383           {
4384           *errorcodeptr = (recno == 0)? ERR35: ERR15;
4385           goto FAILED;
4386           }
4387         break;
4388
4389
4390         /* ------------------------------------------------------------ */
4391         case '=':                 /* Positive lookahead */
4392         bravalue = OP_ASSERT;
4393         ptr++;
4394         break;
4395
4396
4397         /* ------------------------------------------------------------ */
4398         case '!':                 /* Negative lookahead */
4399         ptr++;
4400         if (*ptr == ')')          /* Optimize (?!) */
4401           {
4402           *code++ = OP_FAIL;
4403           previous = NULL;
4404           continue;
4405           }
4406         bravalue = OP_ASSERT_NOT;
4407         break;
4408
4409
4410         /* ------------------------------------------------------------ */
4411         case '<':                 /* Lookbehind or named define */
4412         switch (ptr[1])
4413           {
4414           case '=':               /* Positive lookbehind */
4415           bravalue = OP_ASSERTBACK;
4416           ptr += 2;
4417           break;
4418
4419           case '!':               /* Negative lookbehind */
4420           bravalue = OP_ASSERTBACK_NOT;
4421           ptr += 2;
4422           break;
4423
4424           default:                /* Could be name define, else bad */
4425           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4426           ptr++;                  /* Correct offset for error */
4427           *errorcodeptr = ERR24;
4428           goto FAILED;
4429           }
4430         break;
4431
4432
4433         /* ------------------------------------------------------------ */
4434         case '>':                 /* One-time brackets */
4435         bravalue = OP_ONCE;
4436         ptr++;
4437         break;
4438
4439
4440         /* ------------------------------------------------------------ */
4441         case 'C':                 /* Callout - may be followed by digits; */
4442         previous_callout = code;  /* Save for later completion */
4443         after_manual_callout = 1; /* Skip one item before completing */
4444         *code++ = OP_CALLOUT;
4445           {
4446           int n = 0;
4447           while ((digitab[*(++ptr)] & ctype_digit) != 0)
4448             n = n * 10 + *ptr - '0';
4449           if (*ptr != ')')
4450             {
4451             *errorcodeptr = ERR39;
4452             goto FAILED;
4453             }
4454           if (n > 255)
4455             {
4456             *errorcodeptr = ERR38;
4457             goto FAILED;
4458             }
4459           *code++ = n;
4460           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4461           PUT(code, LINK_SIZE, 0);                    /* Default length */
4462           code += 2 * LINK_SIZE;
4463           }
4464         previous = NULL;
4465         continue;
4466
4467
4468         /* ------------------------------------------------------------ */
4469         case 'P':                 /* Python-style named subpattern handling */
4470         if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4471           {
4472           is_recurse = *ptr == '>';
4473           terminator = ')';
4474           goto NAMED_REF_OR_RECURSE;
4475           }
4476         else if (*ptr != '<')    /* Test for Python-style definition */
4477           {
4478           *errorcodeptr = ERR41;
4479           goto FAILED;
4480           }
4481         /* Fall through to handle (?P< as (?< is handled */
4482
4483
4484         /* ------------------------------------------------------------ */
4485         DEFINE_NAME:    /* Come here from (?< handling */
4486         case '\'':
4487           {
4488           terminator = (*ptr == '<')? '>' : '\'';
4489           name = ++ptr;
4490
4491           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4492           namelen = ptr - name;
4493
4494           /* In the pre-compile phase, just do a syntax check. */
4495
4496           if (lengthptr != NULL)
4497             {
4498             if (*ptr != terminator)
4499               {
4500               *errorcodeptr = ERR42;
4501               goto FAILED;
4502               }
4503             if (cd->names_found >= MAX_NAME_COUNT)
4504               {
4505               *errorcodeptr = ERR49;
4506               goto FAILED;
4507               }
4508             if (namelen + 3 > cd->name_entry_size)
4509               {
4510               cd->name_entry_size = namelen + 3;
4511               if (namelen > MAX_NAME_SIZE)
4512                 {
4513                 *errorcodeptr = ERR48;
4514                 goto FAILED;
4515                 }
4516               }
4517             }
4518
4519           /* In the real compile, create the entry in the table */
4520
4521           else
4522             {
4523             slot = cd->name_table;
4524             for (i = 0; i < cd->names_found; i++)
4525               {
4526               int crc = memcmp(name, slot+2, namelen);
4527               if (crc == 0)
4528                 {
4529                 if (slot[2+namelen] == 0)
4530                   {
4531                   if ((options & PCRE_DUPNAMES) == 0)
4532                     {
4533                     *errorcodeptr = ERR43;
4534                     goto FAILED;
4535                     }
4536                   }
4537                 else crc = -1;      /* Current name is substring */
4538                 }
4539               if (crc < 0)
4540                 {
4541                 memmove(slot + cd->name_entry_size, slot,
4542                   (cd->names_found - i) * cd->name_entry_size);
4543                 break;
4544                 }
4545               slot += cd->name_entry_size;
4546               }
4547
4548             PUT2(slot, 0, cd->bracount + 1);
4549             memcpy(slot + 2, name, namelen);
4550             slot[2+namelen] = 0;
4551             }
4552           }
4553
4554         /* In both cases, count the number of names we've encountered. */
4555
4556         ptr++;                    /* Move past > or ' */
4557         cd->names_found++;
4558         goto NUMBERED_GROUP;
4559
4560
4561         /* ------------------------------------------------------------ */
4562         case '&':                 /* Perl recursion/subroutine syntax */
4563         terminator = ')';
4564         is_recurse = TRUE;
4565         /* Fall through */
4566
4567         /* We come here from the Python syntax above that handles both
4568         references (?P=name) and recursion (?P>name), as well as falling
4569         through from the Perl recursion syntax (?&name). We also come here from
4570         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4571         .NET syntax. */
4572
4573         NAMED_REF_OR_RECURSE:
4574         name = ++ptr;
4575         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4576         namelen = ptr - name;
4577
4578         /* In the pre-compile phase, do a syntax check and set a dummy
4579         reference number. */
4580
4581         if (lengthptr != NULL)
4582           {
4583           if (namelen == 0)
4584             {
4585             *errorcodeptr = ERR62;
4586             goto FAILED;
4587             }
4588           if (*ptr != terminator)
4589             {
4590             *errorcodeptr = ERR42;
4591             goto FAILED;
4592             }
4593           if (namelen > MAX_NAME_SIZE)
4594             {
4595             *errorcodeptr = ERR48;
4596             goto FAILED;
4597             }
4598           recno = 0;
4599           }
4600
4601         /* In the real compile, seek the name in the table. We check the name
4602         first, and then check that we have reached the end of the name in the
4603         table. That way, if the name is longer than any in the table,
4604         the comparison will fail without reading beyond the table entry. */
4605
4606         else
4607           {
4608           slot = cd->name_table;
4609           for (i = 0; i < cd->names_found; i++)
4610             {
4611             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4612                 slot[2+namelen] == 0)
4613               break;
4614             slot += cd->name_entry_size;
4615             }
4616
4617           if (i < cd->names_found)         /* Back reference */
4618             {
4619             recno = GET2(slot, 0);
4620             }
4621           else if ((recno =                /* Forward back reference */
4622                     find_parens(ptr, cd->bracount, name, namelen,
4623                       (options & PCRE_EXTENDED) != 0)) <= 0)
4624             {
4625             *errorcodeptr = ERR15;
4626             goto FAILED;
4627             }
4628           }
4629
4630         /* In both phases, we can now go to the code that handles numerical
4631         recursion or backreferences. */
4632
4633         if (is_recurse) goto HANDLE_RECURSION;
4634           else goto HANDLE_REFERENCE;
4635
4636
4637         /* ------------------------------------------------------------ */
4638         case 'R':                 /* Recursion */
4639         ptr++;                    /* Same as (?0)      */
4640         /* Fall through */
4641
4642
4643         /* ------------------------------------------------------------ */
4644         case '-': case '+':
4645         case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4646         case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4647           {
4648           const uschar *called;
4649
4650           if ((refsign = *ptr) == '+')
4651             {
4652             ptr++;
4653             if ((digitab[*ptr] & ctype_digit) == 0)
4654               {
4655               *errorcodeptr = ERR63;
4656               goto FAILED;
4657               }
4658             }
4659           else if (refsign == '-')
4660             {
4661             if ((digitab[ptr[1]] & ctype_digit) == 0)
4662               goto OTHER_CHAR_AFTER_QUERY;
4663             ptr++;
4664             }
4665
4666           recno = 0;
4667           while((digitab[*ptr] & ctype_digit) != 0)
4668             recno = recno * 10 + *ptr++ - '0';
4669
4670           if (*ptr != ')')
4671             {
4672             *errorcodeptr = ERR29;
4673             goto FAILED;
4674             }
4675
4676           if (refsign == '-')
4677             {
4678             if (recno == 0)
4679               {
4680               *errorcodeptr = ERR58;
4681               goto FAILED;
4682               }
4683             recno = cd->bracount - recno + 1;
4684             if (recno <= 0)
4685               {
4686               *errorcodeptr = ERR15;
4687               goto FAILED;
4688               }
4689             }
4690           else if (refsign == '+')
4691             {
4692             if (recno == 0)
4693               {
4694               *errorcodeptr = ERR58;
4695               goto FAILED;
4696               }
4697             recno += cd->bracount;
4698             }
4699
4700           /* Come here from code above that handles a named recursion */
4701
4702           HANDLE_RECURSION:
4703
4704           previous = code;
4705           called = cd->start_code;
4706
4707           /* When we are actually compiling, find the bracket that is being
4708           referenced. Temporarily end the regex in case it doesn't exist before
4709           this point. If we end up with a forward reference, first check that
4710           the bracket does occur later so we can give the error (and position)
4711           now. Then remember this forward reference in the workspace so it can
4712           be filled in at the end. */
4713
4714           if (lengthptr == NULL)
4715             {
4716             *code = OP_END;
4717             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4718
4719             /* Forward reference */
4720
4721             if (called == NULL)
4722               {
4723               if (find_parens(ptr, cd->bracount, NULL, recno,
4724                    (options & PCRE_EXTENDED) != 0) < 0)
4725                 {
4726                 *errorcodeptr = ERR15;
4727                 goto FAILED;
4728                 }
4729               called = cd->start_code + recno;
4730               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4731               }
4732
4733             /* If not a forward reference, and the subpattern is still open,
4734             this is a recursive call. We check to see if this is a left
4735             recursion that could loop for ever, and diagnose that case. */
4736
4737             else if (GET(called, 1) == 0 &&
4738                      could_be_empty(called, code, bcptr, utf8))
4739               {
4740               *errorcodeptr = ERR40;
4741               goto FAILED;
4742               }
4743             }
4744
4745           /* Insert the recursion/subroutine item, automatically wrapped inside
4746           "once" brackets. Set up a "previous group" length so that a
4747           subsequent quantifier will work. */
4748
4749           *code = OP_ONCE;
4750           PUT(code, 1, 2 + 2*LINK_SIZE);
4751           code += 1 + LINK_SIZE;
4752
4753           *code = OP_RECURSE;
4754           PUT(code, 1, called - cd->start_code);
4755           code += 1 + LINK_SIZE;
4756
4757           *code = OP_KET;
4758           PUT(code, 1, 2 + 2*LINK_SIZE);
4759           code += 1 + LINK_SIZE;
4760
4761           length_prevgroup = 3 + 3*LINK_SIZE;
4762           }
4763
4764         /* Can't determine a first byte now */
4765
4766         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4767         continue;
4768
4769
4770         /* ------------------------------------------------------------ */
4771         default:              /* Other characters: check option setting */
4772         OTHER_CHAR_AFTER_QUERY:
4773         set = unset = 0;
4774         optset = &set;
4775
4776         while (*ptr != ')' && *ptr != ':')
4777           {
4778           switch (*ptr++)
4779             {
4780             case '-': optset = &unset; break;
4781
4782             case 'J':    /* Record that it changed in the external options */
4783             *optset |= PCRE_DUPNAMES;
4784             cd->external_flags |= PCRE_JCHANGED;
4785             break;
4786
4787             case 'i': *optset |= PCRE_CASELESS; break;
4788             case 'm': *optset |= PCRE_MULTILINE; break;
4789             case 's': *optset |= PCRE_DOTALL; break;
4790             case 'x': *optset |= PCRE_EXTENDED; break;
4791             case 'U': *optset |= PCRE_UNGREEDY; break;
4792             case 'X': *optset |= PCRE_EXTRA; break;
4793
4794             default:  *errorcodeptr = ERR12;
4795                       ptr--;    /* Correct the offset */
4796                       goto FAILED;
4797             }
4798           }
4799
4800         /* Set up the changed option bits, but don't change anything yet. */
4801
4802         newoptions = (options | set) & (~unset);
4803
4804         /* If the options ended with ')' this is not the start of a nested
4805         group with option changes, so the options change at this level. If this
4806         item is right at the start of the pattern, the options can be
4807         abstracted and made external in the pre-compile phase, and ignored in
4808         the compile phase. This can be helpful when matching -- for instance in
4809         caseless checking of required bytes.
4810
4811         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4812         definitely *not* at the start of the pattern because something has been
4813         compiled. In the pre-compile phase, however, the code pointer can have
4814         that value after the start, because it gets reset as code is discarded
4815         during the pre-compile. However, this can happen only at top level - if
4816         we are within parentheses, the starting BRA will still be present. At
4817         any parenthesis level, the length value can be used to test if anything
4818         has been compiled at that level. Thus, a test for both these conditions
4819         is necessary to ensure we correctly detect the start of the pattern in
4820         both phases.
4821
4822         If we are not at the pattern start, compile code to change the ims
4823         options if this setting actually changes any of them. We also pass the
4824         new setting back so that it can be put at the start of any following
4825         branches, and when this group ends (if we are in a group), a resetting
4826         item can be compiled. */
4827
4828         if (*ptr == ')')
4829           {
4830           if (code == cd->start_code + 1 + LINK_SIZE &&
4831                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4832             {
4833             cd->external_options = newoptions;
4834             options = newoptions;
4835             }
4836          else
4837             {
4838             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4839               {
4840               *code++ = OP_OPT;
4841               *code++ = newoptions & PCRE_IMS;
4842               }
4843
4844             /* Change options at this level, and pass them back for use
4845             in subsequent branches. Reset the greedy defaults and the case
4846             value for firstbyte and reqbyte. */
4847
4848             *optionsptr = options = newoptions;
4849             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4850             greedy_non_default = greedy_default ^ 1;
4851             req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4852             }
4853
4854           previous = NULL;       /* This item can't be repeated */
4855           continue;              /* It is complete */
4856           }
4857
4858         /* If the options ended with ':' we are heading into a nested group
4859         with possible change of options. Such groups are non-capturing and are
4860         not assertions of any kind. All we need to do is skip over the ':';
4861         the newoptions value is handled below. */
4862
4863         bravalue = OP_BRA;
4864         ptr++;
4865         }     /* End of switch for character following (? */
4866       }       /* End of (? handling */
4867
4868     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4869     all unadorned brackets become non-capturing and behave like (?:...)
4870     brackets. */
4871
4872     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4873       {
4874       bravalue = OP_BRA;
4875       }
4876
4877     /* Else we have a capturing group. */
4878
4879     else
4880       {
4881       NUMBERED_GROUP:
4882       cd->bracount += 1;
4883       PUT2(code, 1+LINK_SIZE, cd->bracount);
4884       skipbytes = 2;
4885       }
4886
4887     /* Process nested bracketed regex. Assertions may not be repeated, but
4888     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4889     non-register variable in order to be able to pass its address because some
4890     compilers complain otherwise. Pass in a new setting for the ims options if
4891     they have changed. */
4892
4893     previous = (bravalue >= OP_ONCE)? code : NULL;
4894     *code = bravalue;
4895     tempcode = code;
4896     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4897     length_prevgroup = 0;              /* Initialize for pre-compile phase */
4898
4899     if (!compile_regex(
4900          newoptions,                   /* The complete new option state */
4901          options & PCRE_IMS,           /* The previous ims option state */
4902          &tempcode,                    /* Where to put code (updated) */
4903          &ptr,                         /* Input pointer (updated) */
4904          errorcodeptr,                 /* Where to put an error message */
4905          (bravalue == OP_ASSERTBACK ||
4906           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4907          reset_bracount,               /* True if (?| group */
4908          skipbytes,                    /* Skip over bracket number */
4909          &subfirstbyte,                /* For possible first char */
4910          &subreqbyte,                  /* For possible last char */
4911          bcptr,                        /* Current branch chain */
4912          cd,                           /* Tables block */
4913          (lengthptr == NULL)? NULL :   /* Actual compile phase */
4914            &length_prevgroup           /* Pre-compile phase */
4915          ))
4916       goto FAILED;
4917
4918     /* At the end of compiling, code is still pointing to the start of the
4919     group, while tempcode has been updated to point past the end of the group
4920     and any option resetting that may follow it. The pattern pointer (ptr)
4921     is on the bracket. */
4922
4923     /* If this is a conditional bracket, check that there are no more than
4924     two branches in the group, or just one if it's a DEFINE group. We do this
4925     in the real compile phase, not in the pre-pass, where the whole group may
4926     not be available. */
4927
4928     if (bravalue == OP_COND && lengthptr == NULL)
4929       {
4930       uschar *tc = code;
4931       int condcount = 0;
4932
4933       do {
4934          condcount++;
4935          tc += GET(tc,1);
4936          }
4937       while (*tc != OP_KET);
4938
4939       /* A DEFINE group is never obeyed inline (the "condition" is always
4940       false). It must have only one branch. */
4941
4942       if (code[LINK_SIZE+1] == OP_DEF)
4943         {
4944         if (condcount > 1)
4945           {
4946           *errorcodeptr = ERR54;
4947           goto FAILED;
4948           }
4949         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4950         }
4951
4952       /* A "normal" conditional group. If there is just one branch, we must not
4953       make use of its firstbyte or reqbyte, because this is equivalent to an
4954       empty second branch. */
4955
4956       else
4957         {
4958         if (condcount > 2)
4959           {
4960           *errorcodeptr = ERR27;
4961           goto FAILED;
4962           }
4963         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4964         }
4965       }
4966
4967     /* Error if hit end of pattern */
4968
4969     if (*ptr != ')')
4970       {
4971       *errorcodeptr = ERR14;
4972       goto FAILED;
4973       }
4974
4975     /* In the pre-compile phase, update the length by the length of the group,
4976     less the brackets at either end. Then reduce the compiled code to just a
4977     set of non-capturing brackets so that it doesn't use much memory if it is
4978     duplicated by a quantifier.*/
4979
4980     if (lengthptr != NULL)
4981       {
4982       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4983         {
4984         *errorcodeptr = ERR20;
4985         goto FAILED;
4986         }
4987       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4988       *code++ = OP_BRA;
4989       PUTINC(code, 0, 1 + LINK_SIZE);
4990       *code++ = OP_KET;
4991       PUTINC(code, 0, 1 + LINK_SIZE);
4992       break;    /* No need to waste time with special character handling */
4993       }
4994
4995     /* Otherwise update the main code pointer to the end of the group. */
4996
4997     code = tempcode;
4998
4999     /* For a DEFINE group, required and first character settings are not
5000     relevant. */
5001
5002     if (bravalue == OP_DEF) break;
5003
5004     /* Handle updating of the required and first characters for other types of
5005     group. Update for normal brackets of all kinds, and conditions with two
5006     branches (see code above). If the bracket is followed by a quantifier with
5007     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5008     zerofirstbyte outside the main loop so that they can be accessed for the
5009     back off. */
5010
5011     zeroreqbyte = reqbyte;
5012     zerofirstbyte = firstbyte;
5013     groupsetfirstbyte = FALSE;
5014
5015     if (bravalue >= OP_ONCE)
5016       {
5017       /* If we have not yet set a firstbyte in this branch, take it from the
5018       subpattern, remembering that it was set here so that a repeat of more
5019       than one can replicate it as reqbyte if necessary. If the subpattern has
5020       no firstbyte, set "none" for the whole branch. In both cases, a zero
5021       repeat forces firstbyte to "none". */
5022
5023       if (firstbyte == REQ_UNSET)
5024         {
5025         if (subfirstbyte >= 0)
5026           {
5027           firstbyte = subfirstbyte;
5028           groupsetfirstbyte = TRUE;
5029           }
5030         else firstbyte = REQ_NONE;
5031         zerofirstbyte = REQ_NONE;
5032         }
5033
5034       /* If firstbyte was previously set, convert the subpattern's firstbyte
5035       into reqbyte if there wasn't one, using the vary flag that was in
5036       existence beforehand. */
5037
5038       else if (subfirstbyte >= 0 && subreqbyte < 0)
5039         subreqbyte = subfirstbyte | tempreqvary;
5040
5041       /* If the subpattern set a required byte (or set a first byte that isn't
5042       really the first byte - see above), set it. */
5043
5044       if (subreqbyte >= 0) reqbyte = subreqbyte;
5045       }
5046
5047     /* For a forward assertion, we take the reqbyte, if set. This can be
5048     helpful if the pattern that follows the assertion doesn't set a different
5049     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5050     for an assertion, however because it leads to incorrect effect for patterns
5051     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5052     of a firstbyte. This is overcome by a scan at the end if there's no
5053     firstbyte, looking for an asserted first char. */
5054
5055     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5056     break;     /* End of processing '(' */
5057
5058
5059     /* ===================================================================*/
5060     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5061     are arranged to be the negation of the corresponding OP_values. For the
5062     back references, the values are ESC_REF plus the reference number. Only
5063     back references and those types that consume a character may be repeated.
5064     We can test for values between ESC_b and ESC_Z for the latter; this may
5065     have to change if any new ones are ever created. */
5066
5067     case '\\':
5068     tempptr = ptr;
5069     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5070     if (*errorcodeptr != 0) goto FAILED;
5071
5072     if (c < 0)
5073       {
5074       if (-c == ESC_Q)            /* Handle start of quoted string */
5075         {
5076         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5077           else inescq = TRUE;
5078         continue;
5079         }
5080
5081       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5082
5083       /* For metasequences that actually match a character, we disable the
5084       setting of a first character if it hasn't already been set. */
5085
5086       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5087         firstbyte = REQ_NONE;
5088
5089       /* Set values to reset to if this is followed by a zero repeat. */
5090
5091       zerofirstbyte = firstbyte;
5092       zeroreqbyte = reqbyte;
5093
5094       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5095       We also support \k{name} (.NET syntax) */
5096
5097       if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5098         {
5099         is_recurse = FALSE;
5100         terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5101         goto NAMED_REF_OR_RECURSE;
5102         }
5103
5104       /* Back references are handled specially; must disable firstbyte if
5105       not set to cope with cases like (?=(\w+))\1: which would otherwise set
5106       ':' later. */
5107
5108       if (-c >= ESC_REF)
5109         {
5110         recno = -c - ESC_REF;
5111
5112         HANDLE_REFERENCE:    /* Come here from named backref handling */
5113         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5114         previous = code;
5115         *code++ = OP_REF;
5116         PUT2INC(code, 0, recno);
5117         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5118         if (recno > cd->top_backref) cd->top_backref = recno;
5119         }
5120
5121       /* So are Unicode property matches, if supported. */
5122
5123 #ifdef SUPPORT_UCP
5124       else if (-c == ESC_P || -c == ESC_p)
5125         {
5126         BOOL negated;
5127         int pdata;
5128         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5129         if (ptype < 0) goto FAILED;
5130         previous = code;
5131         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5132         *code++ = ptype;
5133         *code++ = pdata;
5134         }
5135 #else
5136
5137       /* If Unicode properties are not supported, \X, \P, and \p are not
5138       allowed. */
5139
5140       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5141         {
5142         *errorcodeptr = ERR45;
5143         goto FAILED;
5144         }
5145 #endif
5146
5147       /* For the rest (including \X when Unicode properties are supported), we
5148       can obtain the OP value by negating the escape value. */
5149
5150       else
5151         {
5152         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5153         *code++ = -c;
5154         }
5155       continue;
5156       }
5157
5158     /* We have a data character whose value is in c. In UTF-8 mode it may have
5159     a value > 127. We set its representation in the length/buffer, and then
5160     handle it as a data character. */
5161
5162 #ifdef SUPPORT_UTF8
5163     if (utf8 && c > 127)
5164       mclength = _pcre_ord2utf8(c, mcbuffer);
5165     else
5166 #endif
5167
5168      {
5169      mcbuffer[0] = c;
5170      mclength = 1;
5171      }
5172     goto ONE_CHAR;
5173
5174
5175     /* ===================================================================*/
5176     /* Handle a literal character. It is guaranteed not to be whitespace or #
5177     when the extended flag is set. If we are in UTF-8 mode, it may be a
5178     multi-byte literal character. */
5179
5180     default:
5181     NORMAL_CHAR:
5182     mclength = 1;
5183     mcbuffer[0] = c;
5184
5185 #ifdef SUPPORT_UTF8
5186     if (utf8 && c >= 0xc0)
5187       {
5188       while ((ptr[1] & 0xc0) == 0x80)
5189         mcbuffer[mclength++] = *(++ptr);
5190       }
5191 #endif
5192
5193     /* At this point we have the character's bytes in mcbuffer, and the length
5194     in mclength. When not in UTF-8 mode, the length is always 1. */
5195
5196     ONE_CHAR:
5197     previous = code;
5198     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5199     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5200
5201     /* Remember if \r or \n were seen */
5202
5203     if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5204       cd->external_flags |= PCRE_HASCRORLF;
5205
5206     /* Set the first and required bytes appropriately. If no previous first
5207     byte, set it from this character, but revert to none on a zero repeat.
5208     Otherwise, leave the firstbyte value alone, and don't change it on a zero
5209     repeat. */
5210
5211     if (firstbyte == REQ_UNSET)
5212       {
5213       zerofirstbyte = REQ_NONE;
5214       zeroreqbyte = reqbyte;
5215
5216       /* If the character is more than one byte long, we can set firstbyte
5217       only if it is not to be matched caselessly. */
5218
5219       if (mclength == 1 || req_caseopt == 0)
5220         {
5221         firstbyte = mcbuffer[0] | req_caseopt;
5222         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5223         }
5224       else firstbyte = reqbyte = REQ_NONE;
5225       }
5226
5227     /* firstbyte was previously set; we can set reqbyte only if the length is
5228     1 or the matching is caseful. */
5229
5230     else
5231       {
5232       zerofirstbyte = firstbyte;
5233       zeroreqbyte = reqbyte;
5234       if (mclength == 1 || req_caseopt == 0)
5235         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5236       }
5237
5238     break;            /* End of literal character handling */
5239     }
5240   }                   /* end of big loop */
5241
5242
5243 /* Control never reaches here by falling through, only by a goto for all the
5244 error states. Pass back the position in the pattern so that it can be displayed
5245 to the user for diagnosing the error. */
5246
5247 FAILED:
5248 *ptrptr = ptr;
5249 return FALSE;
5250 }
5251
5252
5253
5254
5255 /*************************************************
5256 *     Compile sequence of alternatives           *
5257 *************************************************/
5258
5259 /* On entry, ptr is pointing past the bracket character, but on return it
5260 points to the closing bracket, or vertical bar, or end of string. The code
5261 variable is pointing at the byte into which the BRA operator has been stored.
5262 If the ims options are changed at the start (for a (?ims: group) or during any
5263 branch, we need to insert an OP_OPT item at the start of every following branch
5264 to ensure they get set correctly at run time, and also pass the new options
5265 into every subsequent branch compile.
5266
5267 This function is used during the pre-compile phase when we are trying to find
5268 out the amount of memory needed, as well as during the real compile phase. The
5269 value of lengthptr distinguishes the two phases.
5270
5271 Arguments:
5272   options        option bits, including any changes for this subpattern
5273   oldims         previous settings of ims option bits
5274   codeptr        -> the address of the current code pointer (updated on success)
5275   ptrptr         -> the address of the current pattern pointer
5276   errorcodeptr   -> pointer to error code variable
5277   lookbehind     TRUE if this is a lookbehind assertion
5278   reset_bracount TRUE to reset the count for each branch
5279   skipbytes      skip this many bytes at start (for brackets and OP_COND)
5280   firstbyteptr   place to put the first required character, or a negative number
5281   reqbyteptr     place to put the last required character, or a negative number
5282   bcptr          pointer to the chain of currently open branches
5283   cd             points to the data block with tables pointers etc.
5284   lengthptr      NULL during the real compile phase
5285                  points to length accumulator during pre-compile phase
5286
5287 Returns:         TRUE on success; FALSE on failure, with *ptrptr updated
5288 */
5289
5290 static BOOL
5291 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5292   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5293   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5294   int *lengthptr)
5295 {
5296 const uschar *ptr = *ptrptr;
5297 uschar *code = *codeptr;
5298 uschar *last_branch = code;       /* Start of the most recent branch */
5299 uschar *start_bracket = code;     /* The opening BRA, for the KET offset */
5300 uschar *reverse_count = NULL;     /* Slot for the lookbehind length */
5301 int firstbyte, reqbyte;
5302 int branchfirstbyte, branchreqbyte;
5303 int length;
5304 int orig_bracount;
5305 int max_bracount;
5306 branch_chain bc;
5307
5308 bc.outer = bcptr;
5309 bc.current = code;
5310
5311 firstbyte = reqbyte = REQ_UNSET;
5312
5313 /* Accumulate the length for use in the pre-compile phase. Start with the
5314 length of the BRA and KET and any extra bytes that are required at the
5315 beginning. We accumulate in a local variable to save frequent testing of
5316 lengthptr for NULL. We cannot do this by looking at the value of code at the
5317 start and end of each alternative, because compiled items are discarded during
5318 the pre-compile phase so that the work space is not exceeded. */
5319
5320 length = 2 + 2*LINK_SIZE + skipbytes;
5321
5322 /* WARNING: If the above line is changed for any reason, you must also change
5323 the code that abstracts option settings at the start of the pattern and makes
5324 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5325 pre-compile phase to find out whether anything has yet been compiled or not. */
5326
5327 /* Offset is set zero to mark that this bracket is still open */
5328
5329 PUT(code, 1, 0);
5330 code += 1 + LINK_SIZE + skipbytes;
5331
5332 /* Loop for each alternative branch */
5333
5334 orig_bracount = max_bracount = cd->bracount;
5335 for (;;)
5336   {
5337   /* For a (?| group, reset the capturing bracket count so that each branch
5338   uses the same numbers. */
5339
5340   if (reset_bracount) cd->bracount = orig_bracount;
5341
5342   /* Handle a change of ims options at the start of the branch */
5343
5344   if ((options & PCRE_IMS) != oldims)
5345     {
5346     *code++ = OP_OPT;
5347     *code++ = options & PCRE_IMS;
5348     length += 2;
5349     }
5350
5351   /* Set up dummy OP_REVERSE if lookbehind assertion; the real length is
5352   filled in after the branch has been compiled. */
5353   if (lookbehind)
5354     {
5355     *code++ = OP_REVERSE;
5356     reverse_count = code;
5357     PUTINC(code, 0, 0);
5358     length += 1 + LINK_SIZE;
5359     }
5360
5361   /* Now compile the branch; in the pre-compile phase its length gets added
5362   into the length. */
5363
5364   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5365         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5366     {
5367     *ptrptr = ptr;
5368     return FALSE;
5369     }
5370
5371   /* Keep the highest bracket count in case (?| was used and some branch
5372   has fewer than the rest. */
5373
5374   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5375
5376   /* In the real compile phase, there is some post-processing to be done. */
5377
5378   if (lengthptr == NULL)
5379     {
5380     /* If this is the first branch, the firstbyte and reqbyte values for the
5381     branch become the values for the regex. */
5382
5383     if (*last_branch != OP_ALT)
5384       {
5385       firstbyte = branchfirstbyte;
5386       reqbyte = branchreqbyte;
5387       }
5388
5389     /* If this is not the first branch, the first char and reqbyte have to
5390     match the values from all the previous branches, except that if the
5391     previous value for reqbyte didn't have REQ_VARY set, it can still match,
5392     and we set REQ_VARY for the regex. */
5393
5394     else
5395       {
5396       /* If we previously had a firstbyte, but it doesn't match the new branch,
5397       we have to abandon the firstbyte for the regex, but if there was
5398       previously no reqbyte, it takes on the value of the old firstbyte. */
5399
5400       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5401         {
5402         if (reqbyte < 0) reqbyte = firstbyte;
5403         firstbyte = REQ_NONE;
5404         }
5405
5406       /* If we (now or from before) have no firstbyte, a firstbyte from the
5407       branch becomes a reqbyte if there isn't a branch reqbyte. */
5408
5409       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5410           branchreqbyte = branchfirstbyte;
5411
5412       /* Now ensure that the reqbytes match */
5413
5414       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5415         reqbyte = REQ_NONE;
5416       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5417       }
5418
5419     /* If lookbehind, check that this branch matches a fixed-length string, and
5420     put the length into the OP_REVERSE item. Temporarily mark the end of the
5421     branch with OP_END. */
5422
5423     if (lookbehind)
5424       {
5425       int fixed_length;
5426       *code = OP_END;
5427       fixed_length = find_fixedlength(last_branch, options);
5428       DPRINTF(("fixed length = %d\n", fixed_length));
5429       if (fixed_length < 0)
5430         {
5431         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5432         *ptrptr = ptr;
5433         return FALSE;
5434         }
5435       PUT(reverse_count, 0, fixed_length);
5436       }
5437     }
5438
5439   /* Reached end of expression, either ')' or end of pattern. In the real
5440   compile phase, go back through the alternative branches and reverse the chain
5441   of offsets, with the field in the BRA item now becoming an offset to the
5442   first alternative. If there are no alternatives, it points to the end of the
5443   group. The length in the terminating ket is always the length of the whole
5444   bracketed item. If any of the ims options were changed inside the group,
5445   compile a resetting op-code following, except at the very end of the pattern.
5446   Return leaving the pointer at the terminating char. */
5447
5448   if (*ptr != '|')
5449     {
5450     if (lengthptr == NULL)
5451       {
5452       int branch_length = code - last_branch;
5453       do
5454         {
5455         int prev_length = GET(last_branch, 1);
5456         PUT(last_branch, 1, branch_length);
5457         branch_length = prev_length;
5458         last_branch -= branch_length;   /* Back to the previous branch */
5459         }
5460       while (branch_length > 0);
5461       }
5462
5463     /* Fill in the ket */
5464
5465     *code = OP_KET;
5466     PUT(code, 1, code - start_bracket);
5467     code += 1 + LINK_SIZE;
5468
5469     /* Resetting option if needed */
5470
5471     if ((options & PCRE_IMS) != oldims && *ptr == ')')
5472       {
5473       *code++ = OP_OPT;
5474       *code++ = oldims;
5475       length += 2;
5476       }
5477
5478     /* Retain the highest bracket number, in case resetting was used. */
5479
5480     cd->bracount = max_bracount;
5481
5482     /* Set values to pass back */
5483
5484     *codeptr = code;
5485     *ptrptr = ptr;
5486     *firstbyteptr = firstbyte;
5487     *reqbyteptr = reqbyte;
5488     if (lengthptr != NULL)                  /* Pre-compile: add to the total */
5489       {
5490       if (OFLOW_MAX - *lengthptr < length)  /* Sum would exceed OFLOW_MAX */
5491         {
5492         *errorcodeptr = ERR20;
5493         return FALSE;
5494         }
5495       *lengthptr += length;
5496       }
5497     return TRUE;
5498     }
5499
5500   /* Another branch follows. In the pre-compile phase, we can move the code
5501   pointer back to where it was for the start of the first branch. (That is,
5502   pretend that each branch is the only one.)
5503
5504   In the real compile phase, insert an ALT node. Its length field points back
5505   to the previous branch while the bracket remains open. At the end the chain
5506   is reversed. It's done like this so that the start of the bracket has a
5507   zero offset until it is closed, making it possible to detect recursion. */
5508
5509   if (lengthptr != NULL)
5510     {
5511     code = *codeptr + 1 + LINK_SIZE + skipbytes;
5512     length += 1 + LINK_SIZE;
5513     }
5514   else
5515     {
5516     *code = OP_ALT;
5517     PUT(code, 1, code - last_branch);
5518     bc.current = last_branch = code;
5519     code += 1 + LINK_SIZE;
5520     }
5521
5522   ptr++;      /* Step over the vertical bar to the start of the next branch */
5523   }
5524 /* Control never reaches here */
5525 }
5526
5527
5528
5529
5530 /*************************************************
5531 *          Check for anchored expression         *
5532 *************************************************/
5533
5534 /* Try to find out if this is an anchored regular expression. Consider each
5535 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5536 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5537 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5538 counts, since OP_CIRC can match in the middle.
5539
5540 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5541 This is the code for \G, which means "match at start of match position, taking
5542 into account the match offset".
5543
5544 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5545 because that will try the rest of the pattern at all possible matching points,
5546 so there is no point trying again.... er ....
5547
5548 .... except when the .* appears inside capturing parentheses, and there is a
5549 subsequent back reference to those parentheses. We haven't enough information
5550 to catch that case precisely.
5551
5552 At first, the best we could do was to detect when .* was in capturing brackets
5553 and the highest back reference was greater than or equal to that level.
5554 However, by keeping a bitmap of the first 31 back references, we can catch some
5555 of the more common cases more precisely.
5556
5557 Arguments:
5558   code           points to start of expression (the bracket)
5559   options        points to the options setting
5560   bracket_map    a bitmap of which brackets we are inside while testing; this
5561                   handles up to substring 31; after that bit 0 is set and we
5562                   just have to take the less precise approach
5563   backref_map    the back reference bitmap
5564
5565 Returns:     TRUE if every alternative is anchored, otherwise FALSE
5566 */
5567
5568 static BOOL
5569 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5570   unsigned int backref_map)
5571 {
5572 do {
5573    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5574      options, PCRE_MULTILINE, FALSE);
5575    register int op = *scode;
5576
5577    /* Non-capturing brackets */
5578
5579    if (op == OP_BRA)
5580      {
5581      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5582      }
5583
5584    /* Capturing brackets. The map bit is formed with an unsigned shift, since n
5585    can be 31 and 1 << 31 overflows a 32-bit signed int. */
5586    else if (op == OP_CBRA)
5587      {
5588      int n = GET2(scode, 1+LINK_SIZE);
5589      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
5590      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5591      }
5592
5593    /* Other brackets */
5594
5595    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5596      {
5597      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5598      }
5599
5600    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5601    are or may be referenced. */
5602
5603    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5604              op == OP_TYPEPOSSTAR) &&
5605             (*options & PCRE_DOTALL) != 0)
5606      {
5607      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5608      }
5609
5610    /* Check for explicit anchoring */
5611
5612    else if (op != OP_SOD && op != OP_SOM &&
5613            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5614      return FALSE;
5615    code += GET(code, 1);      /* Move on to the next alternative */
5616    }
5617 while (*code == OP_ALT);   /* Loop for each alternative */
5618 return TRUE;
5619 }
5620
5621
5622
5623 /*************************************************
5624 *         Check for starting with ^ or .*        *
5625 *************************************************/
5626
5627 /* This is called to find out if every branch starts with ^ or .* so that
5628 "first char" processing can be done to speed things up in multiline
5629 matching and for non-DOTALL patterns that start with .* (which must start at
5630 the beginning or after \n). As in the case of is_anchored() (see above), we
5631 have to take account of back references to capturing brackets that contain .*
5632 because in that case we can't make the assumption.
5633
5634 Arguments:
5635   code           points to start of expression (the bracket)
5636   bracket_map    a bitmap of which brackets we are inside while testing; this
5637                   handles up to substring 31; after that bit 0 is set and we
5638                   just have to take the less precise approach
5639   backref_map    the back reference bitmap
5640
5641 Returns:         TRUE if every alternative starts with ^ or .*, else FALSE
5642 */
5643
5644 static BOOL
5645 is_startline(const uschar *code, unsigned int bracket_map,
5646   unsigned int backref_map)
5647 {
5648 do {
5649    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5650      NULL, 0, FALSE);
5651    register int op = *scode;
5652
5653    /* Non-capturing brackets */
5654
5655    if (op == OP_BRA)
5656      {
5657      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5658      }
5659
5660    /* Capturing brackets. The map bit is formed with an unsigned shift, since n
5661    can be 31 and 1 << 31 overflows a 32-bit signed int. */
5662    else if (op == OP_CBRA)
5663      {
5664      int n = GET2(scode, 1+LINK_SIZE);
5665      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
5666      if (!is_startline(scode, new_map, backref_map)) return FALSE;
5667      }
5668
5669    /* Other brackets */
5670
5671    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5672      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5673
5674    /* .* means "start at start or after \n" if it isn't in brackets that
5675    may be referenced. */
5676
5677    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5678      {
5679      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5680      }
5681
5682    /* Check for explicit circumflex */
5683
5684    else if (op != OP_CIRC) return FALSE;
5685
5686    /* Move on to the next alternative */
5687
5688    code += GET(code, 1);
5689    }
5690 while (*code == OP_ALT);  /* Loop for each alternative */
5691 return TRUE;
5692 }
5693
5694
5695
5696 /*************************************************
5697 *       Check for asserted fixed first char      *
5698 *************************************************/
5699
5700 /* First char settings that come from forward assertions are thrown away while
5701 compiling, because they can clash with literals that follow the assertion.
5702 When an unanchored pattern ends up with no first char at all, this scan is
5703 used to look for one that is asserted at the very start. The result is a
5704 char only if every branch begins with the same asserted char, or with a
5705 bracket whose alternatives all do so (checked recursively); a caseless char
5706 has REQ_CASELESS or'ed in. Anything else gives -1.
5707
5708 Arguments:
5709   code       points to start of expression (the bracket)
5710   options    pointer to the options (used to check casing changes)
5711   inassert   TRUE when the scan is inside an assertion
5712
5713 Returns:     the asserted first char (possibly with REQ_CASELESS), or -1
5714 */
5715
5716 static int
5717 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5718 {
5719 register int found = -1;
5720 for (;;)
5721   {
5722   const uschar *item =
5723     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5724   register int opcode = *item;
5725
5726   /* Any kind of bracket: scan inside it, noting whether it is an assertion.
5727   Its answer has to agree with what the earlier branches gave. */
5728
5729   if (opcode == OP_BRA || opcode == OP_CBRA || opcode == OP_ASSERT ||
5730       opcode == OP_ONCE || opcode == OP_COND)
5731     {
5732     int inner = find_firstassertedchar(item, options, opcode == OP_ASSERT);
5733     if (inner < 0) return -1;
5734     if (found >= 0 && found != inner) return -1;
5735     found = inner;
5736     }
5737
5738   /* A character item. For OP_EXACT the two bytes after the opcode are stepped
5739   over first, so that item[1] is the character as for the other opcodes. */
5740
5741   else if (opcode == OP_EXACT || opcode == OP_CHAR || opcode == OP_CHARNC ||
5742            opcode == OP_PLUS || opcode == OP_MINPLUS || opcode == OP_POSPLUS)
5743     {
5744     if (opcode == OP_EXACT) item += 2;
5745     if (!inassert) return -1;
5746     if (found >= 0)
5747       {
5748       if (found != item[1]) return -1;
5749       }
5750     else
5751       {
5752       found = item[1];
5753       if ((*options & PCRE_CASELESS) != 0) found |= REQ_CASELESS;
5754       }
5755     }
5756
5757   /* No other item can supply an asserted first char */
5758
5759   else return -1;
5760   code += GET(code, 1);
5761   if (*code != OP_ALT) return found;   /* No more alternatives */
5762   }
5763 }
5764
5765
5766
5767 /*************************************************
5768 *        Compile a Regular Expression            *
5769 *************************************************/
5770
5771 /* This function takes a string and returns a pointer to a block of store
5772 holding a compiled version of the expression. The original API for this
5773 function had no error code return variable; it is retained for backwards
5774 compatibility. The new function is given a new name.
5775
5776 Arguments:
5777   pattern       the regular expression
5778   options       various option bits
5779   errorcodeptr  pointer to error code variable (pcre_compile2() only)
5780                   can be NULL if you don't want a code value
5781   errorptr      pointer to pointer to error text
5782   erroroffset   ptr offset in pattern where error was detected
5783   tables        pointer to character tables or NULL
5784
5785 Returns:        pointer to compiled data block, or NULL on error,
5786                 with errorptr and erroroffset set
5787 */
5788
5789 PCRE_EXP_DEFN pcre *
5790 pcre_compile(const char *pattern, int options, const char **errorptr,
5791   int *erroroffset, const unsigned char *tables)
5792 {   /* Original API: forwards to pcre_compile2() with no error code pointer */
5793 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5794 }
5795
5796
5797 PCRE_EXP_DEFN pcre *
5798 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5799   const char **errorptr, int *erroroffset, const unsigned char *tables)
5800 {
5801 real_pcre *re;
5802 int length = 1;  /* For final END opcode */
5803 int firstbyte, reqbyte, newline;
5804 int errorcode = 0;
5805 int skipatstart = 0;
5806 #ifdef SUPPORT_UTF8
5807 BOOL utf8;
5808 #endif
5809 size_t size;
5810 uschar *code;
5811 const uschar *codestart;
5812 const uschar *ptr;
5813 compile_data compile_block;
5814 compile_data *cd = &compile_block;
5815
5816 /* This space is used for "compiling" into during the first phase, when we are
5817 computing the amount of memory that is needed. Compiled items are thrown away
5818 as soon as possible, so that a fairly large buffer should be sufficient for
5819 this purpose. The same space is used in the second phase for remembering where
5820 to fill in forward references to subpatterns. */
5821
5822 uschar cworkspace[COMPILE_WORK_SIZE];
5823
5824 /* Set this early so that early errors get offset 0. */
5825
5826 ptr = (const uschar *)pattern;
5827
5828 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5829 can do is just return NULL, but we can set a code value if there is a code
5830 pointer. */
5831
5832 if (errorptr == NULL)
5833   {
5834   if (errorcodeptr != NULL) *errorcodeptr = 99;
5835   return NULL;
5836   }
5837
5838 *errorptr = NULL;
5839 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5840
5841 /* However, we can give a message for this error */
5842
5843 if (erroroffset == NULL)
5844   {
5845   errorcode = ERR16;
5846   goto PCRE_EARLY_ERROR_RETURN2;
5847   }
5848
5849 *erroroffset = 0;
5850
5851 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5852
5853 #ifdef SUPPORT_UTF8
5854 utf8 = (options & PCRE_UTF8) != 0;
5855 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5856      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5857   {
5858   errorcode = ERR44;
5859   goto PCRE_EARLY_ERROR_RETURN2;
5860   }
5861 #else
5862 if ((options & PCRE_UTF8) != 0)
5863   {
5864   errorcode = ERR32;
5865   goto PCRE_EARLY_ERROR_RETURN;
5866   }
5867 #endif
5868
5869 if ((options & ~PUBLIC_OPTIONS) != 0)
5870   {
5871   errorcode = ERR17;
5872   goto PCRE_EARLY_ERROR_RETURN;
5873   }
5874
5875 /* Set up pointers to the individual character tables */
5876
5877 if (tables == NULL) tables = _pcre_default_tables;
5878 cd->lcc = tables + lcc_offset;
5879 cd->fcc = tables + fcc_offset;
5880 cd->cbits = tables + cbits_offset;
5881 cd->ctypes = tables + ctypes_offset;
5882
5883 /* Check for global one-time settings at the start of the pattern, and remember
5884 the offset for later. */
5885
5886 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5887   {
5888   int newnl = 0;
5889   int newbsr = 0;
5890
5891   if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5892     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5893   else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)
5894     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5895   else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)
5896     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5897   else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5898     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5899   else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)
5900     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5901
5902   else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5903     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5904   else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5905     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5906
5907   if (newnl != 0)
5908     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5909   else if (newbsr != 0)
5910     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5911   else break;
5912   }
5913
5914 /* Check validity of \R options. */
5915
5916 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5917   {
5918   case 0:
5919   case PCRE_BSR_ANYCRLF:
5920   case PCRE_BSR_UNICODE:
5921   break;
5922   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5923   }
5924
5925 /* Handle different types of newline. The three bits give seven cases. The
5926 current code allows for fixed one- or two-byte sequences, plus "any" and
5927 "anycrlf". */
5928
5929 switch (options & PCRE_NEWLINE_BITS)
5930   {
5931   case 0: newline = NEWLINE; break;   /* Build-time default */
5932   case PCRE_NEWLINE_CR: newline = '\r'; break;
5933   case PCRE_NEWLINE_LF: newline = '\n'; break;
5934   case PCRE_NEWLINE_CR+
5935        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5936   case PCRE_NEWLINE_ANY: newline = -1; break;
5937   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5938   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5939   }
5940
5941 if (newline == -2)
5942   {
5943   cd->nltype = NLTYPE_ANYCRLF;
5944   }
5945 else if (newline < 0)
5946   {
5947   cd->nltype = NLTYPE_ANY;
5948   }
5949 else
5950   {
5951   cd->nltype = NLTYPE_FIXED;
5952   if (newline > 255)
5953     {
5954     cd->nllen = 2;
5955     cd->nl[0] = (newline >> 8) & 255;
5956     cd->nl[1] = newline & 255;
5957     }
5958   else
5959     {
5960     cd->nllen = 1;
5961     cd->nl[0] = newline;
5962     }
5963   }
5964
5965 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5966 references to help in deciding whether (.*) can be treated as anchored or not.
5967 */
5968
5969 cd->top_backref = 0;
5970 cd->backref_map = 0;
5971
5972 /* Reflect pattern for debugging output */
5973
5974 DPRINTF(("------------------------------------------------------------------\n"));
5975 DPRINTF(("%s\n", pattern));
5976
5977 /* Pretend to compile the pattern while actually just accumulating the length
5978 of memory required. This behaviour is triggered by passing a non-NULL final
5979 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5980 to compile parts of the pattern into; the compiled code is discarded when it is
5981 no longer needed, so hopefully this workspace will never overflow, though there
5982 is a test for its doing so. */
5983
5984 cd->bracount = cd->final_bracount = 0;
5985 cd->names_found = 0;
5986 cd->name_entry_size = 0;
5987 cd->name_table = NULL;
5988 cd->start_workspace = cworkspace;
5989 cd->start_code = cworkspace;
5990 cd->hwm = cworkspace;
5991 cd->start_pattern = (const uschar *)pattern;
5992 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5993 cd->req_varyopt = 0;
5994 cd->external_options = options;
5995 cd->external_flags = 0;
5996
5997 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5998 don't need to look at the result of the function here. The initial options have
5999 been put into the cd block so that they can be changed if an option setting is
6000 found within the regex right at the beginning. Bringing initial option settings
6001 outside can help speed up starting point checks. */
6002
6003 ptr += skipatstart;
6004 code = cworkspace;
6005 *code = OP_BRA;
6006 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6007   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6008   &length);
6009 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6010
6011 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6012   cd->hwm - cworkspace));
6013
6014 if (length > MAX_PATTERN_SIZE)
6015   {
6016   errorcode = ERR20;
6017   goto PCRE_EARLY_ERROR_RETURN;
6018   }
6019
6020 /* Compute the size of data block needed and get it, either from malloc or
6021 externally provided function. Integer overflow should no longer be possible
6022 because nowadays we limit the maximum value of cd->names_found and
6023 cd->name_entry_size. */
6024
6025 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6026 re = (real_pcre *)(pcre_malloc)(size);
6027
6028 if (re == NULL)
6029   {
6030   errorcode = ERR21;
6031   goto PCRE_EARLY_ERROR_RETURN;
6032   }
6033
6034 /* Put in the magic number, and save the sizes, initial options, internal
6035 flags, and character table pointer. NULL is used for the default character
6036 tables. The nullpad field is at the end; it's there to help in the case when a
6037 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6038 pointers. */
6039
6040 re->magic_number = MAGIC_NUMBER;
6041 re->size = size;
6042 re->options = cd->external_options;
6043 re->flags = cd->external_flags;
6044 re->dummy1 = 0;
6045 re->first_byte = 0;
6046 re->req_byte = 0;
6047 re->name_table_offset = sizeof(real_pcre);
6048 re->name_entry_size = cd->name_entry_size;
6049 re->name_count = cd->names_found;
6050 re->ref_count = 0;
6051 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6052 re->nullpad = NULL;
6053
6054 /* The starting points of the name/number translation table and of the code are
6055 passed around in the compile data block. The start/end pattern and initial
6056 options are already set from the pre-compile phase, as is the name_entry_size
6057 field. Reset the bracket count and the names_found field. Also reset the hwm
6058 field; this time it's used for remembering forward references to subpatterns.
6059 */
6060
6061 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
6062 cd->bracount = 0;
6063 cd->names_found = 0;
6064 cd->name_table = (uschar *)re + re->name_table_offset;
6065 codestart = cd->name_table + re->name_entry_size * re->name_count;
6066 cd->start_code = codestart;
6067 cd->hwm = cworkspace;
6068 cd->req_varyopt = 0;
6069 cd->had_accept = FALSE;
6070
6071 /* Set up a starting, non-extracting bracket, then compile the expression. On
6072 error, errorcode will be set non-zero, so we don't need to look at the result
6073 of the function here. */
6074
6075 ptr = (const uschar *)pattern + skipatstart;
6076 code = (uschar *)codestart;
6077 *code = OP_BRA;
6078 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6079   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6080 re->top_bracket = cd->bracount;
6081 re->top_backref = cd->top_backref;
6082 re->flags = cd->external_flags;
6083
6084 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6085
6086 /* If not reached end of pattern on success, there's an excess bracket. */
6087
6088 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6089
6090 /* Fill in the terminating state and check for disastrous overflow, but
6091 if debugging, leave the test till after things are printed out. */
6092
6093 *code++ = OP_END;
6094
6095 #ifndef DEBUG
6096 if (code - codestart > length) errorcode = ERR23;
6097 #endif
6098
6099 /* Fill in any forward references that are required. */
6100
6101 while (errorcode == 0 && cd->hwm > cworkspace)
6102   {
6103   int offset, recno;
6104   const uschar *groupptr;
6105   cd->hwm -= LINK_SIZE;
6106   offset = GET(cd->hwm, 0);
6107   recno = GET(codestart, offset);
6108   groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6109   if (groupptr == NULL) errorcode = ERR53;
6110     else PUT(((uschar *)codestart), offset, groupptr - codestart);
6111   }
6112
6113 /* Give an error if there's back reference to a non-existent capturing
6114 subpattern. */
6115
6116 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6117
6118 /* Failed to compile, or error while post-processing */
6119
6120 if (errorcode != 0)
6121   {
6122   (pcre_free)(re);
6123   PCRE_EARLY_ERROR_RETURN:
6124   *erroroffset = ptr - (const uschar *)pattern;
6125   PCRE_EARLY_ERROR_RETURN2:
6126   *errorptr = find_error_text(errorcode);
6127   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6128   return NULL;
6129   }
6130
6131 /* If the anchored option was not passed, set the flag if we can determine that
6132 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6133 as starting with .* when DOTALL is set).
6134
6135 Otherwise, if we know what the first byte has to be, save it, because that
6136 speeds up unanchored matches no end. If not, see if we can set the
6137 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6138 start with ^, and also when all branches start with .* for non-DOTALL matches.
6139 */
6140
6141 if ((re->options & PCRE_ANCHORED) == 0)
6142   {
6143   int temp_options = re->options;   /* May get changed during these scans */
6144   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6145     re->options |= PCRE_ANCHORED;
6146   else
6147     {
6148     if (firstbyte < 0)
6149       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6150     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
6151       {
6152       int ch = firstbyte & 255;
6153       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6154          cd->fcc[ch] == ch)? ch : firstbyte;
6155       re->flags |= PCRE_FIRSTSET;
6156       }
6157     else if (is_startline(codestart, 0, cd->backref_map))
6158       re->flags |= PCRE_STARTLINE;
6159     }
6160   }
6161
6162 /* For an anchored pattern, we use the "required byte" only if it follows a
6163 variable length item in the regex. Remove the caseless flag for non-caseable
6164 bytes. */
6165
6166 if (reqbyte >= 0 &&
6167      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6168   {
6169   int ch = reqbyte & 255;
6170   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6171     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6172   re->flags |= PCRE_REQCHSET;
6173   }
6174
6175 /* Print out the compiled data if debugging is enabled. This is never the
6176 case when building a production library. */
6177
6178 #ifdef DEBUG
6179
6180 printf("Length = %d top_bracket = %d top_backref = %d\n",
6181   length, re->top_bracket, re->top_backref);
6182
6183 printf("Options=%08x\n", re->options);
6184
6185 if ((re->flags & PCRE_FIRSTSET) != 0)
6186   {
6187   int ch = re->first_byte & 255;
6188   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6189     "" : " (caseless)";
6190   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6191     else printf("First char = \\x%02x%s\n", ch, caseless);
6192   }
6193
6194 if ((re->flags & PCRE_REQCHSET) != 0)
6195   {
6196   int ch = re->req_byte & 255;
6197   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6198     "" : " (caseless)";
6199   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6200     else printf("Req char = \\x%02x%s\n", ch, caseless);
6201   }
6202
6203 pcre_printint(re, stdout, TRUE);
6204
6205 /* This check is done here in the debugging case so that the code that
6206 was compiled can be seen. */
6207
6208 if (code - codestart > length)
6209   {
6210   (pcre_free)(re);
6211   *errorptr = find_error_text(ERR23);
6212   *erroroffset = ptr - (uschar *)pattern;
6213   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6214   return NULL;
6215   }
6216 #endif   /* DEBUG */
6217
6218 return (pcre *)re;
6219 }
6220
6221 /* End of pcre_compile.c */