js/src/yarr/pcre/pcre_compile.cpp

   1 /* This is JavaScriptCore's variant of the PCRE library. While this library
   2 started out as a copy of PCRE, many of the features of PCRE have been
   3 removed. This library now supports only the regular expression features
   4 required by the JavaScript language specification, and has only the functions
   5 needed by JavaScriptCore and the rest of WebKit.
   6
   7                  Originally written by Philip Hazel
   8            Copyright (c) 1997-2006 University of Cambridge
   9     Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
  10     Copyright (C) 2007 Eric Seidel <eric@webkit.org>
  11
  12 -----------------------------------------------------------------------------
  13 Redistribution and use in source and binary forms, with or without
  14 modification, are permitted provided that the following conditions are met:
  15
  16     * Redistributions of source code must retain the above copyright notice,
  17       this list of conditions and the following disclaimer.
  18
  19     * Redistributions in binary form must reproduce the above copyright
  20       notice, this list of conditions and the following disclaimer in the
  21       documentation and/or other materials provided with the distribution.
  22
  23     * Neither the name of the University of Cambridge nor the names of its
  24       contributors may be used to endorse or promote products derived from
  25       this software without specific prior written permission.
  26
  27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37 POSSIBILITY OF SUCH DAMAGE.
  38 -----------------------------------------------------------------------------
  39 */
  40
  41 /* This module contains the external function jsRegExpExecute(), along with
  42 supporting internal functions that are not used by other modules. */
  43
  44 #include "pcre_internal.h"
  45
  46 #include <string.h>
  47 #include "yarr/wtf/ASCIICType.h"
  48 #include "jsvector.h"
  49
  50 using namespace WTF;
  51
  52 /* Negative values for the firstchar and reqchar variables */
  53
  54 #define REQ_UNSET (-2)
  55 #define REQ_NONE  (-1)
  56
  57 /*************************************************
  58 *      Code parameters and static tables         *
  59 *************************************************/
  60
  61 /* Maximum number of items on the nested bracket stacks at compile time. This
  62 applies to the nesting of all kinds of parentheses. It does not limit
  63 un-nested, non-capturing parentheses. This number can be made bigger if
  64 necessary - it is used to dimension one int and one unsigned char vector at
  65 compile time. */
  66
  67 #define BRASTACK_SIZE 200
  68
  69 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  70 are simple data values; negative values are for special things like \d and so
  71 on. Zero means further processing is needed (for things like \x), or the escape
  72 is invalid. */
  73
  74 static const short escapes[] = {
  75      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
  76      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
  77    '@',      0, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
  78      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
  79      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
  80      0,      0,      0,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
  81    '`',      7, -ESC_b,      0, -ESC_d,      0,   '\f',      0,   /* ` - g */
  82      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
  83      0,      0,    '\r', -ESC_s,   '\t',      0,  '\v', -ESC_w,   /* p - w */
  84      0,      0,      0                                            /* x - z */
  85 };
  86 static const unsigned OPCODE_LEN = 1;
  87 static const unsigned BRAZERO_LEN = OPCODE_LEN;
  88 static const unsigned BRA_NEST_SIZE = 2;
  89 static const unsigned BRA_LEN = OPCODE_LEN + LINK_SIZE + BRA_NEST_SIZE;
  90 static const unsigned KET_LEN = OPCODE_LEN + LINK_SIZE;
  91
  92 /* Error code numbers. They are given names so that they can more easily be
  93 tracked. */
  94
  95 enum ErrorCode {
  96     ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
  97     ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
  98 };
  99
 100 /* These are the error texts that correspond to the above error codes:
 101       // 1
 102       "\\ at end of pattern\0"
 103       "\\c at end of pattern\0"
 104       "character value in \\x{...} sequence is too large\0"
 105       "numbers out of order in {} quantifier\0"
 106       // 5
 107       "number too big in {} quantifier\0"
 108       "missing terminating ] for character class\0"
 109       "internal error: code overflow\0"
 110       "range out of order in character class\0"
 111       "nothing to repeat\0"
 112       // 10
 113       "unmatched parentheses\0"
 114       "internal error: unexpected repeat\0"
 115       "unrecognized character after (?\0"
 116       "failed to get memory\0"
 117       "missing )\0"
 118       // 15
 119       "reference to non-existent subpattern\0"
 120       "regular expression too large\0"
 121       "parentheses nested too deeply" */
 122
 123 /* Structure for passing "static" information around between the functions
 124 doing the compiling. */
 125
 126 struct CompileData {
 127     CompileData() {
 128         topBackref = 0;
 129         backrefMap = 0;
 130         reqVaryOpt = 0;
 131         needOuterBracket = false;
 132         numCapturingBrackets = 0;
 133     }
 134     int topBackref;            /* Maximum back reference */
 135     unsigned backrefMap;       /* Bitmap of low back refs */
 136     int reqVaryOpt;            /* "After variable item" flag for reqByte */
 137     bool needOuterBracket;
 138     int numCapturingBrackets;
 139 };
 140
 141 /* Definitions to allow mutual recursion */
 142
 143 static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
 144 static bool bracketIsAnchored(const unsigned char* code);
 145 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
 146 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
 147
 148 /*************************************************
 149 *            Handle escapes                      *
 150 *************************************************/
 151
 152 /* This function is called when a \ has been encountered. It either returns a
 153 positive value for a simple escape such as \n, or a negative value which
 154 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
 155 a positive value greater than 255 may be returned. On entry, ptr is pointing at
 156 the \. On exit, it is on the final character of the escape sequence.
 157
 158 Arguments:
 159   ptrPtr         points to the pattern position pointer
 160   errorCodePtr   points to the errorcode variable
 161   bracount       number of previous extracting brackets
 162   options        the options bits
 163   isClass        true if inside a character class
 164
 165 Returns:         zero or positive => a data character
 166                  negative => a special escape sequence
 167                  on error, error is set
 168 */
 169
 170 static int checkEscape(const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int bracount, bool isClass)
 171 {
 172     const UChar* ptr = *ptrPtr + 1;
 173
 174     /* If backslash is at the end of the pattern, it's an error. */
 175     if (ptr == patternEnd) {
 176         *errorCodePtr = ERR1;
 177         *ptrPtr = ptr;
 178         return 0;
 179     }
 180
 181     int c = *ptr;
 182
 183     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
 184      a table. A non-zero result is something that can be returned immediately.
 185      Otherwise further processing may be required. */
 186
 187     if (c < '0' || c > 'z') { /* Not alphameric */
 188     } else if (int escapeValue = escapes[c - '0']) {
 189         c = escapeValue;
 190         if (isClass) {
 191             if (-c == ESC_b)
 192                 c = '\b'; /* \b is backslash in a class */
 193             else if (-c == ESC_B)
 194                 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */
 195         }
 196     /* Escapes that need further processing, or are illegal. */
 197
 198     } else {
 199         switch (c) {
 200             case '1':
 201             case '2':
 202             case '3':
 203             case '4':
 204             case '5':
 205             case '6':
 206             case '7':
 207             case '8':
 208             case '9':
 209                 /* Escape sequences starting with a non-zero digit are backreferences,
 210                  unless there are insufficient brackets, in which case they are octal
 211                  escape sequences. Those sequences end on the first non-octal character
 212                  or when we overflow 0-255, whichever comes first. */
 213
 214                 if (!isClass) {
 215                     const UChar* oldptr = ptr;
 216                     c -= '0';
 217                     while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount)
 218                         c = c * 10 + *(++ptr) - '0';
 219                     if (c <= bracount) {
 220                         c = -(ESC_REF + c);
 221                         break;
 222                     }
 223                     ptr = oldptr;      /* Put the pointer back and fall through */
 224                 }
 225
 226                 /* Handle an octal number following \. If the first digit is 8 or 9,
 227                  this is not octal. */
 228
 229                 if ((c = *ptr) >= '8') {
 230                     c = '\\';
 231                     ptr -= 1;
 232                     break;
 233                 }
 234
 235             /* \0 always starts an octal number, but we may drop through to here with a
 236              larger first octal digit. */
 237
 238             case '0': {
 239                 c -= '0';
 240                 int i;
 241                 for (i = 1; i <= 2; ++i) {
 242                     if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
 243                         break;
 244                     int cc = c * 8 + ptr[i] - '0';
 245                     if (cc > 255)
 246                         break;
 247                     c = cc;
 248                 }
 249                 ptr += i - 1;
 250                 break;
 251             }
 252
 253             case 'x': {
 254                 c = 0;
 255                 int i;
 256                 for (i = 1; i <= 2; ++i) {
 257                     if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
 258                         c = 'x';
 259                         i = 1;
 260                         break;
 261                     }
 262                     int cc = ptr[i];
 263                     if (cc >= 'a')
 264                         cc -= 32;             /* Convert to upper case */
 265                     c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
 266                 }
 267                 ptr += i - 1;
 268                 break;
 269             }
 270
 271             case 'u': {
 272                 c = 0;
 273                 int i;
 274                 for (i = 1; i <= 4; ++i) {
 275                     if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
 276                         c = 'u';
 277                         i = 1;
 278                         break;
 279                     }
 280                     int cc = ptr[i];
 281                     if (cc >= 'a')
 282                         cc -= 32;             /* Convert to upper case */
 283                     c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
 284                 }
 285                 ptr += i - 1;
 286                 break;
 287             }
 288
 289             case 'c':
 290                 if (++ptr == patternEnd) {
 291                     *errorCodePtr = ERR2;
 292                     return 0;
 293                 }
 294
 295                 c = *ptr;
 296
 297                 /* To match Firefox, inside a character class, we also accept
 298                    numbers and '_' as control characters */
 299                 if ((!isClass && !isASCIIAlpha(c)) || (!isASCIIAlphanumeric(c) && c != '_')) {
 300                     c = '\\';
 301                     ptr -= 2;
 302                     break;
 303                 }
 304
 305                 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
 306                  is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
 307                 c = toASCIIUpper(c) ^ 0x40;
 308                 break;
 309             }
 310     }
 311
 312     *ptrPtr = ptr;
 313     return c;
 314 }
 315
 316 /*************************************************
 317 *            Check for counted repeat            *
 318 *************************************************/
 319
 320 /* This function is called when a '{' is encountered in a place where it might
 321 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 322 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 323 where the ddds are digits.
 324
 325 Arguments:
 326   p         pointer to the first char after '{'
 327
 328 Returns:    true or false
 329 */
 330
 331 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
 332 {
 333     if (p >= patternEnd || !isASCIIDigit(*p))
 334         return false;
 335     p++;
 336     while (p < patternEnd && isASCIIDigit(*p))
 337         p++;
 338     if (p < patternEnd && *p == '}')
 339         return true;
 340
 341     if (p >= patternEnd || *p++ != ',')
 342         return false;
 343     if (p < patternEnd && *p == '}')
 344         return true;
 345
 346     if (p >= patternEnd || !isASCIIDigit(*p))
 347         return false;
 348     p++;
 349     while (p < patternEnd && isASCIIDigit(*p))
 350         p++;
 351
 352     return (p < patternEnd && *p == '}');
 353 }
 354
 355 /*************************************************
 356 *         Read repeat counts                     *
 357 *************************************************/
 358
 359 /* Read an item of the form {n,m} and return the values. This is called only
 360 after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
 361 so the syntax is guaranteed to be correct, but we need to check the values.
 362
 363 Arguments:
 364   p              pointer to first char after '{'
 365   minp           pointer to int for min
 366   maxp           pointer to int for max
 367                  returned as -1 if no max
 368   errorCodePtr   points to error code variable
 369
 370 Returns:         pointer to '}' on success;
 371                  current ptr on error, with errorCodePtr set non-zero
 372 */
 373
 374 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorCodePtr)
 375 {
 376     int min = 0;
 377     int max = -1;
 378
 379     /* Read the minimum value and do a paranoid check: a negative value indicates
 380      an integer overflow. */
 381
 382     while (isASCIIDigit(*p))
 383         min = min * 10 + *p++ - '0';
 384     if (min < 0 || min > 65535) {
 385         *errorCodePtr = ERR5;
 386         return p;
 387     }
 388
 389     /* Read the maximum value if there is one, and again do a paranoid on its size.
 390      Also, max must not be less than min. */
 391
 392     if (*p == '}')
 393         max = min;
 394     else {
 395         if (*(++p) != '}') {
 396             max = 0;
 397             while (isASCIIDigit(*p))
 398                 max = max * 10 + *p++ - '0';
 399             if (max < 0 || max > 65535) {
 400                 *errorCodePtr = ERR5;
 401                 return p;
 402             }
 403             if (max < min) {
 404                 *errorCodePtr = ERR4;
 405                 return p;
 406             }
 407         }
 408     }
 409
 410     /* Fill in the required variables, and pass back the pointer to the terminating
 411      '}'. */
 412
 413     *minp = min;
 414     *maxp = max;
 415     return p;
 416 }
 417
 418 /*************************************************
 419 *      Find first significant op code            *
 420 *************************************************/
 421
 422 /* This is called by several functions that scan a compiled expression looking
 423 for a fixed first character, or an anchoring op code etc. It skips over things
 424 that do not influence this.
 425
 426 Arguments:
 427   code         pointer to the start of the group
 428 Returns:       pointer to the first significant opcode
 429 */
 430
 431 static const unsigned char* firstSignificantOpcode(const unsigned char* code)
 432 {
 433     while (*code == OP_BRANUMBER)
 434         code += 3;
 435     return code;
 436 }
 437
 438 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
 439 {
 440     while (true) {
 441         switch (*code) {
 442             case OP_ASSERT_NOT:
 443                 advanceToEndOfBracket(code);
 444                 code += 1 + LINK_SIZE;
 445                 break;
 446             case OP_WORD_BOUNDARY:
 447             case OP_NOT_WORD_BOUNDARY:
 448                 ++code;
 449                 break;
 450             case OP_BRANUMBER:
 451                 code += 3;
 452                 break;
 453             default:
 454                 return code;
 455         }
 456     }
 457 }
 458
 459 /*************************************************
 460 *           Get othercase range                  *
 461 *************************************************/
 462
 463 /* This function is passed the start and end of a class range, in UTF-8 mode
 464 with UCP support. It searches up the characters, looking for internal ranges of
 465 characters in the "other" case. Each call returns the next one, updating the
 466 start address.
 467
 468 Arguments:
 469   cptr        points to starting character value; updated
 470   d           end value
 471   ocptr       where to put start of othercase range
 472   odptr       where to put end of othercase range
 473
 474 Yield:        true when range returned; false when no more
 475 */
 476
 477 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
 478 {
 479     int c, othercase = 0;
 480
 481     for (c = *cptr; c <= d; c++) {
 482         if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0)
 483             break;
 484     }
 485
 486     if (c > d)
 487         return false;
 488
 489     *ocptr = othercase;
 490     int next = othercase + 1;
 491
 492     for (++c; c <= d; c++) {
 493         if (jsc_pcre_ucp_othercase(c) != next)
 494             break;
 495         next++;
 496     }
 497
 498     *odptr = next - 1;
 499     *cptr = c;
 500
 501     return true;
 502 }
 503
 504 /*************************************************
 505  *       Convert character value to UTF-8         *
 506  *************************************************/
 507
 508 /* This function takes an integer value in the range 0 - 0x7fffffff
 509  and encodes it as a UTF-8 character in 0 to 6 bytes.
 510
 511  Arguments:
 512  cvalue     the character value
 513  buffer     pointer to buffer for result - at least 6 bytes long
 514
 515  Returns:     number of characters placed in the buffer
 516  */
 517
 518 static int encodeUTF8(int cvalue, unsigned char *buffer)
 519 {
 520     int i;
 521     for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
 522         if (cvalue <= jsc_pcre_utf8_table1[i])
 523             break;
 524     buffer += i;
 525     for (int j = i; j > 0; j--) {
 526         *buffer-- = 0x80 | (cvalue & 0x3f);
 527         cvalue >>= 6;
 528     }
 529     *buffer = jsc_pcre_utf8_table2[i] | cvalue;
 530     return i + 1;
 531 }
 532
 533 /*************************************************
 534 *           Compile one branch                   *
 535 *************************************************/
 536
 537 /* Scan the pattern, compiling it into the code vector.
 538
 539 Arguments:
 540   options        the option bits
 541   brackets       points to number of extracting brackets used
 542   codePtr        points to the pointer to the current code point
 543   ptrPtr         points to the current pattern pointer
 544   errorCodePtr   points to error code variable
 545   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
 546   reqbyteptr     set to the last literal character required, else < 0
 547   cd             contains pointers to tables etc.
 548
 549 Returns:         true on success
 550                  false, with *errorCodePtr set non-zero on error
 551 */
 552
 553 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
 554 {
 555     return ((ptr + 1 < patternEnd) && ptr[1] == expected);
 556 }
 557
 558 static bool
 559 compileBranch(int options, int* brackets, unsigned char** codePtr,
 560                const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int *firstbyteptr,
 561                int* reqbyteptr, CompileData& cd)
 562 {
 563     int repeatType, opType;
 564     int repeatMin = 0, repeat_max = 0;      /* To please picky compilers */
 565     int bravalue = 0;
 566     int reqvary, tempreqvary;
 567     int c;
 568     unsigned char* code = *codePtr;
 569     unsigned char* tempcode;
 570     bool didGroupSetFirstByte = false;
 571     const UChar* ptr = *ptrPtr;
 572     const UChar* tempptr;
 573     unsigned char* previous = NULL;
 574     unsigned char classbits[32];
 575
 576     bool class_utf8;
 577     unsigned char* class_utf8data;
 578     unsigned char utf8_char[6];
 579
 580     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
 581      matching encountered yet". It gets changed to REQ_NONE if we hit something that
 582      matches a non-fixed char first char; reqByte just remains unset if we never
 583      find one.
 584
 585      When we hit a repeat whose minimum is zero, we may have to adjust these values
 586      to take the zero repeat into account. This is implemented by setting them to
 587      zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual
 588      item types that can be repeated set these backoff variables appropriately. */
 589
 590     int firstByte = REQ_UNSET;
 591     int reqByte = REQ_UNSET;
 592     int zeroReqByte = REQ_UNSET;
 593     int zeroFirstByte = REQ_UNSET;
 594
 595     /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero,
 596      according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
 597      value > 255. It is added into the firstByte or reqByte variables to record the
 598      case status of the value. This is used only for ASCII characters. */
 599
 600     int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
 601
 602     /* Switch on next character until the end of the branch */
 603
 604     for (;; ptr++) {
 605         bool negateClass;
 606         bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
 607         int classCharCount;
 608         int classLastChar;
 609         int skipBytes;
 610         int subReqByte;
 611         int subFirstByte;
 612         int mcLength;
 613         unsigned char mcbuffer[8];
 614
 615         /* Next byte in the pattern */
 616
 617         c = ptr < patternEnd ? *ptr : 0;
 618
 619         /* Fill in length of a previous callout, except when the next thing is
 620          a quantifier. */
 621
 622         bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));
 623
 624         switch (c) {
 625             /* The branch terminates at end of string, |, or ). */
 626
 627             case 0:
 628                 if (ptr < patternEnd)
 629                     goto NORMAL_CHAR;
 630                 // End of string; fall through
 631             case '|':
 632             case ')':
 633                 *firstbyteptr = firstByte;
 634                 *reqbyteptr = reqByte;
 635                 *codePtr = code;
 636                 *ptrPtr = ptr;
 637                 return true;
 638
 639             /* Handle single-character metacharacters. In multiline mode, ^ disables
 640              the setting of any following char as a first character. */
 641
 642             case '^':
 643                 if (options & MatchAcrossMultipleLinesOption) {
 644                     if (firstByte == REQ_UNSET)
 645                         firstByte = REQ_NONE;
 646                     *code++ = OP_BOL;
 647                 } else
 648                     *code++ = OP_CIRC;
 649                 previous = NULL;
 650                 break;
 651
 652             case '$':
 653                 previous = NULL;
 654                 if (options & MatchAcrossMultipleLinesOption)
 655                   *code++ = OP_EOL;
 656                 else
 657                   *code++ = OP_DOLL;
 658                 break;
 659
 660             /* There can never be a first char if '.' is first, whatever happens about
 661              repeats. The value of reqByte doesn't change either. */
 662
 663             case '.':
 664                 if (firstByte == REQ_UNSET)
 665                     firstByte = REQ_NONE;
 666                 zeroFirstByte = firstByte;
 667                 zeroReqByte = reqByte;
 668                 previous = code;
 669                 *code++ = OP_NOT_NEWLINE;
 670                 break;
 671
 672             /* Character classes. If the included characters are all < 256, we build a
 673              32-byte bitmap of the permitted characters, except in the special case
 674              where there is only one such character. For negated classes, we build the
 675              map as usual, then invert it at the end. However, we use a different opcode
 676              so that data characters > 255 can be handled correctly.
 677
 678              If the class contains characters outside the 0-255 range, a different
 679              opcode is compiled. It may optionally have a bit map for characters < 256,
 680              but those above are are explicitly listed afterwards. A flag byte tells
 681              whether the bitmap is present, and whether this is a negated class or not.
 682              */
 683
 684             case '[': {
 685                 previous = code;
 686                 shouldFlipNegation = false;
 687
 688                 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
 689                  they are encountered at the top level, so we'll do that too. */
 690
 691                 /* If the first character is '^', set the negation flag and skip it. */
 692
 693                 if (ptr + 1 >= patternEnd) {
 694                     *errorCodePtr = ERR6;
 695                     return false;
 696                 }
 697
 698                 if (ptr[1] == '^') {
 699                     negateClass = true;
 700                     ++ptr;
 701                 } else
 702                     negateClass = false;
 703
 704                 /* Keep a count of chars with values < 256 so that we can optimize the case
 705                  of just a single character (as long as it's < 256). For higher valued UTF-8
 706                  characters, we don't yet do any optimization. */
 707
 708                 classCharCount = 0;
 709                 classLastChar = -1;
 710
 711                 class_utf8 = false;                       /* No chars >= 256 */
 712                 class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
 713
 714                 /* Initialize the 32-char bit map to all zeros. We have to build the
 715                  map in a temporary bit of store, in case the class contains only 1
 716                  character (< 256), because in that case the compiled code doesn't use the
 717                  bit map. */
 718
 719                 memset(classbits, 0, 32 * sizeof(unsigned char));
 720
 721                 /* Process characters until ] is reached. The first pass
 722                  through the regex checked the overall syntax, so we don't need to be very
 723                  strict here. At the start of the loop, c contains the first byte of the
 724                  character. */
 725
 726                 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
 727                     /* Backslash may introduce a single character, or it may introduce one
 728                      of the specials, which just set a flag. Escaped items are checked for
 729                      validity in the pre-compiling pass. The sequence \b is a special case.
 730                      Inside a class (and only there) it is treated as backspace. Elsewhere
 731                      it marks a word boundary. Other escapes have preset maps ready to
 732                      or into the one we are building. We assume they have more than one
 733                      character in them, so set classCharCount bigger than one. */
 734
 735                     if (c == '\\') {
 736                         c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
 737                         if (c < 0) {
 738                             classCharCount += 2;     /* Greater than 1 is what matters */
 739                             switch (-c) {
 740                                 case ESC_d:
 741                                     for (c = 0; c < 32; c++)
 742                                         classbits[c] |= classBitmapForChar(c + cbit_digit);
 743                                     continue;
 744
 745                                 case ESC_D:
 746                                     shouldFlipNegation = true;
 747                                     for (c = 0; c < 32; c++)
 748                                         classbits[c] |= ~classBitmapForChar(c + cbit_digit);
 749                                     continue;
 750
 751                                 case ESC_w:
 752                                     for (c = 0; c < 32; c++)
 753                                         classbits[c] |= classBitmapForChar(c + cbit_word);
 754                                     continue;
 755
 756                                 case ESC_W:
 757                                     shouldFlipNegation = true;
 758                                     for (c = 0; c < 32; c++)
 759                                         classbits[c] |= ~classBitmapForChar(c + cbit_word);
 760                                     continue;
 761
 762                                 case ESC_s:
 763                                     for (c = 0; c < 32; c++)
 764                                          classbits[c] |= classBitmapForChar(c + cbit_space);
 765                                     continue;
 766
 767                                 case ESC_S:
 768                                     shouldFlipNegation = true;
 769                                     for (c = 0; c < 32; c++)
 770                                          classbits[c] |= ~classBitmapForChar(c + cbit_space);
 771                                     continue;
 772
 773                                     /* Unrecognized escapes are faulted if PCRE is running in its
 774                                      strict mode. By default, for compatibility with Perl, they are
 775                                      treated as literals. */
 776
 777                                 default:
 778                                     c = *ptr;              /* The final character */
 779                                     classCharCount -= 2;  /* Undo the default count from above */
 780                             }
 781                         }
 782
 783                         /* Fall through if we have a single character (c >= 0). This may be
 784                          > 256 in UTF-8 mode. */
 785
 786                     }   /* End of backslash handling */
 787
 788                     /* A single character may be followed by '-' to form a range. However,
 789                      Perl does not permit ']' to be the end of the range. A '-' character
 790                      here is treated as a literal. */
 791
 792                     if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
 793                         ptr += 2;
 794
 795                         int d = *ptr;
 796
 797                         /* The second part of a range can be a single-character escape, but
 798                          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
 799                          in such circumstances. */
 800
 801                         if (d == '\\') {
 802                             const UChar* oldptr = ptr;
 803                             d = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
 804
 805                             /* \X is literal X; any other special means the '-' was literal */
 806                             if (d < 0) {
 807                                 ptr = oldptr - 2;
 808                                 goto LONE_SINGLE_CHARACTER;  /* A few lines below */
 809                             }
 810                         }
 811
 812                         /* The check that the two values are in the correct order happens in
 813                          the pre-pass. Optimize one-character ranges */
 814
 815                         if (d == c)
 816                             goto LONE_SINGLE_CHARACTER;  /* A few lines below */
 817
 818                         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
 819                          matching, we have to use an XCLASS with extra data items. Caseless
 820                          matching for characters > 127 is available only if UCP support is
 821                          available. */
 822
 823                         if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
 824                             class_utf8 = true;
 825
 826                             /* With UCP support, we can find the other case equivalents of
 827                              the relevant characters. There may be several ranges. Optimize how
 828                              they fit with the basic range. */
 829
 830                             if (options & IgnoreCaseOption) {
 831                                 int occ, ocd;
 832                                 int cc = c;
 833                                 int origd = d;
 834                                 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
 835                                     if (occ >= c && ocd <= d)
 836                                         continue;  /* Skip embedded ranges */
 837
 838                                     if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
 839                                     {                                  /* if there is overlap,   */
 840                                         c = occ;                           /* noting that if occ < c */
 841                                         continue;                          /* we can't have ocd > d  */
 842                                     }                                  /* because a subrange is  */
 843                                     if (ocd > d && occ <= d + 1)         /* always shorter than    */
 844                                     {                                  /* the basic range.       */
 845                                         d = ocd;
 846                                         continue;
 847                                     }
 848
 849                                     if (occ == ocd)
 850                                         *class_utf8data++ = XCL_SINGLE;
 851                                     else {
 852                                         *class_utf8data++ = XCL_RANGE;
 853                                         class_utf8data += encodeUTF8(occ, class_utf8data);
 854                                     }
 855                                     class_utf8data += encodeUTF8(ocd, class_utf8data);
 856                                 }
 857                             }
 858
 859                             /* Now record the original range, possibly modified for UCP caseless
 860                              overlapping ranges. */
 861
 862                             *class_utf8data++ = XCL_RANGE;
 863                             class_utf8data += encodeUTF8(c, class_utf8data);
 864                             class_utf8data += encodeUTF8(d, class_utf8data);
 865
 866                             /* With UCP support, we are done. Without UCP support, there is no
 867                              caseless matching for UTF-8 characters > 127; we can use the bit map
 868                              for the smaller ones. */
 869
 870                             continue;    /* With next character in the class */
 871                         }
 872
 873                         /* We use the bit map for all cases when not in UTF-8 mode; else
 874                          ranges that lie entirely within 0-127 when there is UCP support; else
 875                          for partial ranges without UCP support. */
 876
 877                         for (; c <= d; c++) {
 878                             classbits[c/8] |= (1 << (c&7));
 879                             if (options & IgnoreCaseOption) {
 880                                 int uc = flipCase(c);
 881                                 classbits[uc/8] |= (1 << (uc&7));
 882                             }
 883                             classCharCount++;                /* in case a one-char range */
 884                             classLastChar = c;
 885                         }
 886
 887                         continue;   /* Go get the next char in the class */
 888                     }
 889
 890                     /* Handle a lone single character - we can get here for a normal
 891                      non-escape char, or after \ that introduces a single character or for an
 892                      apparent range that isn't. */
 893
 894                 LONE_SINGLE_CHARACTER:
 895
 896                     /* Handle a character that cannot go in the bit map */
 897
 898                     if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
 899                         class_utf8 = true;
 900                         *class_utf8data++ = XCL_SINGLE;
 901                         class_utf8data += encodeUTF8(c, class_utf8data);
 902
 903                         if (options & IgnoreCaseOption) {
 904                             int othercase;
 905                             if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0) {
 906                                 *class_utf8data++ = XCL_SINGLE;
 907                                 class_utf8data += encodeUTF8(othercase, class_utf8data);
 908                             }
 909                         }
 910                     } else {
 911                         /* Handle a single-byte character */
 912                         classbits[c/8] |= (1 << (c&7));
 913                         if (options & IgnoreCaseOption) {
 914                             c = flipCase(c);
 915                             classbits[c/8] |= (1 << (c&7));
 916                         }
 917                         classCharCount++;
 918                         classLastChar = c;
 919                     }
 920                 }
 921
 922                 /* If classCharCount is 1, we saw precisely one character whose value is
 923                  less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
 924                  can optimize the negative case only if there were no characters >= 128
 925                  because OP_NOT and the related opcodes like OP_NOTSTAR operate on
 926                  single-bytes only. This is an historical hangover. Maybe one day we can
 927                  tidy these opcodes to handle multi-byte characters.
 928
 929                  The optimization throws away the bit map. We turn the item into a
 930                  1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
 931                  that OP_NOT does not support multibyte characters. In the positive case, it
 932                  can cause firstByte to be set. Otherwise, there can be no first char if
 933                  this item is first, whatever repeat count may follow. In the case of
 934                  reqByte, save the previous value for reinstating. */
 935
 936                 if (classCharCount == 1 && (!class_utf8 && (!negateClass || classLastChar < 128))) {
 937                     zeroReqByte = reqByte;
 938
 939                     /* The OP_NOT opcode works on one-byte characters only. */
 940
 941                     if (negateClass) {
 942                         if (firstByte == REQ_UNSET)
 943                             firstByte = REQ_NONE;
 944                         zeroFirstByte = firstByte;
 945                         *code++ = OP_NOT;
 946                         *code++ = classLastChar;
 947                         break;
 948                     }
 949
 950                     /* For a single, positive character, get the value into c, and
 951                      then we can handle this with the normal one-character code. */
 952
 953                     c = classLastChar;
 954                     goto NORMAL_CHAR;
 955                 }       /* End of 1-char optimization */
 956
 957                 /* The general case - not the one-char optimization. If this is the first
 958                  thing in the branch, there can be no first char setting, whatever the
 959                  repeat count. Any reqByte setting must remain unchanged after any kind of
 960                  repeat. */
 961
 962                 if (firstByte == REQ_UNSET) firstByte = REQ_NONE;
 963                 zeroFirstByte = firstByte;
 964                 zeroReqByte = reqByte;
 965
 966                 /* If there are characters with values > 255, we have to compile an
 967                  extended class, with its own opcode. If there are no characters < 256,
 968                  we can omit the bitmap. */
 969
 970                 if (class_utf8 && !shouldFlipNegation) {
 971                     *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
 972                     *code++ = OP_XCLASS;
 973                     code += LINK_SIZE;
 974                     *code = negateClass? XCL_NOT : 0;
 975
 976                     /* If the map is required, install it, and move on to the end of
 977                      the extra data */
 978
 979                     if (classCharCount > 0) {
 980                         *code++ |= XCL_MAP;
 981                         memcpy(code, classbits, 32);
 982                         code = class_utf8data;
 983                     }
 984
 985                     /* If the map is not required, slide down the extra data. */
 986
 987                     else {
 988                         int len = class_utf8data - (code + 33);
 989                         memmove(code + 1, code + 33, len);
 990                         code += len + 1;
 991                     }
 992
 993                     /* Now fill in the complete length of the item */
 994
 995                     putLinkValue(previous + 1, code - previous);
 996                     break;   /* End of class handling */
 997                 }
 998
 999                 /* If there are no characters > 255, negate the 32-byte map if necessary,
1000                  and copy it into the code vector. If this is the first thing in the branch,
1001                  there can be no first char setting, whatever the repeat count. Any reqByte
1002                  setting must remain unchanged after any kind of repeat. */
1003
1004                 *code++ = (negateClass == shouldFlipNegation) ? OP_CLASS : OP_NCLASS;
1005                 if (negateClass)
1006                     for (c = 0; c < 32; c++)
1007                         code[c] = ~classbits[c];
1008                 else
1009                     memcpy(code, classbits, 32);
1010                 code += 32;
1011                 break;
1012             }
1013
1014             /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1015              has been tested above. */
1016
1017             case '{':
1018                 if (!isQuantifier)
1019                     goto NORMAL_CHAR;
1020                 ptr = readRepeatCounts(ptr + 1, &repeatMin, &repeat_max, errorCodePtr);
1021                 if (*errorCodePtr)
1022                     goto FAILED;
1023                 goto REPEAT;
1024
1025             case '*':
1026                 repeatMin = 0;
1027                 repeat_max = -1;
1028                 goto REPEAT;
1029
1030             case '+':
1031                 repeatMin = 1;
1032                 repeat_max = -1;
1033                 goto REPEAT;
1034
1035             case '?':
1036                 repeatMin = 0;
1037                 repeat_max = 1;
1038
1039             REPEAT:
1040                 if (!previous) {
1041                     *errorCodePtr = ERR9;
1042                     goto FAILED;
1043                 }
1044
1045                 if (repeatMin == 0) {
1046                     firstByte = zeroFirstByte;    /* Adjust for zero repeat */
1047                     reqByte = zeroReqByte;        /* Ditto */
1048                 }
1049
1050                 /* Remember whether this is a variable length repeat */
1051
1052                 reqvary = (repeatMin == repeat_max) ? 0 : REQ_VARY;
1053
1054                 opType = 0;                    /* Default single-char op codes */
1055
1056                 /* Save start of previous item, in case we have to move it up to make space
1057                  for an inserted OP_ONCE for the additional '+' extension. */
1058                 /* FIXME: Probably don't need this because we don't use OP_ONCE. */
1059
1060                 tempcode = previous;
1061
1062                 /* If the next character is '+', we have a possessive quantifier. This
1063                  implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1064                  If the next character is '?' this is a minimizing repeat, by default,
1065                  but if PCRE_UNGREEDY is set, it works the other way round. We change the
1066                  repeat type to the non-default. */
1067
1068                 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
1069                     repeatType = 1;
1070                     ptr++;
1071                 } else
1072                     repeatType = 0;
1073
1074                 /* If previous was a character match, abolish the item and generate a
1075                  repeat item instead. If a char item has a minumum of more than one, ensure
1076                  that it is set in reqByte - it might not be if a sequence such as x{3} is
1077                  the first thing in a branch because the x will have gone into firstByte
1078                  instead.  */
1079
1080                 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
1081                     /* Deal with UTF-8 characters that take up more than one byte. It's
1082                      easier to write this out separately than try to macrify it. Use c to
1083                      hold the length of the character in bytes, plus 0x80 to flag that it's a
1084                      length rather than a small character. */
1085
1086                     if (code[-1] & 0x80) {
1087                         unsigned char *lastchar = code - 1;
1088                         while((*lastchar & 0xc0) == 0x80)
1089                             lastchar--;
1090                         c = code - lastchar;            /* Length of UTF-8 character */
1091                         memcpy(utf8_char, lastchar, c); /* Save the char */
1092                         c |= 0x80;                      /* Flag c as a length */
1093                     }
1094                     else {
1095                         c = code[-1];
1096                         if (repeatMin > 1)
1097                             reqByte = c | reqCaseOpt | cd.reqVaryOpt;
1098                     }
1099
1100                     goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1101                 }
1102
1103                 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
1104                     c = previous[1];
1105                     if (repeatMin > 1)
1106                         reqByte = c | reqCaseOpt | cd.reqVaryOpt;
1107                     goto OUTPUT_SINGLE_REPEAT;
1108                 }
1109
1110                 /* If previous was a single negated character ([^a] or similar), we use
1111                  one of the special opcodes, replacing it. The code is shared with single-
1112                  character repeats by setting opt_type to add a suitable offset into
1113                  repeatType. OP_NOT is currently used only for single-byte chars. */
1114
1115                 else if (*previous == OP_NOT) {
1116                     opType = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1117                     c = previous[1];
1118                     goto OUTPUT_SINGLE_REPEAT;
1119                 }
1120
1121                 /* If previous was a character type match (\d or similar), abolish it and
1122                  create a suitable repeat item. The code is shared with single-character
1123                  repeats by setting opType to add a suitable offset into repeatType. */
1124
1125                 else if (*previous <= OP_NOT_NEWLINE) {
1126                     opType = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1127                     c = *previous;
1128
1129                 OUTPUT_SINGLE_REPEAT:
1130                     int prop_type = -1;
1131                     int prop_value = -1;
1132
1133                     unsigned char* oldcode = code;
1134                     code = previous;                  /* Usually overwrite previous item */
1135
1136                     /* If the maximum is zero then the minimum must also be zero; Perl allows
1137                      this case, so we do too - by simply omitting the item altogether. */
1138
1139                     if (repeat_max == 0)
1140                         goto END_REPEAT;
1141
1142                     /* Combine the opType with the repeatType */
1143
1144                     repeatType += opType;
1145
1146                     /* A minimum of zero is handled either as the special case * or ?, or as
1147                      an UPTO, with the maximum given. */
1148
1149                     if (repeatMin == 0) {
1150                         if (repeat_max == -1)
1151                             *code++ = OP_STAR + repeatType;
1152                         else if (repeat_max == 1)
1153                             *code++ = OP_QUERY + repeatType;
1154                         else {
1155                             *code++ = OP_UPTO + repeatType;
1156                             put2ByteValueAndAdvance(code, repeat_max);
1157                         }
1158                     }
1159
1160                     /* A repeat minimum of 1 is optimized into some special cases. If the
1161                      maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
1162                      left in place and, if the maximum is greater than 1, we use OP_UPTO with
1163                      one less than the maximum. */
1164
1165                     else if (repeatMin == 1) {
1166                         if (repeat_max == -1)
1167                             *code++ = OP_PLUS + repeatType;
1168                         else {
1169                             code = oldcode;                 /* leave previous item in place */
1170                             if (repeat_max == 1)
1171                                 goto END_REPEAT;
1172                             *code++ = OP_UPTO + repeatType;
1173                             put2ByteValueAndAdvance(code, repeat_max - 1);
1174                         }
1175                     }
1176
1177                     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1178                      handled as an EXACT followed by an UPTO. */
1179
1180                     else {
1181                         *code++ = OP_EXACT + opType;  /* NB EXACT doesn't have repeatType */
1182                         put2ByteValueAndAdvance(code, repeatMin);
1183
1184                         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
1185                          we have to insert the character for the previous code. For a repeated
1186                          Unicode property match, there are two extra bytes that define the
1187                          required property. In UTF-8 mode, long characters have their length in
1188                          c, with the 0x80 bit as a flag. */
1189
1190                         if (repeat_max < 0) {
1191                             if (c >= 128) {
1192                                 memcpy(code, utf8_char, c & 7);
1193                                 code += c & 7;
1194                             } else {
1195                                 *code++ = c;
1196                                 if (prop_type >= 0) {
1197                                     *code++ = prop_type;
1198                                     *code++ = prop_value;
1199                                 }
1200                             }
1201                             *code++ = OP_STAR + repeatType;
1202                         }
1203
1204                         /* Else insert an UPTO if the max is greater than the min, again
1205                          preceded by the character, for the previously inserted code. */
1206
1207                         else if (repeat_max != repeatMin) {
1208                             if (c >= 128) {
1209                                 memcpy(code, utf8_char, c & 7);
1210                                 code += c & 7;
1211                             } else
1212                                 *code++ = c;
1213                             if (prop_type >= 0) {
1214                                 *code++ = prop_type;
1215                                 *code++ = prop_value;
1216                             }
1217                             repeat_max -= repeatMin;
1218                             *code++ = OP_UPTO + repeatType;
1219                             put2ByteValueAndAdvance(code, repeat_max);
1220                         }
1221                     }
1222
1223                     /* The character or character type itself comes last in all cases. */
1224
1225                     if (c >= 128) {
1226                         memcpy(code, utf8_char, c & 7);
1227                         code += c & 7;
1228                     } else
1229                         *code++ = c;
1230
1231                     /* For a repeated Unicode property match, there are two extra bytes that
1232                      define the required property. */
1233
1234                     if (prop_type >= 0) {
1235                         *code++ = prop_type;
1236                         *code++ = prop_value;
1237                     }
1238                 }
1239
1240                 /* If previous was a character class or a back reference, we put the repeat
1241                  stuff after it, but just skip the item if the repeat was {0,0}. */
1242
1243                 else if (*previous == OP_CLASS ||
1244                          *previous == OP_NCLASS ||
1245                          *previous == OP_XCLASS ||
1246                          *previous == OP_REF)
1247                 {
1248                     if (repeat_max == 0) {
1249                         code = previous;
1250                         goto END_REPEAT;
1251                     }
1252
1253                     if (repeatMin == 0 && repeat_max == -1)
1254                         *code++ = OP_CRSTAR + repeatType;
1255                     else if (repeatMin == 1 && repeat_max == -1)
1256                         *code++ = OP_CRPLUS + repeatType;
1257                     else if (repeatMin == 0 && repeat_max == 1)
1258                         *code++ = OP_CRQUERY + repeatType;
1259                     else {
1260                         *code++ = OP_CRRANGE + repeatType;
1261                         put2ByteValueAndAdvance(code, repeatMin);
1262                         if (repeat_max == -1)
1263                             repeat_max = 0;  /* 2-byte encoding for max */
1264                         put2ByteValueAndAdvance(code, repeat_max);
1265                     }
1266                 }
1267
1268                 /* If previous was a bracket group, we may have to replicate it in certain
1269                  cases. */
1270
1271                 else if (*previous >= OP_BRA) {
1272                     int ketoffset = 0;
1273                     int len = code - previous;
1274                     unsigned char* bralink = NULL;
1275                     int nested = get2ByteValue(previous + 1 + LINK_SIZE);
1276
1277                     /* If the maximum repeat count is unlimited, find the end of the bracket
1278                      by scanning through from the start, and compute the offset back to it
1279                      from the current code pointer. There may be an OP_OPT setting following
1280                      the final KET, so we can't find the end just by going back from the code
1281                      pointer. */
1282
1283                     if (repeat_max == -1) {
1284                         const unsigned char* ket = previous;
1285                         advanceToEndOfBracket(ket);
1286                         ketoffset = code - ket;
1287                     }
1288
1289                     /* The case of a zero minimum is special because of the need to stick
1290                      OP_BRAZERO in front of it, and because the group appears once in the
1291                      data, whereas in other cases it appears the minimum number of times. For
1292                      this reason, it is simplest to treat this case separately, as otherwise
1293                      the code gets far too messy. There are several special subcases when the
1294                      minimum is zero. */
1295
1296                     if (repeatMin == 0) {
1297                         /* If the maximum is also zero, we just omit the group from the output
1298                          altogether. */
1299
1300                         if (repeat_max == 0) {
1301                             code = previous;
1302                             goto END_REPEAT;
1303                         }
1304
1305                         /* If the maximum is 1 or unlimited, we just have to stick in the
1306                          BRAZERO and do no more at this point. However, we do need to adjust
1307                          any OP_RECURSE calls inside the group that refer to the group itself or
1308                          any internal group, because the offset is from the start of the whole
1309                          regex. Temporarily terminate the pattern while doing this. */
1310
1311                         if (repeat_max <= 1) {
1312                             *code = OP_END;
1313                             memmove(previous+1, previous, len);
1314                             code++;
1315                             *previous++ = OP_BRAZERO + repeatType;
1316                         }
1317
1318                         /* If the maximum is greater than 1 and limited, we have to replicate
1319                          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1320                          The first one has to be handled carefully because it's the original
1321                          copy, which has to be moved up. The remainder can be handled by code
1322                          that is common with the non-zero minimum case below. We have to
1323                          adjust the value of repeat_max, since one less copy is required. */
1324
1325                         else {
1326                             *code = OP_END;
1327                             memmove(previous + 4 + LINK_SIZE, previous, len);
1328                             code += 4 + LINK_SIZE;
1329                             *previous++ = OP_BRAZERO + repeatType;
1330                             *previous++ = OP_BRA;
1331
1332                             /* We chain together the bracket offset fields that have to be
1333                              filled in later when the ends of the brackets are reached. */
1334
1335                             int offset = (!bralink) ? 0 : previous - bralink;
1336                             bralink = previous;
1337                             putLinkValueAllowZeroAndAdvance(previous, offset);
1338                             put2ByteValueAndAdvance(previous, nested);
1339                         }
1340
1341                         repeat_max--;
1342                     }
1343
1344                     /* If the minimum is greater than zero, replicate the group as many
1345                      times as necessary, and adjust the maximum to the number of subsequent
1346                      copies that we need. If we set a first char from the group, and didn't
1347                      set a required char, copy the latter from the former. */
1348
1349                     else {
1350                         if (repeatMin > 1) {
1351                             if (didGroupSetFirstByte && reqByte < 0)
1352                                 reqByte = firstByte;
1353                             for (int i = 1; i < repeatMin; i++) {
1354                                 memcpy(code, previous, len);
1355                                 code += len;
1356                             }
1357                         }
1358                         if (repeat_max > 0)
1359                             repeat_max -= repeatMin;
1360                     }
1361
1362                     /* This code is common to both the zero and non-zero minimum cases. If
1363                      the maximum is limited, it replicates the group in a nested fashion,
1364                      remembering the bracket starts on a stack. In the case of a zero minimum,
1365                      the first one was set up above. In all cases the repeat_max now specifies
1366                      the number of additional copies needed. */
1367
1368                     if (repeat_max >= 0) {
1369                         for (int i = repeat_max - 1; i >= 0; i--) {
1370                             *code++ = OP_BRAZERO + repeatType;
1371
1372                             /* All but the final copy start a new nesting, maintaining the
1373                              chain of brackets outstanding. */
1374
1375                             if (i != 0) {
1376                                 *code++ = OP_BRA;
1377                                 int offset = (!bralink) ? 0 : code - bralink;
1378                                 bralink = code;
1379                                 putLinkValueAllowZeroAndAdvance(code, offset);
1380                                 put2ByteValueAndAdvance(code, nested);
1381                             }
1382
1383                             memcpy(code, previous, len);
1384                             code += len;
1385                         }
1386
1387                         /* Now chain through the pending brackets, and fill in their length
1388                          fields (which are holding the chain links pro tem). */
1389
1390                         while (bralink) {
1391                             int offset = code - bralink + 1;
1392                             unsigned char* bra = code - offset;
1393                             int oldlinkoffset = getLinkValueAllowZero(bra + 1);
1394                             bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset;
1395                             *code++ = OP_KET;
1396                             putLinkValueAndAdvance(code, offset);
1397                             putLinkValue(bra + 1, offset);
1398                         }
1399                     }
1400
1401                     /* If the maximum is unlimited, set a repeater in the final copy. We
1402                      can't just offset backwards from the current code point, because we
1403                      don't know if there's been an options resetting after the ket. The
1404                      correct offset was computed above. */
1405
1406                     else
1407                         code[-ketoffset] = OP_KETRMAX + repeatType;
1408                 }
1409
1410                 // A quantifier after an assertion is mostly meaningless, but it
1411                 // can nullify the assertion if it has a 0 minimum.
1412                 else if (*previous == OP_ASSERT || *previous == OP_ASSERT_NOT) {
1413                     if (repeatMin == 0) {
1414                         code = previous;
1415                         goto END_REPEAT;
1416                     }
1417                 }
1418
1419                 /* Else there's some kind of shambles */
1420
1421                 else {
1422                     *errorCodePtr = ERR11;
1423                     goto FAILED;
1424                 }
1425
1426                 /* In all case we no longer have a previous item. We also set the
1427                  "follows varying string" flag for subsequently encountered reqbytes if
1428                  it isn't already set and we have just passed a varying length item. */
1429
1430             END_REPEAT:
1431                 previous = NULL;
1432                 cd.reqVaryOpt |= reqvary;
1433                 break;
1434
1435             /* Start of nested bracket sub-expression, or comment or lookahead or
1436              lookbehind or option setting or condition. First deal with special things
1437              that can come after a bracket; all are introduced by ?, and the appearance
1438              of any of them means that this is not a referencing group. They were
1439              checked for validity in the first pass over the string, so we don't have to
1440              check for syntax errors here.  */
1441
1442             case '(':
1443             {
1444                 skipBytes = 2;
1445                 unsigned minBracket = *brackets + 1;
1446                 if (*(++ptr) == '?') {
1447                     switch (*(++ptr)) {
1448                         case ':':                 /* Non-extracting bracket */
1449                             bravalue = OP_BRA;
1450                             ptr++;
1451                             break;
1452
1453                         case '=':                 /* Positive lookahead */
1454                             bravalue = OP_ASSERT;
1455                             ptr++;
1456                             break;
1457
1458                         case '!':                 /* Negative lookahead */
1459                             bravalue = OP_ASSERT_NOT;
1460                             ptr++;
1461                             break;
1462
1463                         /* Character after (? not specially recognized */
1464
1465                         default:
1466                             *errorCodePtr = ERR12;
1467                             goto FAILED;
1468                         }
1469                 }
1470
1471                 /* Else we have a referencing group; adjust the opcode. If the bracket
1472                  number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1473                  arrange for the true number to follow later, in an OP_BRANUMBER item. */
1474
1475                 else {
1476                     if (++(*brackets) > EXTRACT_BASIC_MAX) {
1477                         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1478                         code[3 + LINK_SIZE] = OP_BRANUMBER;
1479                         put2ByteValue(code + 4 + LINK_SIZE, *brackets);
1480                         skipBytes = 5;
1481                     }
1482                     else
1483                         bravalue = OP_BRA + *brackets;
1484                 }
1485
1486                 /* Process nested bracketed re. We copy code into a non-variable
1487                  in order to be able to pass its address because some compilers
1488                  complain otherwise. Pass in a new setting for the ims options
1489                  if they have changed. */
1490
1491                 previous = code;
1492                 *code = bravalue;
1493                 tempcode = code;
1494                 tempreqvary = cd.reqVaryOpt;     /* Save value before bracket */
1495                 {
1496                     unsigned bracketsBeforeRecursion = *brackets;
1497                     if (!compileBracket(
1498                                        options,
1499                                        brackets,                     /* Extracting bracket count */
1500                                        &tempcode,                    /* Where to put code (updated) */
1501                                        &ptr,                         /* Input pointer (updated) */
1502                                        patternEnd,
1503                                        errorCodePtr,                 /* Where to put an error message */
1504                                        skipBytes,                    /* Skip over OP_BRANUMBER */
1505                                        &subFirstByte,                /* For possible first char */
1506                                        &subReqByte,                  /* For possible last char */
1507                                        cd))                          /* Tables block */
1508                         goto FAILED;
1509                     unsigned enclosedBrackets = (*brackets - bracketsBeforeRecursion);
1510                     unsigned limitBracket = minBracket + enclosedBrackets + (bravalue > OP_BRA);
1511                     if (!((minBracket & 0xff) == minBracket && (limitBracket & 0xff) == limitBracket)) {
1512                         *errorCodePtr = ERR17;
1513                         return false;
1514                     }
1515                     JS_ASSERT(minBracket <= limitBracket);
1516                     put2ByteValue(code + 1 + LINK_SIZE, minBracket << 8 | limitBracket);
1517                 }
1518
1519                 /* At the end of compiling, code is still pointing to the start of the
1520                  group, while tempcode has been updated to point past the end of the group
1521                  and any option resetting that may follow it. The pattern pointer (ptr)
1522                  is on the bracket. */
1523
1524                 /* Handle updating of the required and first characters. Update for normal
1525                  brackets of all kinds, and conditions with two branches (see code above).
1526                  If the bracket is followed by a quantifier with zero repeat, we have to
1527                  back off. Hence the definition of zeroReqByte and zeroFirstByte outside the
1528                  main loop so that they can be accessed for the back off. */
1529
1530                 zeroReqByte = reqByte;
1531                 zeroFirstByte = firstByte;
1532                 didGroupSetFirstByte = false;
1533
1534                 if (bravalue >= OP_BRA) {
1535                     /* If we have not yet set a firstByte in this branch, take it from the
1536                      subpattern, remembering that it was set here so that a repeat of more
1537                      than one can replicate it as reqByte if necessary. If the subpattern has
1538                      no firstByte, set "none" for the whole branch. In both cases, a zero
1539                      repeat forces firstByte to "none". */
1540
1541                     if (firstByte == REQ_UNSET) {
1542                         if (subFirstByte >= 0) {
1543                             firstByte = subFirstByte;
1544                             didGroupSetFirstByte = true;
1545                         }
1546                         else
1547                             firstByte = REQ_NONE;
1548                         zeroFirstByte = REQ_NONE;
1549                     }
1550
1551                     /* If firstByte was previously set, convert the subpattern's firstByte
1552                      into reqByte if there wasn't one, using the vary flag that was in
1553                      existence beforehand. */
1554
1555                     else if (subFirstByte >= 0 && subReqByte < 0)
1556                         subReqByte = subFirstByte | tempreqvary;
1557
1558                     /* If the subpattern set a required byte (or set a first byte that isn't
1559                      really the first byte - see above), set it. */
1560
1561                     if (subReqByte >= 0)
1562                         reqByte = subReqByte;
1563                 }
1564
1565                 /* For a forward assertion, we take the reqByte, if set. This can be
1566                  helpful if the pattern that follows the assertion doesn't set a different
1567                  char. For example, it's useful for /(?=abcde).+/. We can't set firstByte
1568                  for an assertion, however because it leads to incorrect effect for patterns
1569                  such as /(?=a)a.+/ when the "real" "a" would then become a reqByte instead
1570                  of a firstByte. This is overcome by a scan at the end if there's no
1571                  firstByte, looking for an asserted first char. */
1572
1573                 else if (bravalue == OP_ASSERT && subReqByte >= 0)
1574                     reqByte = subReqByte;
1575
1576                 /* Now update the main code pointer to the end of the group. */
1577
1578                 code = tempcode;
1579
1580                 /* Error if hit end of pattern */
1581
1582                 if (ptr >= patternEnd || *ptr != ')') {
1583                     *errorCodePtr = ERR14;
1584                     goto FAILED;
1585                 }
1586                 break;
1587
1588             }
1589             /* Check \ for being a real metacharacter; if not, fall through and handle
1590              it as a data character at the start of a string. Escape items are checked
1591              for validity in the pre-compiling pass. */
1592
1593             case '\\':
1594                 tempptr = ptr;
1595                 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, false);
1596
1597                 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1598                  are arranged to be the negation of the corresponding OP_values. For the
1599                  back references, the values are ESC_REF plus the reference number. Only
1600                  back references and those types that consume a character may be repeated.
1601                  We can test for values between ESC_b and ESC_w for the latter; this may
1602                  have to change if any new ones are ever created. */
1603
1604                 if (c < 0) {
1605                     /* For metasequences that actually match a character, we disable the
1606                      setting of a first character if it hasn't already been set. */
1607
1608                     if (firstByte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1609                         firstByte = REQ_NONE;
1610
1611                     /* Set values to reset to if this is followed by a zero repeat. */
1612
1613                     zeroFirstByte = firstByte;
1614                     zeroReqByte = reqByte;
1615
1616                     /* Back references are handled specially */
1617
1618                     if (-c >= ESC_REF) {
1619                         int number = -c - ESC_REF;
1620                         previous = code;
1621                         *code++ = OP_REF;
1622                         put2ByteValueAndAdvance(code, number);
1623                     }
1624
1625                     /* For the rest, we can obtain the OP value by negating the escape
1626                      value */
1627
1628                     else {
1629                         previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
1630                         *code++ = -c;
1631                     }
1632                     continue;
1633                 }
1634
1635                 /* Fall through. */
1636
1637                 /* Handle a literal character. It is guaranteed not to be whitespace or #
1638                  when the extended flag is set. If we are in UTF-8 mode, it may be a
1639                  multi-byte literal character. */
1640
1641                 default:
1642             NORMAL_CHAR:
1643
1644                 previous = code;
1645
1646                 if (c < 128) {
1647                     mcLength = 1;
1648                     mcbuffer[0] = c;
1649
1650                     if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
1651                         *code++ = OP_ASCII_LETTER_IGNORING_CASE;
1652                         *code++ = c | 0x20;
1653                     } else {
1654                         *code++ = OP_ASCII_CHAR;
1655                         *code++ = c;
1656                     }
1657                 } else {
1658                     mcLength = encodeUTF8(c, mcbuffer);
1659
1660                     *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
1661                     for (c = 0; c < mcLength; c++)
1662                         *code++ = mcbuffer[c];
1663                 }
1664
1665                 /* Set the first and required bytes appropriately. If no previous first
1666                  byte, set it from this character, but revert to none on a zero repeat.
1667                  Otherwise, leave the firstByte value alone, and don't change it on a zero
1668                  repeat. */
1669
1670                 if (firstByte == REQ_UNSET) {
1671                     zeroFirstByte = REQ_NONE;
1672                     zeroReqByte = reqByte;
1673
1674                     /* If the character is more than one byte long, we can set firstByte
1675                      only if it is not to be matched caselessly. */
1676
1677                     if (mcLength == 1 || reqCaseOpt == 0) {
1678                         firstByte = mcbuffer[0] | reqCaseOpt;
1679                         if (mcLength != 1)
1680                             reqByte = code[-1] | cd.reqVaryOpt;
1681                     }
1682                     else
1683                         firstByte = reqByte = REQ_NONE;
1684                 }
1685
1686                 /* firstByte was previously set; we can set reqByte only the length is
1687                  1 or the matching is caseful. */
1688
1689                 else {
1690                     zeroFirstByte = firstByte;
1691                     zeroReqByte = reqByte;
1692                     if (mcLength == 1 || reqCaseOpt == 0)
1693                         reqByte = code[-1] | reqCaseOpt | cd.reqVaryOpt;
1694                 }
1695
1696                 break;            /* End of literal character handling */
1697         }
1698     }                   /* end of big loop */
1699
1700     /* Control never reaches here by falling through, only by a goto for all the
1701      error states. Pass back the position in the pattern so that it can be displayed
1702      to the user for diagnosing the error. */
1703
1704 FAILED:
1705     *ptrPtr = ptr;
1706     return false;
1707 }
1708
1709 /*************************************************
1710 *     Compile sequence of alternatives           *
1711 *************************************************/
1712
1713 /* On entry, ptr is pointing past the bracket character, but on return
1714 it points to the closing bracket, or vertical bar, or end of string.
1715 The code variable is pointing at the byte into which the BRA operator has been
1716 stored. If the ims options are changed at the start (for a (?ims: group) or
1717 during any branch, we need to insert an OP_OPT item at the start of every
1718 following branch to ensure they get set correctly at run time, and also pass
1719 the new options into every subsequent branch compile.
1720
1721 Argument:
1722   options        option bits, including any changes for this subpattern
1723   brackets       -> int containing the number of extracting brackets used
1724   codePtr        -> the address of the current code pointer
1725   ptrPtr         -> the address of the current pattern pointer
1726   errorCodePtr   -> pointer to error code variable
1727   skipBytes      skip this many bytes at start (for OP_BRANUMBER)
1728   firstbyteptr   place to put the first required character, or a negative number
1729   reqbyteptr     place to put the last required character, or a negative number
1730   cd             points to the data block with tables pointers etc.
1731
1732 Returns:      true on success
1733 */
1734
1735 static bool
1736 compileBracket(int options, int* brackets, unsigned char** codePtr,
1737     const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int skipBytes,
1738     int* firstbyteptr, int* reqbyteptr, CompileData& cd)
1739 {
1740     const UChar* ptr = *ptrPtr;
1741     unsigned char* code = *codePtr;
1742     unsigned char* lastBranch = code;
1743     unsigned char* start_bracket = code;
1744     int firstByte = REQ_UNSET;
1745     int reqByte = REQ_UNSET;
1746
1747     /* Offset is set zero to mark that this bracket is still open */
1748
1749     putLinkValueAllowZero(code + 1, 0);
1750     code += 1 + LINK_SIZE + skipBytes;
1751
1752     /* Loop for each alternative branch */
1753
1754     while (true) {
1755         /* Now compile the branch */
1756
1757         int branchFirstByte;
1758         int branchReqByte;
1759         if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorCodePtr,
1760                             &branchFirstByte, &branchReqByte, cd)) {
1761             *ptrPtr = ptr;
1762             return false;
1763         }
1764
1765         /* If this is the first branch, the firstByte and reqByte values for the
1766          branch become the values for the regex. */
1767
1768         if (*lastBranch != OP_ALT) {
1769             firstByte = branchFirstByte;
1770             reqByte = branchReqByte;
1771         }
1772
1773         /* If this is not the first branch, the first char and reqByte have to
1774          match the values from all the previous branches, except that if the previous
1775          value for reqByte didn't have REQ_VARY set, it can still match, and we set
1776          REQ_VARY for the regex. */
1777
1778         else {
1779             /* If we previously had a firstByte, but it doesn't match the new branch,
1780              we have to abandon the firstByte for the regex, but if there was previously
1781              no reqByte, it takes on the value of the old firstByte. */
1782
1783             if (firstByte >= 0 && firstByte != branchFirstByte) {
1784                 if (reqByte < 0)
1785                     reqByte = firstByte;
1786                 firstByte = REQ_NONE;
1787             }
1788
1789             /* If we (now or from before) have no firstByte, a firstByte from the
1790              branch becomes a reqByte if there isn't a branch reqByte. */
1791
1792             if (firstByte < 0 && branchFirstByte >= 0 && branchReqByte < 0)
1793                 branchReqByte = branchFirstByte;
1794
1795             /* Now ensure that the reqbytes match */
1796
1797             if ((reqByte & ~REQ_VARY) != (branchReqByte & ~REQ_VARY))
1798                 reqByte = REQ_NONE;
1799             else
1800                 reqByte |= branchReqByte;   /* To "or" REQ_VARY */
1801         }
1802
1803         /* Reached end of expression, either ')' or end of pattern. Go back through
1804          the alternative branches and reverse the chain of offsets, with the field in
1805          the BRA item now becoming an offset to the first alternative. If there are
1806          no alternatives, it points to the end of the group. The length in the
1807          terminating ket is always the length of the whole bracketed item.
1808          Return leaving the pointer at the terminating char. */
1809
1810         if (ptr >= patternEnd || *ptr != '|') {
1811             int length = code - lastBranch;
1812             do {
1813                 int prevLength = getLinkValueAllowZero(lastBranch + 1);
1814                 putLinkValue(lastBranch + 1, length);
1815                 length = prevLength;
1816                 lastBranch -= length;
1817             } while (length > 0);
1818
1819             /* Fill in the ket */
1820
1821             *code = OP_KET;
1822             putLinkValue(code + 1, code - start_bracket);
1823             code += 1 + LINK_SIZE;
1824
1825             /* Set values to pass back */
1826
1827             *codePtr = code;
1828             *ptrPtr = ptr;
1829             *firstbyteptr = firstByte;
1830             *reqbyteptr = reqByte;
1831             return true;
1832         }
1833
1834         /* Another branch follows; insert an "or" node. Its length field points back
1835          to the previous branch while the bracket remains open. At the end the chain
1836          is reversed. It's done like this so that the start of the bracket has a
1837          zero offset until it is closed, making it possible to detect recursion. */
1838
1839         *code = OP_ALT;
1840         putLinkValue(code + 1, code - lastBranch);
1841         lastBranch = code;
1842         code += 1 + LINK_SIZE;
1843         ptr++;
1844     }
1845     JS_NOT_REACHED("No fallthru.");
1846 }
1847
1848 /*************************************************
1849 *          Check for anchored expression         *
1850 *************************************************/
1851
1852 /* Try to find out if this is an anchored regular expression. Consider each
1853 alternative branch. If they all start OP_CIRC, or with a bracket
1854 all of whose alternatives start OP_CIRC (recurse ad lib), then
1855 it's anchored.
1856
1857 Arguments:
1858   code          points to start of expression (the bracket)
1859   captureMap    a bitmap of which brackets we are inside while testing; this
1860                  handles up to substring 31; all brackets after that share
1861                  the zero bit
1862   backrefMap    the back reference bitmap
1863 */
1864
1865 static bool branchIsAnchored(const unsigned char* code)
1866 {
1867     const unsigned char* scode = firstSignificantOpcode(code);
1868     int op = *scode;
1869
1870     /* Brackets */
1871     if (op >= OP_BRA || op == OP_ASSERT)
1872         return bracketIsAnchored(scode);
1873
1874     /* Check for explicit anchoring */
1875     return op == OP_CIRC;
1876 }
1877
1878 static bool bracketIsAnchored(const unsigned char* code)
1879 {
1880     do {
1881         if (!branchIsAnchored(code + 1 + LINK_SIZE))
1882             return false;
1883         code += getLinkValue(code + 1);
1884     } while (*code == OP_ALT);   /* Loop for each alternative */
1885     return true;
1886 }
1887
1888 /*************************************************
1889 *         Check for starting with ^ or .*        *
1890 *************************************************/
1891
1892 /* This is called to find out if every branch starts with ^ or .* so that
1893 "first char" processing can be done to speed things up in multiline
1894 matching and for non-DOTALL patterns that start with .* (which must start at
1895 the beginning or after \n)
1896
1897 Except when the .* appears inside capturing parentheses, and there is a
1898 subsequent back reference to those parentheses. By keeping a bitmap of the
1899 first 31 back references, we can catch some of the more common cases more
1900 precisely; all the greater back references share a single bit.
1901
1902 Arguments:
1903   code          points to start of expression (the bracket)
1904   captureMap    a bitmap of which brackets we are inside while testing; this
1905                  handles up to substring 31; all brackets after that share
1906                  the zero bit
1907   backrefMap    the back reference bitmap
1908 */
1909
1910 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1911 {
1912     const unsigned char* scode = firstSignificantOpcode(code);
1913     int op = *scode;
1914
1915     /* Capturing brackets */
1916     if (op > OP_BRA) {
1917         int captureNum = op - OP_BRA;
1918         if (captureNum > EXTRACT_BASIC_MAX)
1919             captureNum = get2ByteValue(scode + 2 + LINK_SIZE);
1920         int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
1921         return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);
1922     }
1923
1924     /* Other brackets */
1925     if (op == OP_BRA || op == OP_ASSERT)
1926         return bracketNeedsLineStart(scode, captureMap, backrefMap);
1927
1928     /* .* means "start at start or after \n" if it isn't in brackets that
1929      may be referenced. */
1930
1931     if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1932         return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
1933
1934     /* Explicit ^ */
1935     return op == OP_CIRC || op == OP_BOL;
1936 }
1937
1938 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1939 {
1940     do {
1941         if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))
1942             return false;
1943         code += getLinkValue(code + 1);
1944     } while (*code == OP_ALT);  /* Loop for each alternative */
1945     return true;
1946 }
1947
1948 /*************************************************
1949 *       Check for asserted fixed first char      *
1950 *************************************************/
1951
1952 /* During compilation, the "first char" settings from forward assertions are
1953 discarded, because they can cause conflicts with actual literals that follow.
1954 However, if we end up without a first char setting for an unanchored pattern,
1955 it is worth scanning the regex to see if there is an initial asserted first
1956 char. If all branches start with the same asserted char, or with a bracket all
1957 of whose alternatives start with the same asserted char (recurse ad lib), then
1958 we return that char, otherwise -1.
1959
1960 Arguments:
1961   code       points to start of expression (the bracket)
1962   options    pointer to the options (used to check casing changes)
1963   inassert   true if in an assertion
1964
1965 Returns:     -1 or the fixed first char
1966 */
1967
1968 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1969 {
1970     const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
1971     int op = *scode;
1972
1973     if (op >= OP_BRA)
1974         op = OP_BRA;
1975
1976     switch (op) {
1977         default:
1978             return -1;
1979
1980         case OP_BRA:
1981         case OP_ASSERT:
1982             return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
1983
1984         case OP_EXACT:
1985             scode += 2;
1986             /* Fall through */
1987
1988         case OP_CHAR:
1989         case OP_CHAR_IGNORING_CASE:
1990         case OP_ASCII_CHAR:
1991         case OP_ASCII_LETTER_IGNORING_CASE:
1992         case OP_PLUS:
1993         case OP_MINPLUS:
1994             if (!inassert)
1995                 return -1;
1996             return scode[1];
1997     }
1998 }
1999
2000 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
2001 {
2002     int c = -1;
2003     do {
2004         int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert);
2005         if (d < 0)
2006             return -1;
2007         if (c < 0)
2008             c = d;
2009         else if (c != d)
2010             return -1;
2011         code += getLinkValue(code + 1);
2012     } while (*code == OP_ALT);
2013     return c;
2014 }
2015
2016 static inline int multiplyWithOverflowCheck(int a, int b)
2017 {
2018     if (!a || !b)
2019         return 0;
2020     if (a > MAX_PATTERN_SIZE / b)
2021         return -1;
2022     return a * b;
2023 }
2024
2025 static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
2026     CompileData& cd, ErrorCode& errorcode)
2027 {
2028     /* Make a pass over the pattern to compute the
2029      amount of store required to hold the compiled code. This does not have to be
2030      perfect as long as errors are overestimates. */
2031
2032     if (patternLength > MAX_PATTERN_SIZE) {
2033         errorcode = ERR16;
2034         return -1;
2035     }
2036
2037     int length = BRA_LEN;      /* For initial BRA. */
2038     int branch_extra = 0;
2039     int lastitemlength = 0;
2040     unsigned brastackptr = 0;
2041     int brastack[BRASTACK_SIZE];
2042     unsigned char bralenstack[BRASTACK_SIZE];
2043     int bracount = 0;
2044
2045     const UChar* ptr = (const UChar*)(pattern - 1);
2046     const UChar* patternEnd = (const UChar*)(pattern + patternLength);
2047
2048     while (++ptr < patternEnd) {
2049         int minRepeats = 0, maxRepeats = 0;
2050         int c = *ptr;
2051
2052         switch (c) {
2053             /* A backslashed item may be an escaped data character or it may be a
2054              character type. */
2055
2056             case '\\':
2057                 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
2058                 if (errorcode != 0)
2059                     return -1;
2060
2061                 lastitemlength = 1;     /* Default length of last item for repeats */
2062
2063                 if (c >= 0) {            /* Data character */
2064                     length += 2;          /* For a one-byte character */
2065
2066                     if (c > 127) {
2067                         int i;
2068                         for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
2069                             if (c <= jsc_pcre_utf8_table1[i]) break;
2070                         length += i;
2071                         lastitemlength += i;
2072                     }
2073
2074                     continue;
2075                 }
2076
2077                 /* Other escapes need one byte */
2078
2079                 length++;
2080
2081                 /* A back reference needs an additional 2 bytes, plus either one or 5
2082                  bytes for a repeat. We also need to keep the value of the highest
2083                  back reference. */
2084
2085                 if (c <= -ESC_REF) {
2086                     int refnum = -c - ESC_REF;
2087                     cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
2088                     if (refnum > cd.topBackref)
2089                         cd.topBackref = refnum;
2090                     length += 2;   /* For single back reference */
2091                     if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2092                         ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2093                         if (errorcode)
2094                             return -1;
2095                         if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2096                             (minRepeats == 1 && maxRepeats == -1))
2097                             length++;
2098                         else
2099                             length += 5;
2100                         if (safelyCheckNextChar(ptr, patternEnd, '?'))
2101                             ptr++;
2102                     }
2103                 }
2104                 continue;
2105
2106             case '^':     /* Single-byte metacharacters */
2107             case '.':
2108             case '$':
2109                 length++;
2110                 lastitemlength = 1;
2111                 continue;
2112
2113             case '*':            /* These repeats won't be after brackets; */
2114             case '+':            /* those are handled separately */
2115             case '?':
2116                 length++;
2117                 goto POSSESSIVE;
2118
2119             /* This covers the cases of braced repeats after a single char, metachar,
2120              class, or back reference. */
2121
2122             case '{':
2123                 if (!isCountedRepeat(ptr + 1, patternEnd))
2124                     goto NORMAL_CHAR;
2125                 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode);
2126                 if (errorcode != 0)
2127                     return -1;
2128
2129                 /* These special cases just insert one extra opcode */
2130
2131                 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2132                     (minRepeats == 1 && maxRepeats == -1))
2133                     length++;
2134
2135                 /* These cases might insert additional copies of a preceding character. */
2136
2137                 else {
2138                     if (minRepeats != 1) {
2139                         length -= lastitemlength;   /* Uncount the original char or metachar */
2140                         if (minRepeats > 0)
2141                             length += 5 + lastitemlength;
2142                     }
2143                     length += lastitemlength + ((maxRepeats > 0) ? 5 : 1);
2144                 }
2145
2146                 if (safelyCheckNextChar(ptr, patternEnd, '?'))
2147                     ptr++;      /* Needs no extra length */
2148
2149             POSSESSIVE:                     /* Test for possessive quantifier */
2150                 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2151                     ptr++;
2152                     length += 2 + 2 * LINK_SIZE;   /* Allow for atomic brackets */
2153                 }
2154                 continue;
2155
2156             /* An alternation contains an offset to the next branch or ket. If any ims
2157              options changed in the previous branch(es), and/or if we are in a
2158              lookbehind assertion, extra space will be needed at the start of the
2159              branch. This is handled by branch_extra. */
2160
2161             case '|':
2162                 if (brastackptr == 0)
2163                     cd.needOuterBracket = true;
2164                 length += 1 + LINK_SIZE + branch_extra;
2165                 continue;
2166
2167             /* A character class uses 33 characters provided that all the character
2168              values are less than 256. Otherwise, it uses a bit map for low valued
2169              characters, and individual items for others. Don't worry about character
2170              types that aren't allowed in classes - they'll get picked up during the
2171              compile. A character class that contains only one single-byte character
2172              uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2173              where we can. (In UTF-8 mode we can do this only for chars < 128.) */
2174
2175             case '[': {
2176                 int class_optcount;
2177                 if (*(++ptr) == '^') {
2178                     class_optcount = 10;  /* Greater than one */
2179                     ptr++;
2180                 }
2181                 else
2182                     class_optcount = 0;
2183
2184                 bool class_utf8 = false;
2185
2186                 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
2187                     /* Check for escapes */
2188
2189                     if (*ptr == '\\') {
2190                         c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
2191                         if (errorcode != 0)
2192                             return -1;
2193
2194                         /* Handle escapes that turn into characters */
2195
2196                         if (c >= 0)
2197                             goto NON_SPECIAL_CHARACTER;
2198
2199                         /* Escapes that are meta-things. The normal ones just affect the
2200                          bit map, but Unicode properties require an XCLASS extended item. */
2201
2202                         else
2203                             class_optcount = 10;         /* \d, \s etc; make sure > 1 */
2204                     }
2205
2206                     /* Anything else increments the possible optimization count. We have to
2207                      detect ranges here so that we can compute the number of extra ranges for
2208                      caseless wide characters when UCP support is available. If there are wide
2209                      characters, we are going to have to use an XCLASS, even for single
2210                      characters. */
2211
2212                     else {
2213                         c = *ptr;
2214
2215                         /* Come here from handling \ above when it escapes to a char value */
2216
2217                     NON_SPECIAL_CHARACTER:
2218                         class_optcount++;
2219
2220                         int d = -1;
2221                         if (safelyCheckNextChar(ptr, patternEnd, '-')) {
2222                             const UChar* hyptr = ptr++;
2223                             if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
2224                                 ptr++;
2225                                 d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
2226                                 if (errorcode != 0)
2227                                     return -1;
2228                             }
2229                             else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
2230                                 d = *++ptr;
2231                             if (d < 0)
2232                                 ptr = hyptr;      /* go back to hyphen as data */
2233                         }
2234
2235                         /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2236                          127 for caseless matching, we will need to use an XCLASS. */
2237
2238                         if (d >= 0) {
2239                             class_optcount = 10;     /* Ensure > 1 */
2240                             if (d < c) {
2241                                 errorcode = ERR8;
2242                                 return -1;
2243                             }
2244
2245                             if ((d > 255 || (ignoreCase && d > 127))) {
2246                                 unsigned char buffer[6];
2247                                 if (!class_utf8)         /* Allow for XCLASS overhead */
2248                                 {
2249                                     class_utf8 = true;
2250                                     length += LINK_SIZE + 2;
2251                                 }
2252
2253                                 /* If we have UCP support, find out how many extra ranges are
2254                                  needed to map the other case of characters within this range. We
2255                                  have to mimic the range optimization here, because extending the
2256                                  range upwards might push d over a boundary that makes it use
2257                                  another byte in the UTF-8 representation. */
2258
2259                                 if (ignoreCase) {
2260                                     int occ, ocd;
2261                                     int cc = c;
2262                                     int origd = d;
2263                                     while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
2264                                         if (occ >= c && ocd <= d)
2265                                             continue;   /* Skip embedded */
2266
2267                                         if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
2268                                         {                            /* if there is overlap,   */
2269                                             c = occ;                     /* noting that if occ < c */
2270                                             continue;                    /* we can't have ocd > d  */
2271                                         }                            /* because a subrange is  */
2272                                         if (ocd > d && occ <= d + 1)   /* always shorter than    */
2273                                         {                            /* the basic range.       */
2274                                             d = ocd;
2275                                             continue;
2276                                         }
2277
2278                                         /* An extra item is needed */
2279
2280                                         length += 1 + encodeUTF8(occ, buffer) +
2281                                         ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer));
2282                                     }
2283                                 }
2284
2285                                 /* The length of the (possibly extended) range */
2286
2287                                 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer);
2288                             }
2289
2290                         }
2291
2292                         /* We have a single character. There is nothing to be done unless we
2293                          are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2294                          allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2295                          support. */
2296
2297                         else {
2298                             if ((c > 255 || (ignoreCase && c > 127))) {
2299                                 unsigned char buffer[6];
2300                                 class_optcount = 10;     /* Ensure > 1 */
2301                                 if (!class_utf8)         /* Allow for XCLASS overhead */
2302                                 {
2303                                     class_utf8 = true;
2304                                     length += LINK_SIZE + 2;
2305                                 }
2306                                 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer));
2307                             }
2308                         }
2309                     }
2310                 }
2311
2312                 if (ptr >= patternEnd) {   /* Missing terminating ']' */
2313                     errorcode = ERR6;
2314                     return -1;
2315                 }
2316
2317                 /* We can optimize when there was only one optimizable character.
2318                  Note that this does not detect the case of a negated single character.
2319                  In that case we do an incorrect length computation, but it's not a serious
2320                  problem because the computed length is too large rather than too small. */
2321
2322                 if (class_optcount == 1)
2323                     goto NORMAL_CHAR;
2324
2325                 /* Here, we handle repeats for the class opcodes. */
2326                 {
2327                     length += 33;
2328
2329                     /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2330                      we also need extra for wrapping the whole thing in a sub-pattern. */
2331
2332                     if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2333                         ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2334                         if (errorcode != 0)
2335                             return -1;
2336                         if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2337                             (minRepeats == 1 && maxRepeats == -1))
2338                             length++;
2339                         else
2340                             length += 5;
2341                         if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2342                             ptr++;
2343                             length += 2 + 2 * LINK_SIZE;
2344                         } else if (safelyCheckNextChar(ptr, patternEnd, '?'))
2345                             ptr++;
2346                     }
2347                 }
2348                 continue;
2349             }
2350
2351             /* Brackets may be genuine groups or special things */
2352
2353             case '(': {
2354                 int branch_newextra = 0;
2355                 int bracket_length = BRA_LEN;
2356                 bool capturing = false;
2357
2358                 /* Handle special forms of bracket, which all start (? */
2359
2360                 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
2361                     switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
2362                         /* Non-referencing groups and lookaheads just move the pointer on, and
2363                          then behave like a non-special bracket, except that they don't increment
2364                          the count of extracting brackets. Ditto for the "once only" bracket,
2365                          which is in Perl from version 5.005. */
2366
2367                         case ':':
2368                         case '=':
2369                         case '!':
2370                             ptr += 2;
2371                             break;
2372
2373                         /* Else loop checking valid options until ) is met. Anything else is an
2374                          error. If we are without any brackets, i.e. at top level, the settings
2375                          act as if specified in the options, so massage the options immediately.
2376                          This is for backward compatibility with Perl 5.004. */
2377
2378                         default:
2379                             errorcode = ERR12;
2380                             return -1;
2381                     }
2382                 } else
2383                     capturing = true;
2384
2385                 /* Capturing brackets must be counted so we can process escapes in a
2386                  Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2387                  an additional 3 bytes of memory per capturing bracket. */
2388
2389                 if (capturing) {
2390                     bracount++;
2391                     if (bracount > EXTRACT_BASIC_MAX)
2392                         bracket_length += 3;
2393                 }
2394
2395                 /* Save length for computing whole length at end if there's a repeat that
2396                  requires duplication of the group. Also save the current value of
2397                  branch_extra, and start the new group with the new value. If non-zero, this
2398                  will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2399
2400                 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
2401                     errorcode = ERR17;
2402                     return -1;
2403                 }
2404
2405                 bralenstack[brastackptr] = branch_extra;
2406                 branch_extra = branch_newextra;
2407
2408                 brastack[brastackptr++] = length;
2409                 length += bracket_length;
2410                 continue;
2411             }
2412
2413             /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
2414              have to replicate this bracket up to that many times. If brastackptr is
2415              0 this is an unmatched bracket which will generate an error, but take care
2416              not to try to access brastack[-1] when computing the length and restoring
2417              the branch_extra value. */
2418
2419             case ')': {
2420                 int duplength;
2421                 length += KET_LEN;
2422                 if (brastackptr > 0) {
2423                     duplength = length - brastack[--brastackptr];
2424                     branch_extra = bralenstack[brastackptr];
2425                 }
2426                 else
2427                     duplength = 0;
2428
2429                 /* Leave ptr at the final char; for readRepeatCounts this happens
2430                  automatically; for the others we need an increment. */
2431
2432                 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) {
2433                     ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2434                     if (errorcode)
2435                         return -1;
2436                 } else if (c == '*') {
2437                     minRepeats = 0;
2438                     maxRepeats = -1;
2439                     ptr++;
2440                 } else if (c == '+') {
2441                     minRepeats = 1;
2442                     maxRepeats = -1;
2443                     ptr++;
2444                 } else if (c == '?') {
2445                     minRepeats = 0;
2446                     maxRepeats = 1;
2447                     ptr++;
2448                 } else {
2449                     minRepeats = 1;
2450                     maxRepeats = 1;
2451                 }
2452
2453                 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2454                  group, and if the maximum is greater than zero, we have to replicate
2455                  maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2456                  bracket set. */
2457
2458                 int repeatsLength;
2459                 if (minRepeats == 0) {
2460                     length++;
2461                     if (maxRepeats > 0) {
2462                         repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1, duplength + BRA_LEN + KET_LEN + OPCODE_LEN);
2463                         if (repeatsLength < 0) {
2464                             errorcode = ERR16;
2465                             return -1;
2466                         }
2467                         length += repeatsLength;
2468                         if (length > MAX_PATTERN_SIZE) {
2469                             errorcode = ERR16;
2470                             return -1;
2471                         }
2472                     }
2473                 }
2474
2475                 /* When the minimum is greater than zero, we have to replicate up to
2476                  minval-1 times, with no additions required in the copies. Then, if there
2477                  is a limited maximum we have to replicate up to maxval-1 times allowing
2478                  for a BRAZERO item before each optional copy and nesting brackets for all
2479                  but one of the optional copies. */
2480
2481                 else {
2482                     repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength);
2483                     if (repeatsLength < 0) {
2484                         errorcode = ERR16;
2485                         return -1;
2486                     }
2487                     length += repeatsLength;
2488                     if (maxRepeats > minRepeats) { /* Need this test as maxRepeats=-1 means no limit */
2489                         repeatsLength = multiplyWithOverflowCheck(maxRepeats - minRepeats, duplength + BRAZERO_LEN + BRA_LEN + KET_LEN);
2490                         if (repeatsLength < 0) {
2491                             errorcode = ERR16;
2492                             return -1;
2493                         }
2494                         length += repeatsLength - (2 + 2 * LINK_SIZE);
2495                     }
2496                     if (length > MAX_PATTERN_SIZE) {
2497                         errorcode = ERR16;
2498                         return -1;
2499                     }
2500                 }
2501
2502                 /* Allow space for once brackets for "possessive quantifier" */
2503
2504                 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2505                     ptr++;
2506                     length += 2 + 2 * LINK_SIZE;
2507                 }
2508                 continue;
2509             }
2510
2511             /* Non-special character. It won't be space or # in extended mode, so it is
2512              always a genuine character. If we are in a \Q...\E sequence, check for the
2513              end; if not, we have a literal. */
2514
2515             default:
2516             NORMAL_CHAR:
2517                 length += 2;          /* For a one-byte character */
2518                 lastitemlength = 1;   /* Default length of last item for repeats */
2519
2520                 if (c > 127) {
2521                     int i;
2522                     for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
2523                         if (c <= jsc_pcre_utf8_table1[i])
2524                             break;
2525                     length += i;
2526                     lastitemlength += i;
2527                 }
2528
2529                 continue;
2530         }
2531     }
2532
2533     length += KET_LEN + OPCODE_LEN;    /* For final KET and END */
2534
2535     cd.numCapturingBrackets = bracount;
2536     return length;
2537 }
2538
2539 /*************************************************
2540 *        Compile a Regular Expression            *
2541 *************************************************/
2542
2543 /* This function takes a string and returns a pointer to a block of store
2544 holding a compiled version of the expression. The original API for this
2545 function had no error code return variable; it is retained for backwards
2546 compatibility. The new function is given a new name.
2547
2548 Arguments:
2549   pattern       the regular expression
2550   options       various option bits
2551   errorCodePtr  pointer to error code variable (pcre_compile2() only)
2552                   can be NULL if you don't want a code value
2553   error         pointer to pointer to error text
2554   erroroffset   ptr offset in pattern where error was detected
2555   tables        pointer to character tables or NULL
2556
2557 Returns:        pointer to compiled data block, or NULL on error,
2558                 with error and erroroffset set
2559 */
2560
2561 static inline JSRegExp* returnError(ErrorCode errorcode, int *error)
2562 {
2563     *error = static_cast<int>(errorcode);
2564     return 0;
2565 }
2566
2567 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
2568                 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
2569                 unsigned* numSubpatterns, int *error)
2570 {
2571     /* We can't pass back an error message if error is NULL; I guess the best we
2572      can do is just return NULL, but we can set a code value if there is a code pointer. */
2573     if (!error)
2574         return 0;
2575     *error = 0;
2576
2577     CompileData cd;
2578
2579     ErrorCode errorcode = ERR0;
2580     /* Call this once just to count the brackets. */
2581     calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2582     /* Call it again to compute the length. */
2583     int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2584     if (errorcode)
2585         return returnError(errorcode, error);
2586
2587     if (length > MAX_PATTERN_SIZE)
2588         return returnError(ERR16, error);
2589
2590     size_t size = length + sizeof(JSRegExp);
2591     // FIXME: bug 574459 -- no NULL check
2592     JSRegExp* re = reinterpret_cast<JSRegExp*>(js_array_new<char>(size));
2593
2594     if (!re)
2595         return returnError(ERR13, error);
2596
2597     re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
2598
2599     /* The starting points of the name/number translation table and of the code are
2600      passed around in the compile data block. */
2601
2602     const unsigned char* codeStart = (const unsigned char*)(re + 1);
2603
2604     /* Set up a starting, non-extracting bracket, then compile the expression. On
2605      error, errorcode will be set non-zero, so we don't need to look at the result
2606      of the function here. */
2607
2608     const UChar* ptr = (const UChar*)pattern;
2609     const UChar* patternEnd = pattern + patternLength;
2610     unsigned char* code = const_cast<unsigned char*>(codeStart);
2611     int firstByte, reqByte;
2612     int bracketCount = 0;
2613     if (!cd.needOuterBracket)
2614         compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, &firstByte, &reqByte, cd);
2615     else {
2616         *code = OP_BRA;
2617         unsigned char * const codeBefore = code;
2618         compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, 2, &firstByte, &reqByte, cd);
2619         JS_ASSERT((bracketCount & 0xff) == bracketCount);
2620         put2ByteValue(codeBefore + 1 + LINK_SIZE, 0 << 8 | (bracketCount & 0xff));
2621     }
2622     re->topBracket = bracketCount;
2623     re->topBackref = cd.topBackref;
2624
2625     /* If not reached end of pattern on success, there's an excess bracket. */
2626
2627     if (errorcode == 0 && ptr < patternEnd)
2628         errorcode = ERR10;
2629
2630     /* Fill in the terminating state and check for disastrous overflow, but
2631      if debugging, leave the test till after things are printed out. */
2632
2633     *code++ = OP_END;
2634
2635     JS_ASSERT(code - codeStart <= length);
2636     if (code - codeStart > length)
2637         errorcode = ERR7;
2638
2639     /* Give an error if there's back reference to a non-existent capturing
2640      subpattern. */
2641
2642     if (re->topBackref > re->topBracket)
2643         errorcode = ERR15;
2644
2645     /* Failed to compile, or error while post-processing */
2646
2647     if (errorcode != ERR0) {
2648         js_array_delete(reinterpret_cast<char*>(re));
2649         return returnError(errorcode, error);
2650     }
2651
2652     /* If the anchored option was not passed, set the flag if we can determine that
2653      the pattern is anchored by virtue of ^ characters or \A or anything else (such
2654      as starting with .* when DOTALL is set).
2655
2656      Otherwise, if we know what the first character has to be, save it, because that
2657      speeds up unanchored matches no end. If not, see if we can set the
2658      UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
2659      start with ^. and also when all branches start with .* for non-DOTALL matches.
2660      */
2661
2662     if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart))
2663         re->options |= IsAnchoredOption;
2664     else {
2665         if (firstByte < 0) {
2666             firstByte = (cd.needOuterBracket
2667                     ? bracketFindFirstAssertedCharacter(codeStart, false)
2668                     : branchFindFirstAssertedCharacter(codeStart, false))
2669                 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
2670         }
2671         if (firstByte >= 0) {
2672             int ch = firstByte & 255;
2673             if (ch < 127) {
2674                 re->firstByte = ((firstByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstByte;
2675                 re->options |= UseFirstByteOptimizationOption;
2676             }
2677         } else {
2678             if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.backrefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))
2679                 re->options |= UseMultiLineFirstByteOptimizationOption;
2680         }
2681     }
2682
2683     /* For an anchored pattern, we use the "required byte" only if it follows a
2684      variable length item in the regex. Remove the caseless flag for non-caseable
2685      bytes. */
2686
2687     if (reqByte >= 0 && (!(re->options & IsAnchoredOption) || (reqByte & REQ_VARY))) {
2688         int ch = reqByte & 255;
2689         if (ch < 127) {
2690             re->reqByte = ((reqByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqByte & ~REQ_IGNORE_CASE) : reqByte;
2691             re->options |= UseRequiredByteOptimizationOption;
2692         }
2693     }
2694
2695     if (numSubpatterns)
2696         *numSubpatterns = re->topBracket;
2697
2698     return re;
2699 }
2700
2701 void jsRegExpFree(JSRegExp* re)
2702 {
2703     js_array_delete(reinterpret_cast<char*>(re));
2704 }