1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains the external function jsRegExpExecute(), along with
42 supporting internal functions that are not used by other modules. */
44 #include "pcre_internal.h"
47 #include "yarr/wtf/ASCIICType.h"
52 /* Negative values for the firstchar and reqchar variables */
54 #define REQ_UNSET (-2)
57 /*************************************************
58 * Code parameters and static tables *
59 *************************************************/
61 /* Maximum number of items on the nested bracket stacks at compile time. This
62 applies to the nesting of all kinds of parentheses. It does not limit
63 un-nested, non-capturing parentheses. This number can be made bigger if
64 necessary - it is used to dimension one int and one unsigned char vector at
67 #define BRASTACK_SIZE 200
69 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
70 are simple data values; negative values are for special things like \d and so
71 on. Zero means further processing is needed (for things like \x), or the escape
74 static const short escapes[] = {
75 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
76 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
77 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
78 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
79 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
80 0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */
81 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
82 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
83 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
86 static const unsigned OPCODE_LEN = 1;
87 static const unsigned BRAZERO_LEN = OPCODE_LEN;
88 static const unsigned BRA_NEST_SIZE = 2;
89 static const unsigned BRA_LEN = OPCODE_LEN + LINK_SIZE + BRA_NEST_SIZE;
90 static const unsigned KET_LEN = OPCODE_LEN + LINK_SIZE;
92 /* Error code numbers. They are given names so that they can more easily be
96 ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
97 ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
100 /* These are the error texts that correspond to the above error codes:
102 "\\ at end of pattern\0"
103 "\\c at end of pattern\0"
104 "character value in \\x{...} sequence is too large\0"
105 "numbers out of order in {} quantifier\0"
107 "number too big in {} quantifier\0"
108 "missing terminating ] for character class\0"
109 "internal error: code overflow\0"
110 "range out of order in character class\0"
111 "nothing to repeat\0"
113 "unmatched parentheses\0"
114 "internal error: unexpected repeat\0"
115 "unrecognized character after (?\0"
116 "failed to get memory\0"
119 "reference to non-existent subpattern\0"
120 "regular expression too large\0"
121 "parentheses nested too deeply" */
123 /* Structure for passing "static" information around between the functions
124 doing the compiling. */
131 needOuterBracket = false;
132 numCapturingBrackets = 0;
134 int topBackref; /* Maximum back reference */
135 unsigned backrefMap; /* Bitmap of low back refs */
136 int reqVaryOpt; /* "After variable item" flag for reqByte */
137 bool needOuterBracket;
138 int numCapturingBrackets;
141 /* Definitions to allow mutual recursion */
143 static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
144 static bool bracketIsAnchored(const unsigned char* code);
145 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
146 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
148 /*************************************************
150 *************************************************/
152 /* This function is called when a \ has been encountered. It either returns a
153 positive value for a simple escape such as \n, or a negative value which
154 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
155 a positive value greater than 255 may be returned. On entry, ptr is pointing at
156 the \. On exit, it is on the final character of the escape sequence.
159 ptrPtr points to the pattern position pointer
160 errorCodePtr points to the errorcode variable
161 bracount number of previous extracting brackets
162 options the options bits
163 isClass true if inside a character class
165 Returns: zero or positive => a data character
166 negative => a special escape sequence
167 on error, error is set
170 static int checkEscape(const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int bracount, bool isClass)
172 const UChar* ptr = *ptrPtr + 1;
174 /* If backslash is at the end of the pattern, it's an error. */
175 if (ptr == patternEnd) {
176 *errorCodePtr = ERR1;
183 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
184 a table. A non-zero result is something that can be returned immediately.
185 Otherwise further processing may be required. */
187 if (c < '0' || c > 'z') { /* Not alphameric */
188 } else if (int escapeValue = escapes[c - '0']) {
192 c = '\b'; /* \b is backslash in a class */
193 else if (-c == ESC_B)
194 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */
196 /* Escapes that need further processing, or are illegal. */
209 /* Escape sequences starting with a non-zero digit are backreferences,
210 unless there are insufficient brackets, in which case they are octal
211 escape sequences. Those sequences end on the first non-octal character
212 or when we overflow 0-255, whichever comes first. */
215 const UChar* oldptr = ptr;
217 while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount)
218 c = c * 10 + *(++ptr) - '0';
223 ptr = oldptr; /* Put the pointer back and fall through */
226 /* Handle an octal number following \. If the first digit is 8 or 9,
227 this is not octal. */
229 if ((c = *ptr) >= '8') {
235 /* \0 always starts an octal number, but we may drop through to here with a
236 larger first octal digit. */
241 for (i = 1; i <= 2; ++i) {
242 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
244 int cc = c * 8 + ptr[i] - '0';
256 for (i = 1; i <= 2; ++i) {
257 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
264 cc -= 32; /* Convert to upper case */
265 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
274 for (i = 1; i <= 4; ++i) {
275 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
282 cc -= 32; /* Convert to upper case */
283 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
290 if (++ptr == patternEnd) {
291 *errorCodePtr = ERR2;
297 /* To match Firefox, inside a character class, we also accept
298 numbers and '_' as control characters */
299 if ((!isClass && !isASCIIAlpha(c)) || (!isASCIIAlphanumeric(c) && c != '_')) {
305 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
306 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
307 c = toASCIIUpper(c) ^ 0x40;
316 /*************************************************
317 * Check for counted repeat *
318 *************************************************/
320 /* This function is called when a '{' is encountered in a place where it might
321 start a quantifier. It looks ahead to see if it really is a quantifier or not.
322 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
323 where the ddds are digits.
326 p pointer to the first char after '{'
328 Returns: true or false
331 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
333 if (p >= patternEnd || !isASCIIDigit(*p))
336 while (p < patternEnd && isASCIIDigit(*p))
338 if (p < patternEnd && *p == '}')
341 if (p >= patternEnd || *p++ != ',')
343 if (p < patternEnd && *p == '}')
346 if (p >= patternEnd || !isASCIIDigit(*p))
349 while (p < patternEnd && isASCIIDigit(*p))
352 return (p < patternEnd && *p == '}');
355 /*************************************************
356 * Read repeat counts *
357 *************************************************/
359 /* Read an item of the form {n,m} and return the values. This is called only
360 after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
361 so the syntax is guaranteed to be correct, but we need to check the values.
364 p pointer to first char after '{'
365 minp pointer to int for min
366 maxp pointer to int for max
367 returned as -1 if no max
368 errorCodePtr points to error code variable
370 Returns: pointer to '}' on success;
371 current ptr on error, with errorCodePtr set non-zero
374 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorCodePtr)
379 /* Read the minimum value and do a paranoid check: a negative value indicates
380 an integer overflow. */
382 while (isASCIIDigit(*p))
383 min = min * 10 + *p++ - '0';
384 if (min < 0 || min > 65535) {
385 *errorCodePtr = ERR5;
389 /* Read the maximum value if there is one, and again do a paranoid on its size.
390 Also, max must not be less than min. */
397 while (isASCIIDigit(*p))
398 max = max * 10 + *p++ - '0';
399 if (max < 0 || max > 65535) {
400 *errorCodePtr = ERR5;
404 *errorCodePtr = ERR4;
410 /* Fill in the required variables, and pass back the pointer to the terminating
418 /*************************************************
419 * Find first significant op code *
420 *************************************************/
422 /* This is called by several functions that scan a compiled expression looking
423 for a fixed first character, or an anchoring op code etc. It skips over things
424 that do not influence this.
427 code pointer to the start of the group
428 Returns: pointer to the first significant opcode
431 static const unsigned char* firstSignificantOpcode(const unsigned char* code)
433 while (*code == OP_BRANUMBER)
438 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
443 advanceToEndOfBracket(code);
444 code += 1 + LINK_SIZE;
446 case OP_WORD_BOUNDARY:
447 case OP_NOT_WORD_BOUNDARY:
459 /*************************************************
460 * Get othercase range *
461 *************************************************/
463 /* This function is passed the start and end of a class range, in UTF-8 mode
464 with UCP support. It searches up the characters, looking for internal ranges of
465 characters in the "other" case. Each call returns the next one, updating the
469 cptr points to starting character value; updated
471 ocptr where to put start of othercase range
472 odptr where to put end of othercase range
474 Yield: true when range returned; false when no more
477 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
479 int c, othercase = 0;
481 for (c = *cptr; c <= d; c++) {
482 if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0)
490 int next = othercase + 1;
492 for (++c; c <= d; c++) {
493 if (jsc_pcre_ucp_othercase(c) != next)
504 /*************************************************
505 * Convert character value to UTF-8 *
506 *************************************************/
508 /* This function takes an integer value in the range 0 - 0x7fffffff
509 and encodes it as a UTF-8 character in 0 to 6 bytes.
512 cvalue the character value
513 buffer pointer to buffer for result - at least 6 bytes long
515 Returns: number of characters placed in the buffer
518 static int encodeUTF8(int cvalue, unsigned char *buffer)
521 for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
522 if (cvalue <= jsc_pcre_utf8_table1[i])
525 for (int j = i; j > 0; j--) {
526 *buffer-- = 0x80 | (cvalue & 0x3f);
529 *buffer = jsc_pcre_utf8_table2[i] | cvalue;
533 /*************************************************
534 * Compile one branch *
535 *************************************************/
537 /* Scan the pattern, compiling it into the code vector.
540 options the option bits
541 brackets points to number of extracting brackets used
542 codePtr points to the pointer to the current code point
543 ptrPtr points to the current pattern pointer
544 errorCodePtr points to error code variable
545 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
546 reqbyteptr set to the last literal character required, else < 0
547 cd contains pointers to tables etc.
549 Returns: true on success
550 false, with *errorCodePtr set non-zero on error
553 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
555 return ((ptr + 1 < patternEnd) && ptr[1] == expected);
559 compileBranch(int options, int* brackets, unsigned char** codePtr,
560 const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int *firstbyteptr,
561 int* reqbyteptr, CompileData& cd)
563 int repeatType, opType;
564 int repeatMin = 0, repeat_max = 0; /* To please picky compilers */
566 int reqvary, tempreqvary;
568 unsigned char* code = *codePtr;
569 unsigned char* tempcode;
570 bool didGroupSetFirstByte = false;
571 const UChar* ptr = *ptrPtr;
572 const UChar* tempptr;
573 unsigned char* previous = NULL;
574 unsigned char classbits[32];
577 unsigned char* class_utf8data;
578 unsigned char utf8_char[6];
580 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
581 matching encountered yet". It gets changed to REQ_NONE if we hit something that
582 matches a non-fixed char first char; reqByte just remains unset if we never
585 When we hit a repeat whose minimum is zero, we may have to adjust these values
586 to take the zero repeat into account. This is implemented by setting them to
587 zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual
588 item types that can be repeated set these backoff variables appropriately. */
590 int firstByte = REQ_UNSET;
591 int reqByte = REQ_UNSET;
592 int zeroReqByte = REQ_UNSET;
593 int zeroFirstByte = REQ_UNSET;
595 /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero,
596 according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
597 value > 255. It is added into the firstByte or reqByte variables to record the
598 case status of the value. This is used only for ASCII characters. */
600 int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
602 /* Switch on next character until the end of the branch */
606 bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
613 unsigned char mcbuffer[8];
615 /* Next byte in the pattern */
617 c = ptr < patternEnd ? *ptr : 0;
619 /* Fill in length of a previous callout, except when the next thing is
622 bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));
625 /* The branch terminates at end of string, |, or ). */
628 if (ptr < patternEnd)
630 // End of string; fall through
633 *firstbyteptr = firstByte;
634 *reqbyteptr = reqByte;
639 /* Handle single-character metacharacters. In multiline mode, ^ disables
640 the setting of any following char as a first character. */
643 if (options & MatchAcrossMultipleLinesOption) {
644 if (firstByte == REQ_UNSET)
645 firstByte = REQ_NONE;
654 if (options & MatchAcrossMultipleLinesOption)
660 /* There can never be a first char if '.' is first, whatever happens about
661 repeats. The value of reqByte doesn't change either. */
664 if (firstByte == REQ_UNSET)
665 firstByte = REQ_NONE;
666 zeroFirstByte = firstByte;
667 zeroReqByte = reqByte;
669 *code++ = OP_NOT_NEWLINE;
672 /* Character classes. If the included characters are all < 256, we build a
673 32-byte bitmap of the permitted characters, except in the special case
674 where there is only one such character. For negated classes, we build the
675 map as usual, then invert it at the end. However, we use a different opcode
676 so that data characters > 255 can be handled correctly.
678 If the class contains characters outside the 0-255 range, a different
679 opcode is compiled. It may optionally have a bit map for characters < 256,
680 but those above are are explicitly listed afterwards. A flag byte tells
681 whether the bitmap is present, and whether this is a negated class or not.
686 shouldFlipNegation = false;
688 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
689 they are encountered at the top level, so we'll do that too. */
691 /* If the first character is '^', set the negation flag and skip it. */
693 if (ptr + 1 >= patternEnd) {
694 *errorCodePtr = ERR6;
704 /* Keep a count of chars with values < 256 so that we can optimize the case
705 of just a single character (as long as it's < 256). For higher valued UTF-8
706 characters, we don't yet do any optimization. */
711 class_utf8 = false; /* No chars >= 256 */
712 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
714 /* Initialize the 32-char bit map to all zeros. We have to build the
715 map in a temporary bit of store, in case the class contains only 1
716 character (< 256), because in that case the compiled code doesn't use the
719 memset(classbits, 0, 32 * sizeof(unsigned char));
721 /* Process characters until ] is reached. The first pass
722 through the regex checked the overall syntax, so we don't need to be very
723 strict here. At the start of the loop, c contains the first byte of the
726 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
727 /* Backslash may introduce a single character, or it may introduce one
728 of the specials, which just set a flag. Escaped items are checked for
729 validity in the pre-compiling pass. The sequence \b is a special case.
730 Inside a class (and only there) it is treated as backspace. Elsewhere
731 it marks a word boundary. Other escapes have preset maps ready to
732 or into the one we are building. We assume they have more than one
733 character in them, so set classCharCount bigger than one. */
736 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
738 classCharCount += 2; /* Greater than 1 is what matters */
741 for (c = 0; c < 32; c++)
742 classbits[c] |= classBitmapForChar(c + cbit_digit);
746 shouldFlipNegation = true;
747 for (c = 0; c < 32; c++)
748 classbits[c] |= ~classBitmapForChar(c + cbit_digit);
752 for (c = 0; c < 32; c++)
753 classbits[c] |= classBitmapForChar(c + cbit_word);
757 shouldFlipNegation = true;
758 for (c = 0; c < 32; c++)
759 classbits[c] |= ~classBitmapForChar(c + cbit_word);
763 for (c = 0; c < 32; c++)
764 classbits[c] |= classBitmapForChar(c + cbit_space);
768 shouldFlipNegation = true;
769 for (c = 0; c < 32; c++)
770 classbits[c] |= ~classBitmapForChar(c + cbit_space);
773 /* Unrecognized escapes are faulted if PCRE is running in its
774 strict mode. By default, for compatibility with Perl, they are
775 treated as literals. */
778 c = *ptr; /* The final character */
779 classCharCount -= 2; /* Undo the default count from above */
783 /* Fall through if we have a single character (c >= 0). This may be
784 > 256 in UTF-8 mode. */
786 } /* End of backslash handling */
788 /* A single character may be followed by '-' to form a range. However,
789 Perl does not permit ']' to be the end of the range. A '-' character
790 here is treated as a literal. */
792 if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
797 /* The second part of a range can be a single-character escape, but
798 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
799 in such circumstances. */
802 const UChar* oldptr = ptr;
803 d = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
805 /* \X is literal X; any other special means the '-' was literal */
808 goto LONE_SINGLE_CHARACTER; /* A few lines below */
812 /* The check that the two values are in the correct order happens in
813 the pre-pass. Optimize one-character ranges */
816 goto LONE_SINGLE_CHARACTER; /* A few lines below */
818 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
819 matching, we have to use an XCLASS with extra data items. Caseless
820 matching for characters > 127 is available only if UCP support is
823 if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
826 /* With UCP support, we can find the other case equivalents of
827 the relevant characters. There may be several ranges. Optimize how
828 they fit with the basic range. */
830 if (options & IgnoreCaseOption) {
834 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
835 if (occ >= c && ocd <= d)
836 continue; /* Skip embedded ranges */
838 if (occ < c && ocd >= c - 1) /* Extend the basic range */
839 { /* if there is overlap, */
840 c = occ; /* noting that if occ < c */
841 continue; /* we can't have ocd > d */
842 } /* because a subrange is */
843 if (ocd > d && occ <= d + 1) /* always shorter than */
844 { /* the basic range. */
850 *class_utf8data++ = XCL_SINGLE;
852 *class_utf8data++ = XCL_RANGE;
853 class_utf8data += encodeUTF8(occ, class_utf8data);
855 class_utf8data += encodeUTF8(ocd, class_utf8data);
859 /* Now record the original range, possibly modified for UCP caseless
860 overlapping ranges. */
862 *class_utf8data++ = XCL_RANGE;
863 class_utf8data += encodeUTF8(c, class_utf8data);
864 class_utf8data += encodeUTF8(d, class_utf8data);
866 /* With UCP support, we are done. Without UCP support, there is no
867 caseless matching for UTF-8 characters > 127; we can use the bit map
868 for the smaller ones. */
870 continue; /* With next character in the class */
873 /* We use the bit map for all cases when not in UTF-8 mode; else
874 ranges that lie entirely within 0-127 when there is UCP support; else
875 for partial ranges without UCP support. */
877 for (; c <= d; c++) {
878 classbits[c/8] |= (1 << (c&7));
879 if (options & IgnoreCaseOption) {
880 int uc = flipCase(c);
881 classbits[uc/8] |= (1 << (uc&7));
883 classCharCount++; /* in case a one-char range */
887 continue; /* Go get the next char in the class */
890 /* Handle a lone single character - we can get here for a normal
891 non-escape char, or after \ that introduces a single character or for an
892 apparent range that isn't. */
894 LONE_SINGLE_CHARACTER:
896 /* Handle a character that cannot go in the bit map */
898 if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
900 *class_utf8data++ = XCL_SINGLE;
901 class_utf8data += encodeUTF8(c, class_utf8data);
903 if (options & IgnoreCaseOption) {
905 if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0) {
906 *class_utf8data++ = XCL_SINGLE;
907 class_utf8data += encodeUTF8(othercase, class_utf8data);
911 /* Handle a single-byte character */
912 classbits[c/8] |= (1 << (c&7));
913 if (options & IgnoreCaseOption) {
915 classbits[c/8] |= (1 << (c&7));
922 /* If classCharCount is 1, we saw precisely one character whose value is
923 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
924 can optimize the negative case only if there were no characters >= 128
925 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
926 single-bytes only. This is an historical hangover. Maybe one day we can
927 tidy these opcodes to handle multi-byte characters.
929 The optimization throws away the bit map. We turn the item into a
930 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
931 that OP_NOT does not support multibyte characters. In the positive case, it
932 can cause firstByte to be set. Otherwise, there can be no first char if
933 this item is first, whatever repeat count may follow. In the case of
934 reqByte, save the previous value for reinstating. */
936 if (classCharCount == 1 && (!class_utf8 && (!negateClass || classLastChar < 128))) {
937 zeroReqByte = reqByte;
939 /* The OP_NOT opcode works on one-byte characters only. */
942 if (firstByte == REQ_UNSET)
943 firstByte = REQ_NONE;
944 zeroFirstByte = firstByte;
946 *code++ = classLastChar;
950 /* For a single, positive character, get the value into c, and
951 then we can handle this with the normal one-character code. */
955 } /* End of 1-char optimization */
957 /* The general case - not the one-char optimization. If this is the first
958 thing in the branch, there can be no first char setting, whatever the
959 repeat count. Any reqByte setting must remain unchanged after any kind of
962 if (firstByte == REQ_UNSET) firstByte = REQ_NONE;
963 zeroFirstByte = firstByte;
964 zeroReqByte = reqByte;
966 /* If there are characters with values > 255, we have to compile an
967 extended class, with its own opcode. If there are no characters < 256,
968 we can omit the bitmap. */
970 if (class_utf8 && !shouldFlipNegation) {
971 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
974 *code = negateClass? XCL_NOT : 0;
976 /* If the map is required, install it, and move on to the end of
979 if (classCharCount > 0) {
981 memcpy(code, classbits, 32);
982 code = class_utf8data;
985 /* If the map is not required, slide down the extra data. */
988 int len = class_utf8data - (code + 33);
989 memmove(code + 1, code + 33, len);
993 /* Now fill in the complete length of the item */
995 putLinkValue(previous + 1, code - previous);
996 break; /* End of class handling */
999 /* If there are no characters > 255, negate the 32-byte map if necessary,
1000 and copy it into the code vector. If this is the first thing in the branch,
1001 there can be no first char setting, whatever the repeat count. Any reqByte
1002 setting must remain unchanged after any kind of repeat. */
1004 *code++ = (negateClass == shouldFlipNegation) ? OP_CLASS : OP_NCLASS;
1006 for (c = 0; c < 32; c++)
1007 code[c] = ~classbits[c];
1009 memcpy(code, classbits, 32);
1014 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1015 has been tested above. */
1020 ptr = readRepeatCounts(ptr + 1, &repeatMin, &repeat_max, errorCodePtr);
1041 *errorCodePtr = ERR9;
1045 if (repeatMin == 0) {
1046 firstByte = zeroFirstByte; /* Adjust for zero repeat */
1047 reqByte = zeroReqByte; /* Ditto */
1050 /* Remember whether this is a variable length repeat */
1052 reqvary = (repeatMin == repeat_max) ? 0 : REQ_VARY;
1054 opType = 0; /* Default single-char op codes */
1056 /* Save start of previous item, in case we have to move it up to make space
1057 for an inserted OP_ONCE for the additional '+' extension. */
1058 /* FIXME: Probably don't need this because we don't use OP_ONCE. */
1060 tempcode = previous;
1062 /* If the next character is '+', we have a possessive quantifier. This
1063 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1064 If the next character is '?' this is a minimizing repeat, by default,
1065 but if PCRE_UNGREEDY is set, it works the other way round. We change the
1066 repeat type to the non-default. */
1068 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
1074 /* If previous was a character match, abolish the item and generate a
1075 repeat item instead. If a char item has a minumum of more than one, ensure
1076 that it is set in reqByte - it might not be if a sequence such as x{3} is
1077 the first thing in a branch because the x will have gone into firstByte
1080 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
1081 /* Deal with UTF-8 characters that take up more than one byte. It's
1082 easier to write this out separately than try to macrify it. Use c to
1083 hold the length of the character in bytes, plus 0x80 to flag that it's a
1084 length rather than a small character. */
1086 if (code[-1] & 0x80) {
1087 unsigned char *lastchar = code - 1;
1088 while((*lastchar & 0xc0) == 0x80)
1090 c = code - lastchar; /* Length of UTF-8 character */
1091 memcpy(utf8_char, lastchar, c); /* Save the char */
1092 c |= 0x80; /* Flag c as a length */
1097 reqByte = c | reqCaseOpt | cd.reqVaryOpt;
1100 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1103 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
1106 reqByte = c | reqCaseOpt | cd.reqVaryOpt;
1107 goto OUTPUT_SINGLE_REPEAT;
1110 /* If previous was a single negated character ([^a] or similar), we use
1111 one of the special opcodes, replacing it. The code is shared with single-
1112 character repeats by setting opt_type to add a suitable offset into
1113 repeatType. OP_NOT is currently used only for single-byte chars. */
1115 else if (*previous == OP_NOT) {
1116 opType = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1118 goto OUTPUT_SINGLE_REPEAT;
1121 /* If previous was a character type match (\d or similar), abolish it and
1122 create a suitable repeat item. The code is shared with single-character
1123 repeats by setting opType to add a suitable offset into repeatType. */
1125 else if (*previous <= OP_NOT_NEWLINE) {
1126 opType = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1129 OUTPUT_SINGLE_REPEAT:
1131 int prop_value = -1;
1133 unsigned char* oldcode = code;
1134 code = previous; /* Usually overwrite previous item */
1136 /* If the maximum is zero then the minimum must also be zero; Perl allows
1137 this case, so we do too - by simply omitting the item altogether. */
1139 if (repeat_max == 0)
1142 /* Combine the opType with the repeatType */
1144 repeatType += opType;
1146 /* A minimum of zero is handled either as the special case * or ?, or as
1147 an UPTO, with the maximum given. */
1149 if (repeatMin == 0) {
1150 if (repeat_max == -1)
1151 *code++ = OP_STAR + repeatType;
1152 else if (repeat_max == 1)
1153 *code++ = OP_QUERY + repeatType;
1155 *code++ = OP_UPTO + repeatType;
1156 put2ByteValueAndAdvance(code, repeat_max);
1160 /* A repeat minimum of 1 is optimized into some special cases. If the
1161 maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
1162 left in place and, if the maximum is greater than 1, we use OP_UPTO with
1163 one less than the maximum. */
1165 else if (repeatMin == 1) {
1166 if (repeat_max == -1)
1167 *code++ = OP_PLUS + repeatType;
1169 code = oldcode; /* leave previous item in place */
1170 if (repeat_max == 1)
1172 *code++ = OP_UPTO + repeatType;
1173 put2ByteValueAndAdvance(code, repeat_max - 1);
1177 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1178 handled as an EXACT followed by an UPTO. */
1181 *code++ = OP_EXACT + opType; /* NB EXACT doesn't have repeatType */
1182 put2ByteValueAndAdvance(code, repeatMin);
1184 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
1185 we have to insert the character for the previous code. For a repeated
1186 Unicode property match, there are two extra bytes that define the
1187 required property. In UTF-8 mode, long characters have their length in
1188 c, with the 0x80 bit as a flag. */
1190 if (repeat_max < 0) {
1192 memcpy(code, utf8_char, c & 7);
1196 if (prop_type >= 0) {
1197 *code++ = prop_type;
1198 *code++ = prop_value;
1201 *code++ = OP_STAR + repeatType;
1204 /* Else insert an UPTO if the max is greater than the min, again
1205 preceded by the character, for the previously inserted code. */
1207 else if (repeat_max != repeatMin) {
1209 memcpy(code, utf8_char, c & 7);
1213 if (prop_type >= 0) {
1214 *code++ = prop_type;
1215 *code++ = prop_value;
1217 repeat_max -= repeatMin;
1218 *code++ = OP_UPTO + repeatType;
1219 put2ByteValueAndAdvance(code, repeat_max);
1223 /* The character or character type itself comes last in all cases. */
1226 memcpy(code, utf8_char, c & 7);
1231 /* For a repeated Unicode property match, there are two extra bytes that
1232 define the required property. */
1234 if (prop_type >= 0) {
1235 *code++ = prop_type;
1236 *code++ = prop_value;
1240 /* If previous was a character class or a back reference, we put the repeat
1241 stuff after it, but just skip the item if the repeat was {0,0}. */
1243 else if (*previous == OP_CLASS ||
1244 *previous == OP_NCLASS ||
1245 *previous == OP_XCLASS ||
1246 *previous == OP_REF)
1248 if (repeat_max == 0) {
1253 if (repeatMin == 0 && repeat_max == -1)
1254 *code++ = OP_CRSTAR + repeatType;
1255 else if (repeatMin == 1 && repeat_max == -1)
1256 *code++ = OP_CRPLUS + repeatType;
1257 else if (repeatMin == 0 && repeat_max == 1)
1258 *code++ = OP_CRQUERY + repeatType;
1260 *code++ = OP_CRRANGE + repeatType;
1261 put2ByteValueAndAdvance(code, repeatMin);
1262 if (repeat_max == -1)
1263 repeat_max = 0; /* 2-byte encoding for max */
1264 put2ByteValueAndAdvance(code, repeat_max);
1268 /* If previous was a bracket group, we may have to replicate it in certain
1271 else if (*previous >= OP_BRA) {
1273 int len = code - previous;
1274 unsigned char* bralink = NULL;
1275 int nested = get2ByteValue(previous + 1 + LINK_SIZE);
1277 /* If the maximum repeat count is unlimited, find the end of the bracket
1278 by scanning through from the start, and compute the offset back to it
1279 from the current code pointer. There may be an OP_OPT setting following
1280 the final KET, so we can't find the end just by going back from the code
1283 if (repeat_max == -1) {
1284 const unsigned char* ket = previous;
1285 advanceToEndOfBracket(ket);
1286 ketoffset = code - ket;
1289 /* The case of a zero minimum is special because of the need to stick
1290 OP_BRAZERO in front of it, and because the group appears once in the
1291 data, whereas in other cases it appears the minimum number of times. For
1292 this reason, it is simplest to treat this case separately, as otherwise
1293 the code gets far too messy. There are several special subcases when the
1296 if (repeatMin == 0) {
1297 /* If the maximum is also zero, we just omit the group from the output
1300 if (repeat_max == 0) {
1305 /* If the maximum is 1 or unlimited, we just have to stick in the
1306 BRAZERO and do no more at this point. However, we do need to adjust
1307 any OP_RECURSE calls inside the group that refer to the group itself or
1308 any internal group, because the offset is from the start of the whole
1309 regex. Temporarily terminate the pattern while doing this. */
1311 if (repeat_max <= 1) {
1313 memmove(previous+1, previous, len);
1315 *previous++ = OP_BRAZERO + repeatType;
1318 /* If the maximum is greater than 1 and limited, we have to replicate
1319 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1320 The first one has to be handled carefully because it's the original
1321 copy, which has to be moved up. The remainder can be handled by code
1322 that is common with the non-zero minimum case below. We have to
1323 adjust the value of repeat_max, since one less copy is required. */
1327 memmove(previous + 4 + LINK_SIZE, previous, len);
1328 code += 4 + LINK_SIZE;
1329 *previous++ = OP_BRAZERO + repeatType;
1330 *previous++ = OP_BRA;
1332 /* We chain together the bracket offset fields that have to be
1333 filled in later when the ends of the brackets are reached. */
1335 int offset = (!bralink) ? 0 : previous - bralink;
1337 putLinkValueAllowZeroAndAdvance(previous, offset);
1338 put2ByteValueAndAdvance(previous, nested);
1344 /* If the minimum is greater than zero, replicate the group as many
1345 times as necessary, and adjust the maximum to the number of subsequent
1346 copies that we need. If we set a first char from the group, and didn't
1347 set a required char, copy the latter from the former. */
1350 if (repeatMin > 1) {
1351 if (didGroupSetFirstByte && reqByte < 0)
1352 reqByte = firstByte;
1353 for (int i = 1; i < repeatMin; i++) {
1354 memcpy(code, previous, len);
1359 repeat_max -= repeatMin;
1362 /* This code is common to both the zero and non-zero minimum cases. If
1363 the maximum is limited, it replicates the group in a nested fashion,
1364 remembering the bracket starts on a stack. In the case of a zero minimum,
1365 the first one was set up above. In all cases the repeat_max now specifies
1366 the number of additional copies needed. */
1368 if (repeat_max >= 0) {
1369 for (int i = repeat_max - 1; i >= 0; i--) {
1370 *code++ = OP_BRAZERO + repeatType;
1372 /* All but the final copy start a new nesting, maintaining the
1373 chain of brackets outstanding. */
1377 int offset = (!bralink) ? 0 : code - bralink;
1379 putLinkValueAllowZeroAndAdvance(code, offset);
1380 put2ByteValueAndAdvance(code, nested);
1383 memcpy(code, previous, len);
1387 /* Now chain through the pending brackets, and fill in their length
1388 fields (which are holding the chain links pro tem). */
1391 int offset = code - bralink + 1;
1392 unsigned char* bra = code - offset;
1393 int oldlinkoffset = getLinkValueAllowZero(bra + 1);
1394 bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset;
1396 putLinkValueAndAdvance(code, offset);
1397 putLinkValue(bra + 1, offset);
1401 /* If the maximum is unlimited, set a repeater in the final copy. We
1402 can't just offset backwards from the current code point, because we
1403 don't know if there's been an options resetting after the ket. The
1404 correct offset was computed above. */
1407 code[-ketoffset] = OP_KETRMAX + repeatType;
1410 // A quantifier after an assertion is mostly meaningless, but it
1411 // can nullify the assertion if it has a 0 minimum.
1412 else if (*previous == OP_ASSERT || *previous == OP_ASSERT_NOT) {
1413 if (repeatMin == 0) {
1419 /* Else there's some kind of shambles */
1422 *errorCodePtr = ERR11;
1426 /* In all case we no longer have a previous item. We also set the
1427 "follows varying string" flag for subsequently encountered reqbytes if
1428 it isn't already set and we have just passed a varying length item. */
1432 cd.reqVaryOpt |= reqvary;
1435 /* Start of nested bracket sub-expression, or comment or lookahead or
1436 lookbehind or option setting or condition. First deal with special things
1437 that can come after a bracket; all are introduced by ?, and the appearance
1438 of any of them means that this is not a referencing group. They were
1439 checked for validity in the first pass over the string, so we don't have to
1440 check for syntax errors here. */
1445 unsigned minBracket = *brackets + 1;
1446 if (*(++ptr) == '?') {
1448 case ':': /* Non-extracting bracket */
1453 case '=': /* Positive lookahead */
1454 bravalue = OP_ASSERT;
1458 case '!': /* Negative lookahead */
1459 bravalue = OP_ASSERT_NOT;
1463 /* Character after (? not specially recognized */
1466 *errorCodePtr = ERR12;
1471 /* Else we have a referencing group; adjust the opcode. If the bracket
1472 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1473 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1476 if (++(*brackets) > EXTRACT_BASIC_MAX) {
1477 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1478 code[3 + LINK_SIZE] = OP_BRANUMBER;
1479 put2ByteValue(code + 4 + LINK_SIZE, *brackets);
1483 bravalue = OP_BRA + *brackets;
1486 /* Process nested bracketed re. We copy code into a non-variable
1487 in order to be able to pass its address because some compilers
1488 complain otherwise. Pass in a new setting for the ims options
1489 if they have changed. */
1494 tempreqvary = cd.reqVaryOpt; /* Save value before bracket */
1496 unsigned bracketsBeforeRecursion = *brackets;
1497 if (!compileBracket(
1499 brackets, /* Extracting bracket count */
1500 &tempcode, /* Where to put code (updated) */
1501 &ptr, /* Input pointer (updated) */
1503 errorCodePtr, /* Where to put an error message */
1504 skipBytes, /* Skip over OP_BRANUMBER */
1505 &subFirstByte, /* For possible first char */
1506 &subReqByte, /* For possible last char */
1507 cd)) /* Tables block */
1509 unsigned enclosedBrackets = (*brackets - bracketsBeforeRecursion);
1510 unsigned limitBracket = minBracket + enclosedBrackets + (bravalue > OP_BRA);
1511 if (!((minBracket & 0xff) == minBracket && (limitBracket & 0xff) == limitBracket)) {
1512 *errorCodePtr = ERR17;
1515 JS_ASSERT(minBracket <= limitBracket);
1516 put2ByteValue(code + 1 + LINK_SIZE, minBracket << 8 | limitBracket);
1519 /* At the end of compiling, code is still pointing to the start of the
1520 group, while tempcode has been updated to point past the end of the group
1521 and any option resetting that may follow it. The pattern pointer (ptr)
1522 is on the bracket. */
1524 /* Handle updating of the required and first characters. Update for normal
1525 brackets of all kinds, and conditions with two branches (see code above).
1526 If the bracket is followed by a quantifier with zero repeat, we have to
1527 back off. Hence the definition of zeroReqByte and zeroFirstByte outside the
1528 main loop so that they can be accessed for the back off. */
1530 zeroReqByte = reqByte;
1531 zeroFirstByte = firstByte;
1532 didGroupSetFirstByte = false;
1534 if (bravalue >= OP_BRA) {
1535 /* If we have not yet set a firstByte in this branch, take it from the
1536 subpattern, remembering that it was set here so that a repeat of more
1537 than one can replicate it as reqByte if necessary. If the subpattern has
1538 no firstByte, set "none" for the whole branch. In both cases, a zero
1539 repeat forces firstByte to "none". */
1541 if (firstByte == REQ_UNSET) {
1542 if (subFirstByte >= 0) {
1543 firstByte = subFirstByte;
1544 didGroupSetFirstByte = true;
1547 firstByte = REQ_NONE;
1548 zeroFirstByte = REQ_NONE;
1551 /* If firstByte was previously set, convert the subpattern's firstByte
1552 into reqByte if there wasn't one, using the vary flag that was in
1553 existence beforehand. */
1555 else if (subFirstByte >= 0 && subReqByte < 0)
1556 subReqByte = subFirstByte | tempreqvary;
1558 /* If the subpattern set a required byte (or set a first byte that isn't
1559 really the first byte - see above), set it. */
1561 if (subReqByte >= 0)
1562 reqByte = subReqByte;
1565 /* For a forward assertion, we take the reqByte, if set. This can be
1566 helpful if the pattern that follows the assertion doesn't set a different
1567 char. For example, it's useful for /(?=abcde).+/. We can't set firstByte
1568 for an assertion, however because it leads to incorrect effect for patterns
1569 such as /(?=a)a.+/ when the "real" "a" would then become a reqByte instead
1570 of a firstByte. This is overcome by a scan at the end if there's no
1571 firstByte, looking for an asserted first char. */
1573 else if (bravalue == OP_ASSERT && subReqByte >= 0)
1574 reqByte = subReqByte;
1576 /* Now update the main code pointer to the end of the group. */
1580 /* Error if hit end of pattern */
1582 if (ptr >= patternEnd || *ptr != ')') {
1583 *errorCodePtr = ERR14;
1589 /* Check \ for being a real metacharacter; if not, fall through and handle
1590 it as a data character at the start of a string. Escape items are checked
1591 for validity in the pre-compiling pass. */
1595 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, false);
1597 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1598 are arranged to be the negation of the corresponding OP_values. For the
1599 back references, the values are ESC_REF plus the reference number. Only
1600 back references and those types that consume a character may be repeated.
1601 We can test for values between ESC_b and ESC_w for the latter; this may
1602 have to change if any new ones are ever created. */
1605 /* For metasequences that actually match a character, we disable the
1606 setting of a first character if it hasn't already been set. */
1608 if (firstByte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1609 firstByte = REQ_NONE;
1611 /* Set values to reset to if this is followed by a zero repeat. */
1613 zeroFirstByte = firstByte;
1614 zeroReqByte = reqByte;
1616 /* Back references are handled specially */
1618 if (-c >= ESC_REF) {
1619 int number = -c - ESC_REF;
1622 put2ByteValueAndAdvance(code, number);
1625 /* For the rest, we can obtain the OP value by negating the escape
1629 previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
1637 /* Handle a literal character. It is guaranteed not to be whitespace or #
1638 when the extended flag is set. If we are in UTF-8 mode, it may be a
1639 multi-byte literal character. */
1650 if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
1651 *code++ = OP_ASCII_LETTER_IGNORING_CASE;
1654 *code++ = OP_ASCII_CHAR;
1658 mcLength = encodeUTF8(c, mcbuffer);
1660 *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
1661 for (c = 0; c < mcLength; c++)
1662 *code++ = mcbuffer[c];
1665 /* Set the first and required bytes appropriately. If no previous first
1666 byte, set it from this character, but revert to none on a zero repeat.
1667 Otherwise, leave the firstByte value alone, and don't change it on a zero
1670 if (firstByte == REQ_UNSET) {
1671 zeroFirstByte = REQ_NONE;
1672 zeroReqByte = reqByte;
1674 /* If the character is more than one byte long, we can set firstByte
1675 only if it is not to be matched caselessly. */
1677 if (mcLength == 1 || reqCaseOpt == 0) {
1678 firstByte = mcbuffer[0] | reqCaseOpt;
1680 reqByte = code[-1] | cd.reqVaryOpt;
1683 firstByte = reqByte = REQ_NONE;
1686 /* firstByte was previously set; we can set reqByte only the length is
1687 1 or the matching is caseful. */
1690 zeroFirstByte = firstByte;
1691 zeroReqByte = reqByte;
1692 if (mcLength == 1 || reqCaseOpt == 0)
1693 reqByte = code[-1] | reqCaseOpt | cd.reqVaryOpt;
1696 break; /* End of literal character handling */
1698 } /* end of big loop */
1700 /* Control never reaches here by falling through, only by a goto for all the
1701 error states. Pass back the position in the pattern so that it can be displayed
1702 to the user for diagnosing the error. */
1709 /*************************************************
1710 * Compile sequence of alternatives *
1711 *************************************************/
1713 /* On entry, ptr is pointing past the bracket character, but on return
1714 it points to the closing bracket, or vertical bar, or end of string.
1715 The code variable is pointing at the byte into which the BRA operator has been
1716 stored. If the ims options are changed at the start (for a (?ims: group) or
1717 during any branch, we need to insert an OP_OPT item at the start of every
1718 following branch to ensure they get set correctly at run time, and also pass
1719 the new options into every subsequent branch compile.
1722 options option bits, including any changes for this subpattern
1723 brackets -> int containing the number of extracting brackets used
1724 codePtr -> the address of the current code pointer
1725 ptrPtr -> the address of the current pattern pointer
1726 errorCodePtr -> pointer to error code variable
1727 skipBytes skip this many bytes at start (for OP_BRANUMBER)
1728 firstbyteptr place to put the first required character, or a negative number
1729 reqbyteptr place to put the last required character, or a negative number
1730 cd points to the data block with tables pointers etc.
1732 Returns: true on success
1736 compileBracket(int options, int* brackets, unsigned char** codePtr,
1737 const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int skipBytes,
1738 int* firstbyteptr, int* reqbyteptr, CompileData& cd)
1740 const UChar* ptr = *ptrPtr;
1741 unsigned char* code = *codePtr;
1742 unsigned char* lastBranch = code;
1743 unsigned char* start_bracket = code;
1744 int firstByte = REQ_UNSET;
1745 int reqByte = REQ_UNSET;
1747 /* Offset is set zero to mark that this bracket is still open */
1749 putLinkValueAllowZero(code + 1, 0);
1750 code += 1 + LINK_SIZE + skipBytes;
1752 /* Loop for each alternative branch */
1755 /* Now compile the branch */
1757 int branchFirstByte;
1759 if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorCodePtr,
1760 &branchFirstByte, &branchReqByte, cd)) {
1765 /* If this is the first branch, the firstByte and reqByte values for the
1766 branch become the values for the regex. */
1768 if (*lastBranch != OP_ALT) {
1769 firstByte = branchFirstByte;
1770 reqByte = branchReqByte;
1773 /* If this is not the first branch, the first char and reqByte have to
1774 match the values from all the previous branches, except that if the previous
1775 value for reqByte didn't have REQ_VARY set, it can still match, and we set
1776 REQ_VARY for the regex. */
1779 /* If we previously had a firstByte, but it doesn't match the new branch,
1780 we have to abandon the firstByte for the regex, but if there was previously
1781 no reqByte, it takes on the value of the old firstByte. */
1783 if (firstByte >= 0 && firstByte != branchFirstByte) {
1785 reqByte = firstByte;
1786 firstByte = REQ_NONE;
1789 /* If we (now or from before) have no firstByte, a firstByte from the
1790 branch becomes a reqByte if there isn't a branch reqByte. */
1792 if (firstByte < 0 && branchFirstByte >= 0 && branchReqByte < 0)
1793 branchReqByte = branchFirstByte;
1795 /* Now ensure that the reqbytes match */
1797 if ((reqByte & ~REQ_VARY) != (branchReqByte & ~REQ_VARY))
1800 reqByte |= branchReqByte; /* To "or" REQ_VARY */
1803 /* Reached end of expression, either ')' or end of pattern. Go back through
1804 the alternative branches and reverse the chain of offsets, with the field in
1805 the BRA item now becoming an offset to the first alternative. If there are
1806 no alternatives, it points to the end of the group. The length in the
1807 terminating ket is always the length of the whole bracketed item.
1808 Return leaving the pointer at the terminating char. */
1810 if (ptr >= patternEnd || *ptr != '|') {
1811 int length = code - lastBranch;
1813 int prevLength = getLinkValueAllowZero(lastBranch + 1);
1814 putLinkValue(lastBranch + 1, length);
1815 length = prevLength;
1816 lastBranch -= length;
1817 } while (length > 0);
1819 /* Fill in the ket */
1822 putLinkValue(code + 1, code - start_bracket);
1823 code += 1 + LINK_SIZE;
1825 /* Set values to pass back */
1829 *firstbyteptr = firstByte;
1830 *reqbyteptr = reqByte;
1834 /* Another branch follows; insert an "or" node. Its length field points back
1835 to the previous branch while the bracket remains open. At the end the chain
1836 is reversed. It's done like this so that the start of the bracket has a
1837 zero offset until it is closed, making it possible to detect recursion. */
1840 putLinkValue(code + 1, code - lastBranch);
1842 code += 1 + LINK_SIZE;
1845 JS_NOT_REACHED("No fallthru.");
1848 /*************************************************
1849 * Check for anchored expression *
1850 *************************************************/
1852 /* Try to find out if this is an anchored regular expression. Consider each
1853 alternative branch. If they all start OP_CIRC, or with a bracket
1854 all of whose alternatives start OP_CIRC (recurse ad lib), then
1858 code points to start of expression (the bracket)
1859 captureMap a bitmap of which brackets we are inside while testing; this
1860 handles up to substring 31; all brackets after that share
1862 backrefMap the back reference bitmap
1865 static bool branchIsAnchored(const unsigned char* code)
1867 const unsigned char* scode = firstSignificantOpcode(code);
1871 if (op >= OP_BRA || op == OP_ASSERT)
1872 return bracketIsAnchored(scode);
1874 /* Check for explicit anchoring */
1875 return op == OP_CIRC;
1878 static bool bracketIsAnchored(const unsigned char* code)
1881 if (!branchIsAnchored(code + 1 + LINK_SIZE))
1883 code += getLinkValue(code + 1);
1884 } while (*code == OP_ALT); /* Loop for each alternative */
1888 /*************************************************
1889 * Check for starting with ^ or .* *
1890 *************************************************/
1892 /* This is called to find out if every branch starts with ^ or .* so that
1893 "first char" processing can be done to speed things up in multiline
1894 matching and for non-DOTALL patterns that start with .* (which must start at
1895 the beginning or after \n)
1897 Except when the .* appears inside capturing parentheses, and there is a
1898 subsequent back reference to those parentheses. By keeping a bitmap of the
1899 first 31 back references, we can catch some of the more common cases more
1900 precisely; all the greater back references share a single bit.
1903 code points to start of expression (the bracket)
1904 captureMap a bitmap of which brackets we are inside while testing; this
1905 handles up to substring 31; all brackets after that share
1907 backrefMap the back reference bitmap
1910 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1912 const unsigned char* scode = firstSignificantOpcode(code);
1915 /* Capturing brackets */
1917 int captureNum = op - OP_BRA;
1918 if (captureNum > EXTRACT_BASIC_MAX)
1919 captureNum = get2ByteValue(scode + 2 + LINK_SIZE);
1920 int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
1921 return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);
1924 /* Other brackets */
1925 if (op == OP_BRA || op == OP_ASSERT)
1926 return bracketNeedsLineStart(scode, captureMap, backrefMap);
1928 /* .* means "start at start or after \n" if it isn't in brackets that
1929 may be referenced. */
1931 if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1932 return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
1935 return op == OP_CIRC || op == OP_BOL;
1938 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1941 if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))
1943 code += getLinkValue(code + 1);
1944 } while (*code == OP_ALT); /* Loop for each alternative */
1948 /*************************************************
1949 * Check for asserted fixed first char *
1950 *************************************************/
1952 /* During compilation, the "first char" settings from forward assertions are
1953 discarded, because they can cause conflicts with actual literals that follow.
1954 However, if we end up without a first char setting for an unanchored pattern,
1955 it is worth scanning the regex to see if there is an initial asserted first
1956 char. If all branches start with the same asserted char, or with a bracket all
1957 of whose alternatives start with the same asserted char (recurse ad lib), then
1958 we return that char, otherwise -1.
1961 code points to start of expression (the bracket)
1962 options pointer to the options (used to check casing changes)
1963 inassert true if in an assertion
1965 Returns: -1 or the fixed first char
1968 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1970 const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
1982 return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
1989 case OP_CHAR_IGNORING_CASE:
1991 case OP_ASCII_LETTER_IGNORING_CASE:
2000 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
2004 int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert);
2011 code += getLinkValue(code + 1);
2012 } while (*code == OP_ALT);
2016 static inline int multiplyWithOverflowCheck(int a, int b)
2020 if (a > MAX_PATTERN_SIZE / b)
2025 static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
2026 CompileData& cd, ErrorCode& errorcode)
2028 /* Make a pass over the pattern to compute the
2029 amount of store required to hold the compiled code. This does not have to be
2030 perfect as long as errors are overestimates. */
2032 if (patternLength > MAX_PATTERN_SIZE) {
2037 int length = BRA_LEN; /* For initial BRA. */
2038 int branch_extra = 0;
2039 int lastitemlength = 0;
2040 unsigned brastackptr = 0;
2041 int brastack[BRASTACK_SIZE];
2042 unsigned char bralenstack[BRASTACK_SIZE];
2045 const UChar* ptr = (const UChar*)(pattern - 1);
2046 const UChar* patternEnd = (const UChar*)(pattern + patternLength);
2048 while (++ptr < patternEnd) {
2049 int minRepeats = 0, maxRepeats = 0;
2053 /* A backslashed item may be an escaped data character or it may be a
2057 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
2061 lastitemlength = 1; /* Default length of last item for repeats */
2063 if (c >= 0) { /* Data character */
2064 length += 2; /* For a one-byte character */
2068 for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
2069 if (c <= jsc_pcre_utf8_table1[i]) break;
2071 lastitemlength += i;
2077 /* Other escapes need one byte */
2081 /* A back reference needs an additional 2 bytes, plus either one or 5
2082 bytes for a repeat. We also need to keep the value of the highest
2085 if (c <= -ESC_REF) {
2086 int refnum = -c - ESC_REF;
2087 cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
2088 if (refnum > cd.topBackref)
2089 cd.topBackref = refnum;
2090 length += 2; /* For single back reference */
2091 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2092 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2095 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2096 (minRepeats == 1 && maxRepeats == -1))
2100 if (safelyCheckNextChar(ptr, patternEnd, '?'))
2106 case '^': /* Single-byte metacharacters */
2113 case '*': /* These repeats won't be after brackets; */
2114 case '+': /* those are handled separately */
2119 /* This covers the cases of braced repeats after a single char, metachar,
2120 class, or back reference. */
2123 if (!isCountedRepeat(ptr + 1, patternEnd))
2125 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode);
2129 /* These special cases just insert one extra opcode */
2131 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2132 (minRepeats == 1 && maxRepeats == -1))
2135 /* These cases might insert additional copies of a preceding character. */
2138 if (minRepeats != 1) {
2139 length -= lastitemlength; /* Uncount the original char or metachar */
2141 length += 5 + lastitemlength;
2143 length += lastitemlength + ((maxRepeats > 0) ? 5 : 1);
2146 if (safelyCheckNextChar(ptr, patternEnd, '?'))
2147 ptr++; /* Needs no extra length */
2149 POSSESSIVE: /* Test for possessive quantifier */
2150 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2152 length += 2 + 2 * LINK_SIZE; /* Allow for atomic brackets */
2156 /* An alternation contains an offset to the next branch or ket. If any ims
2157 options changed in the previous branch(es), and/or if we are in a
2158 lookbehind assertion, extra space will be needed at the start of the
2159 branch. This is handled by branch_extra. */
2162 if (brastackptr == 0)
2163 cd.needOuterBracket = true;
2164 length += 1 + LINK_SIZE + branch_extra;
2167 /* A character class uses 33 characters provided that all the character
2168 values are less than 256. Otherwise, it uses a bit map for low valued
2169 characters, and individual items for others. Don't worry about character
2170 types that aren't allowed in classes - they'll get picked up during the
2171 compile. A character class that contains only one single-byte character
2172 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2173 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
2177 if (*(++ptr) == '^') {
2178 class_optcount = 10; /* Greater than one */
2184 bool class_utf8 = false;
2186 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
2187 /* Check for escapes */
2190 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
2194 /* Handle escapes that turn into characters */
2197 goto NON_SPECIAL_CHARACTER;
2199 /* Escapes that are meta-things. The normal ones just affect the
2200 bit map, but Unicode properties require an XCLASS extended item. */
2203 class_optcount = 10; /* \d, \s etc; make sure > 1 */
2206 /* Anything else increments the possible optimization count. We have to
2207 detect ranges here so that we can compute the number of extra ranges for
2208 caseless wide characters when UCP support is available. If there are wide
2209 characters, we are going to have to use an XCLASS, even for single
2215 /* Come here from handling \ above when it escapes to a char value */
2217 NON_SPECIAL_CHARACTER:
2221 if (safelyCheckNextChar(ptr, patternEnd, '-')) {
2222 const UChar* hyptr = ptr++;
2223 if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
2225 d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
2229 else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
2232 ptr = hyptr; /* go back to hyphen as data */
2235 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2236 127 for caseless matching, we will need to use an XCLASS. */
2239 class_optcount = 10; /* Ensure > 1 */
2245 if ((d > 255 || (ignoreCase && d > 127))) {
2246 unsigned char buffer[6];
2247 if (!class_utf8) /* Allow for XCLASS overhead */
2250 length += LINK_SIZE + 2;
2253 /* If we have UCP support, find out how many extra ranges are
2254 needed to map the other case of characters within this range. We
2255 have to mimic the range optimization here, because extending the
2256 range upwards might push d over a boundary that makes it use
2257 another byte in the UTF-8 representation. */
2263 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
2264 if (occ >= c && ocd <= d)
2265 continue; /* Skip embedded */
2267 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2268 { /* if there is overlap, */
2269 c = occ; /* noting that if occ < c */
2270 continue; /* we can't have ocd > d */
2271 } /* because a subrange is */
2272 if (ocd > d && occ <= d + 1) /* always shorter than */
2273 { /* the basic range. */
2278 /* An extra item is needed */
2280 length += 1 + encodeUTF8(occ, buffer) +
2281 ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer));
2285 /* The length of the (possibly extended) range */
2287 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer);
2292 /* We have a single character. There is nothing to be done unless we
2293 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2294 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2298 if ((c > 255 || (ignoreCase && c > 127))) {
2299 unsigned char buffer[6];
2300 class_optcount = 10; /* Ensure > 1 */
2301 if (!class_utf8) /* Allow for XCLASS overhead */
2304 length += LINK_SIZE + 2;
2306 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer));
2312 if (ptr >= patternEnd) { /* Missing terminating ']' */
2317 /* We can optimize when there was only one optimizable character.
2318 Note that this does not detect the case of a negated single character.
2319 In that case we do an incorrect length computation, but it's not a serious
2320 problem because the computed length is too large rather than too small. */
2322 if (class_optcount == 1)
2325 /* Here, we handle repeats for the class opcodes. */
2329 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2330 we also need extra for wrapping the whole thing in a sub-pattern. */
2332 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2333 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2336 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2337 (minRepeats == 1 && maxRepeats == -1))
2341 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2343 length += 2 + 2 * LINK_SIZE;
2344 } else if (safelyCheckNextChar(ptr, patternEnd, '?'))
2351 /* Brackets may be genuine groups or special things */
2354 int branch_newextra = 0;
2355 int bracket_length = BRA_LEN;
2356 bool capturing = false;
2358 /* Handle special forms of bracket, which all start (? */
2360 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
2361 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
2362 /* Non-referencing groups and lookaheads just move the pointer on, and
2363 then behave like a non-special bracket, except that they don't increment
2364 the count of extracting brackets. Ditto for the "once only" bracket,
2365 which is in Perl from version 5.005. */
2373 /* Else loop checking valid options until ) is met. Anything else is an
2374 error. If we are without any brackets, i.e. at top level, the settings
2375 act as if specified in the options, so massage the options immediately.
2376 This is for backward compatibility with Perl 5.004. */
2385 /* Capturing brackets must be counted so we can process escapes in a
2386 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2387 an additional 3 bytes of memory per capturing bracket. */
2391 if (bracount > EXTRACT_BASIC_MAX)
2392 bracket_length += 3;
2395 /* Save length for computing whole length at end if there's a repeat that
2396 requires duplication of the group. Also save the current value of
2397 branch_extra, and start the new group with the new value. If non-zero, this
2398 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2400 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
2405 bralenstack[brastackptr] = branch_extra;
2406 branch_extra = branch_newextra;
2408 brastack[brastackptr++] = length;
2409 length += bracket_length;
2413 /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
2414 have to replicate this bracket up to that many times. If brastackptr is
2415 0 this is an unmatched bracket which will generate an error, but take care
2416 not to try to access brastack[-1] when computing the length and restoring
2417 the branch_extra value. */
2422 if (brastackptr > 0) {
2423 duplength = length - brastack[--brastackptr];
2424 branch_extra = bralenstack[brastackptr];
2429 /* Leave ptr at the final char; for readRepeatCounts this happens
2430 automatically; for the others we need an increment. */
2432 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) {
2433 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2436 } else if (c == '*') {
2440 } else if (c == '+') {
2444 } else if (c == '?') {
2453 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2454 group, and if the maximum is greater than zero, we have to replicate
2455 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2459 if (minRepeats == 0) {
2461 if (maxRepeats > 0) {
2462 repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1, duplength + BRA_LEN + KET_LEN + OPCODE_LEN);
2463 if (repeatsLength < 0) {
2467 length += repeatsLength;
2468 if (length > MAX_PATTERN_SIZE) {
2475 /* When the minimum is greater than zero, we have to replicate up to
2476 minval-1 times, with no additions required in the copies. Then, if there
2477 is a limited maximum we have to replicate up to maxval-1 times allowing
2478 for a BRAZERO item before each optional copy and nesting brackets for all
2479 but one of the optional copies. */
2482 repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength);
2483 if (repeatsLength < 0) {
2487 length += repeatsLength;
2488 if (maxRepeats > minRepeats) { /* Need this test as maxRepeats=-1 means no limit */
2489 repeatsLength = multiplyWithOverflowCheck(maxRepeats - minRepeats, duplength + BRAZERO_LEN + BRA_LEN + KET_LEN);
2490 if (repeatsLength < 0) {
2494 length += repeatsLength - (2 + 2 * LINK_SIZE);
2496 if (length > MAX_PATTERN_SIZE) {
2502 /* Allow space for once brackets for "possessive quantifier" */
2504 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2506 length += 2 + 2 * LINK_SIZE;
2511 /* Non-special character. It won't be space or # in extended mode, so it is
2512 always a genuine character. If we are in a \Q...\E sequence, check for the
2513 end; if not, we have a literal. */
2517 length += 2; /* For a one-byte character */
2518 lastitemlength = 1; /* Default length of last item for repeats */
2522 for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
2523 if (c <= jsc_pcre_utf8_table1[i])
2526 lastitemlength += i;
2533 length += KET_LEN + OPCODE_LEN; /* For final KET and END */
2535 cd.numCapturingBrackets = bracount;
2539 /*************************************************
2540 * Compile a Regular Expression *
2541 *************************************************/
2543 /* This function takes a string and returns a pointer to a block of store
2544 holding a compiled version of the expression. The original API for this
2545 function had no error code return variable; it is retained for backwards
2546 compatibility. The new function is given a new name.
2549 pattern the regular expression
2550 options various option bits
2551 errorCodePtr pointer to error code variable (pcre_compile2() only)
2552 can be NULL if you don't want a code value
2553 error pointer to pointer to error text
2554 erroroffset ptr offset in pattern where error was detected
2555 tables pointer to character tables or NULL
2557 Returns: pointer to compiled data block, or NULL on error,
2558 with error and erroroffset set
2561 static inline JSRegExp* returnError(ErrorCode errorcode, int *error)
2563 *error = static_cast<int>(errorcode);
2567 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
2568 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
2569 unsigned* numSubpatterns, int *error)
2571 /* We can't pass back an error message if error is NULL; I guess the best we
2572 can do is just return NULL, but we can set a code value if there is a code pointer. */
2579 ErrorCode errorcode = ERR0;
2580 /* Call this once just to count the brackets. */
2581 calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2582 /* Call it again to compute the length. */
2583 int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2585 return returnError(errorcode, error);
2587 if (length > MAX_PATTERN_SIZE)
2588 return returnError(ERR16, error);
2590 size_t size = length + sizeof(JSRegExp);
2591 // FIXME: bug 574459 -- no NULL check
2592 JSRegExp* re = reinterpret_cast<JSRegExp*>(js_array_new<char>(size));
2595 return returnError(ERR13, error);
2597 re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
2599 /* The starting points of the name/number translation table and of the code are
2600 passed around in the compile data block. */
2602 const unsigned char* codeStart = (const unsigned char*)(re + 1);
2604 /* Set up a starting, non-extracting bracket, then compile the expression. On
2605 error, errorcode will be set non-zero, so we don't need to look at the result
2606 of the function here. */
2608 const UChar* ptr = (const UChar*)pattern;
2609 const UChar* patternEnd = pattern + patternLength;
2610 unsigned char* code = const_cast<unsigned char*>(codeStart);
2611 int firstByte, reqByte;
2612 int bracketCount = 0;
2613 if (!cd.needOuterBracket)
2614 compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, &firstByte, &reqByte, cd);
2617 unsigned char * const codeBefore = code;
2618 compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, 2, &firstByte, &reqByte, cd);
2619 JS_ASSERT((bracketCount & 0xff) == bracketCount);
2620 put2ByteValue(codeBefore + 1 + LINK_SIZE, 0 << 8 | (bracketCount & 0xff));
2622 re->topBracket = bracketCount;
2623 re->topBackref = cd.topBackref;
2625 /* If not reached end of pattern on success, there's an excess bracket. */
2627 if (errorcode == 0 && ptr < patternEnd)
2630 /* Fill in the terminating state and check for disastrous overflow, but
2631 if debugging, leave the test till after things are printed out. */
2635 JS_ASSERT(code - codeStart <= length);
2636 if (code - codeStart > length)
2639 /* Give an error if there's back reference to a non-existent capturing
2642 if (re->topBackref > re->topBracket)
2645 /* Failed to compile, or error while post-processing */
2647 if (errorcode != ERR0) {
2648 js_array_delete(reinterpret_cast<char*>(re));
2649 return returnError(errorcode, error);
2652 /* If the anchored option was not passed, set the flag if we can determine that
2653 the pattern is anchored by virtue of ^ characters or \A or anything else (such
2654 as starting with .* when DOTALL is set).
2656 Otherwise, if we know what the first character has to be, save it, because that
2657 speeds up unanchored matches no end. If not, see if we can set the
2658 UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
2659 start with ^. and also when all branches start with .* for non-DOTALL matches.
2662 if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart))
2663 re->options |= IsAnchoredOption;
2665 if (firstByte < 0) {
2666 firstByte = (cd.needOuterBracket
2667 ? bracketFindFirstAssertedCharacter(codeStart, false)
2668 : branchFindFirstAssertedCharacter(codeStart, false))
2669 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
2671 if (firstByte >= 0) {
2672 int ch = firstByte & 255;
2674 re->firstByte = ((firstByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstByte;
2675 re->options |= UseFirstByteOptimizationOption;
2678 if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.backrefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))
2679 re->options |= UseMultiLineFirstByteOptimizationOption;
2683 /* For an anchored pattern, we use the "required byte" only if it follows a
2684 variable length item in the regex. Remove the caseless flag for non-caseable
2687 if (reqByte >= 0 && (!(re->options & IsAnchoredOption) || (reqByte & REQ_VARY))) {
2688 int ch = reqByte & 255;
2690 re->reqByte = ((reqByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqByte & ~REQ_IGNORE_CASE) : reqByte;
2691 re->options |= UseRequiredByteOptimizationOption;
2696 *numSubpatterns = re->topBracket;
2701 void jsRegExpFree(JSRegExp* re)
2703 js_array_delete(reinterpret_cast<char*>(re));