1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
53 #include "pcre_internal.h"
56 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57 also used by pcretest. PCRE_DEBUG is not defined when building a production library. */
61 #include "pcre_printint.src"
/* Macro for setting individual bits in class bitmaps: sets bit number b in
bitmap a, which is addressed eight bits per element (element b/8, bit b%8).
Each argument and the whole expansion are parenthesized so that expression
arguments such as SETBIT(map, c + 1) select the intended element and bit.
Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
69 /* Maximum length value to check against when making sure that the integer that
70 holds the compiled pattern length does not overflow. We make it a bit less than
71 INT_MAX to allow for adding in group terminating bytes, so that we don't have
72 to check them every time. */
74 #define OFLOW_MAX (INT_MAX - 20)
77 /*************************************************
78 * Code parameters and static tables *
79 *************************************************/
81 /* This value specifies the size of stack workspace that is used during the
82 first pre-compile phase that determines how much memory is required. The regex
83 is partly compiled into this space, but the compiled parts are discarded as
84 soon as they can be, so that hopefully there will never be an overrun. The code
85 does, however, check for an overrun. The largest amount I've seen used is 218,
86 so this number is very generous.
88 The same workspace is used during the second, actual compile phase for
89 remembering forward references to groups so that they can be filled in at the
90 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91 is 4 there is plenty of room. */
93 #define COMPILE_WORK_SIZE (4096)
95 /* The overrun tests check for a slightly smaller size so that they detect the
96 overrun before it actually does run off the end of the data block. */
98 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
101 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102 are simple data values; negative values are for special things like \d and so
103 on. Zero means further processing is needed (for things like \x), or the escape is invalid. */
108 /* This is the "normal" table for ASCII systems or for EBCDIC systems running in UTF-8 mode. */
111 static const short int escapes[] = {
117 CHAR_COLON, CHAR_SEMICOLON,
118 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 CHAR_COMMERCIAL_AT, -ESC_A,
133 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 CHAR_GRAVE_ACCENT, 7,
154 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
156 static const short int escapes[] = {
157 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
175 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
184 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185 searched linearly. Put all the names into a single string, in order to reduce
186 the number of relocations when a shared library is dynamically linked. The
187 string is built from string macros so that it works in UTF-8 mode on EBCDIC platforms. */
190 typedef struct verbitem {
195 static const char verbnames[] =
204 static const verbitem verbs[] = {
214 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
217 /* Tables of names of POSIX character classes and their lengths. The names are
218 now all in a single string, to reduce the number of relocations when a shared
219 library is dynamically loaded. The list of lengths is terminated by a zero
220 length entry. The first three must be alpha, lower, upper, as this is assumed
221 for handling case independence. */
223 static const char posix_names[] =
224 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
225 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
226 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
227 STRING_word0 STRING_xdigit;
229 static const uschar posix_name_lengths[] = {
230 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
232 /* Table of class bit maps for each POSIX class. Each class is formed from a
233 base map, with an optional addition or removal of another map. Then, for some
234 classes, there is some additional tweaking: for [:blank:] the vertical space
235 characters are removed, and for [:alpha:] and [:alnum:] the underscore
236 character is removed. The triples in the table consist of the base map offset,
237 second map offset or -1 if no second map, and a non-negative value for map
238 addition or a negative value for map subtraction (if there are two maps). The
239 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
240 remove vertical space characters, 2 => remove underscore. */
242 static const int posix_class_maps[] = {
243 cbit_word, cbit_digit, -2, /* alpha */
244 cbit_lower, -1, 0, /* lower */
245 cbit_upper, -1, 0, /* upper */
246 cbit_word, -1, 2, /* alnum - word without underscore */
247 cbit_print, cbit_cntrl, 0, /* ascii */
248 cbit_space, -1, 1, /* blank - a GNU extension */
249 cbit_cntrl, -1, 0, /* cntrl */
250 cbit_digit, -1, 0, /* digit */
251 cbit_graph, -1, 0, /* graph */
252 cbit_print, -1, 0, /* print */
253 cbit_punct, -1, 0, /* punct */
254 cbit_space, -1, 0, /* space */
255 cbit_word, -1, 0, /* word - a Perl extension */
256 cbit_xdigit,-1, 0 /* xdigit */
260 #define STRING(a) # a
261 #define XSTRING(s) STRING(s)
263 /* The texts of compile-time error messages. These are "char *" because they
264 are passed to the outside world. Do not ever re-use any error number, because
265 they are documented. Always add a new error instead. Messages marked DEAD below
266 are no longer used. This used to be a table of strings, but in order to reduce
267 the number of relocations needed when a shared library is loaded dynamically,
268 it is now one long string. We cannot use a table of offsets, because the
269 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
270 simply count through to the one we want - this isn't a performance issue
271 because these strings are used only when there is a compilation error.
273 Each substring ends with \0 to insert a null character. This includes the final
274 substring, so that the whole string ends with \0\0, which can be detected when counting through. */
277 static const char error_texts[] =
279 "\\ at end of pattern\0"
280 "\\c at end of pattern\0"
281 "unrecognized character follows \\\0"
282 "numbers out of order in {} quantifier\0"
284 "number too big in {} quantifier\0"
285 "missing terminating ] for character class\0"
286 "invalid escape sequence in character class\0"
287 "range out of order in character class\0"
288 "nothing to repeat\0"
290 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
291 "internal error: unexpected repeat\0"
292 "unrecognized character after (? or (?-\0"
293 "POSIX named classes are supported only within a class\0"
296 "reference to non-existent subpattern\0"
297 "erroffset passed as NULL\0"
298 "unknown option bit(s) set\0"
299 "missing ) after comment\0"
300 "parentheses nested too deeply\0" /** DEAD **/
302 "regular expression is too large\0"
303 "failed to get memory\0"
304 "unmatched parentheses\0"
305 "internal error: code overflow\0"
306 "unrecognized character after (?<\0"
308 "lookbehind assertion is not fixed length\0"
309 "malformed number or name after (?(\0"
310 "conditional group contains more than two branches\0"
311 "assertion expected after (?(\0"
312 "(?R or (?[+-]digits must be followed by )\0"
314 "unknown POSIX class name\0"
315 "POSIX collating elements are not supported\0"
316 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
317 "spare error\0" /** DEAD **/
318 "character value in \\x{...} sequence is too large\0"
320 "invalid condition (?(0)\0"
321 "\\C not allowed in lookbehind assertion\0"
322 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
323 "number after (?C is > 255\0"
324 "closing ) for (?C expected\0"
326 "recursive call could loop indefinitely\0"
327 "unrecognized character after (?P\0"
328 "syntax error in subpattern name (missing terminator)\0"
329 "two named subpatterns have the same name\0"
330 "invalid UTF-8 string\0"
332 "support for \\P, \\p, and \\X has not been compiled\0"
333 "malformed \\P or \\p sequence\0"
334 "unknown property name after \\P or \\p\0"
335 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
336 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
338 "repeated subpattern is too long\0" /** DEAD **/
339 "octal value is greater than \\377 (not in UTF-8 mode)\0"
340 "internal error: overran compiling workspace\0"
341 "internal error: previously-checked referenced subpattern not found\0"
342 "DEFINE group contains more than one branch\0"
344 "repeating a DEFINE group is not allowed\0"
345 "inconsistent NEWLINE options\0"
346 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
347 "a numbered reference must not be zero\0"
348 "(*VERB) with an argument is not supported\0"
350 "(*VERB) not recognized\0"
351 "number is too big\0"
352 "subpattern name expected\0"
353 "digit expected after (?+\0"
354 "] is an invalid data character in JavaScript compatibility mode\0"
356 "different names for subpatterns of the same number are not allowed\0";
359 /* Definition to allow mutual recursion */
362 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
363 int *, int *, branch_chain *, compile_data *, int *);
367 /*************************************************
368 * Find an error text *
369 *************************************************/
371 /* The error texts are now all in one long string, to save on relocations. As
372 some of the text is of unknown length, we can't use a table of offsets.
373 Instead, just count through the strings. This is not a performance issue
374 because it happens only when there has been a compilation error.
376 Argument: the error number
377 Returns: pointer to the error string
381 find_error_text(int n)
383 const char *s = error_texts;
386 while (*s++ != 0) {};
387 if (*s == 0) return "Error text not found (please report)";
393 /*************************************************
395 *************************************************/
397 /* This function is called when a \ has been encountered. It either returns a
398 positive value for a simple escape such as \n, or a negative value which
399 encodes one of the more complicated things such as \d. A backreference to group
400 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
401 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
402 ptr is pointing at the \. On exit, it is on the final character of the escape
406 ptrptr points to the pattern position pointer
407 errorcodeptr points to the errorcode variable
408 bracount number of previous extracting brackets
409 options the options bits
410 isclass TRUE if inside a character class
412 Returns: zero or positive => a data character
413 negative => a special escape sequence
414 on error, errorcodeptr is set
418 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
419 int options, BOOL isclass)
421 BOOL utf8 = (options & PCRE_UTF8) != 0;
422 const uschar *ptr = *ptrptr + 1;
425 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
426 ptr--; /* Set pointer back to the last byte */
428 /* If backslash is at the end of the pattern, it's an error. */
430 if (c == 0) *errorcodeptr = ERR1;
432 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
433 in a table. A non-zero result is something that can be returned immediately.
434 Otherwise further processing may be required. */
436 #ifndef EBCDIC /* ASCII/UTF-8 coding */
437 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
438 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
440 #else /* EBCDIC coding */
441 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
442 else if ((i = escapes[c - 0x48]) != 0) c = i;
445 /* Escapes that need further processing, or are illegal. */
449 const uschar *oldptr;
450 BOOL braced, negated;
454 /* A number of Perl escapes are not handled by PCRE. We give an explicit
462 *errorcodeptr = ERR37;
465 /* \g must be followed by one of a number of specific things:
467 (1) A number, either plain or braced. If positive, it is an absolute
468 backreference. If negative, it is a relative backreference. This is a Perl
471 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
472 is part of Perl's movement towards a unified syntax for back references. As
473 this is synonymous with \k{name}, we fudge it up by pretending it really
476 (3) For Oniguruma compatibility we also support \g followed by a name or a
477 number either in angle brackets or in single quotes. However, these are
478 (possibly recursive) subroutine calls, _not_ backreferences. Just return
479 the -ESC_g code (cf \k). */
482 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
488 /* Handle the Perl-compatible cases */
490 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
493 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
494 if (*p != CHAR_MINUS && g_ascii_isdigit(*p) == 0) break;
495 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
505 if (ptr[1] == CHAR_MINUS)
510 else negated = FALSE;
513 while (g_ascii_isdigit(ptr[1]) != 0)
514 c = c * 10 + *(++ptr) - CHAR_0;
516 if (c < 0) /* Integer overflow */
518 *errorcodeptr = ERR61;
522 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
524 *errorcodeptr = ERR57;
530 *errorcodeptr = ERR58;
538 *errorcodeptr = ERR15;
541 c = bracount - (c - 1);
547 /* The handling of escape sequences consisting of a string of digits
548 starting with one that is not zero is not straightforward. By experiment,
549 the way Perl works seems to be as follows:
551 Outside a character class, the digits are read as a decimal number. If the
552 number is less than 10, or if there are that many previous extracting
553 left brackets, then it is a back reference. Otherwise, up to three octal
554 digits are read to form an escaped byte. Thus \123 is likely to be octal
555 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
556 value is greater than 377, the least significant 8 bits are taken. Inside a
557 character class, \ followed by a digit is always an octal number. */
559 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
560 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
566 while (g_ascii_isdigit(ptr[1]) != 0)
567 c = c * 10 + *(++ptr) - CHAR_0;
568 if (c < 0) /* Integer overflow */
570 *errorcodeptr = ERR61;
573 if (c < 10 || c <= bracount)
578 ptr = oldptr; /* Put the pointer back and fall through */
581 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
582 generates a binary zero byte and treats the digit as a following literal.
583 Thus we have to pull back the pointer by one. */
585 if ((c = *ptr) >= CHAR_8)
592 /* \0 always starts an octal number, but we may drop through to here with a
593 larger first octal digit. The original code used just to take the least
594 significant 8 bits of octal numbers (I think this is what early Perls used
595 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
596 than 3 octal digits. */
600 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
601 c = c * 8 + *(++ptr) - CHAR_0;
602 if (!utf8 && c > 255) *errorcodeptr = ERR51;
605 /* \x is complicated. \x{ddd} is a character number which can be greater
606 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
607 treated as a data character. */
610 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
612 const uschar *pt = ptr + 2;
616 while (g_ascii_isxdigit(*pt) != 0)
618 register int cc = *pt++;
619 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
622 #ifndef EBCDIC /* ASCII/UTF-8 coding */
623 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
624 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
625 #else /* EBCDIC coding */
626 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
627 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
631 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
633 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
638 /* If the sequence of hex digits does not end with '}', then we don't
639 recognize this construct; fall through to the normal \x handling. */
642 /* Read just a single-byte hex-defined char */
645 while (i++ < 2 && g_ascii_isxdigit(ptr[1]) != 0)
647 int cc; /* Some compilers don't like */
648 cc = *(++ptr); /* ++ in initializers */
649 #ifndef EBCDIC /* ASCII/UTF-8 coding */
650 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
651 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
652 #else /* EBCDIC coding */
653 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
654 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
659 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
660 This coding is ASCII-specific, but then the whole concept of \cx is
661 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
667 *errorcodeptr = ERR2;
671 #ifndef EBCDIC /* ASCII/UTF-8 coding */
672 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
674 #else /* EBCDIC coding */
675 if (c >= CHAR_a && c <= CHAR_z) c += 64;
680 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
681 other alphanumeric following \ is an error if PCRE_EXTRA was set;
682 otherwise, for Perl compatibility, it is a literal. This code looks a bit
683 odd, but there used to be some cases other than the default, and there may
684 be again in future, so I haven't "optimized" it. */
687 if ((options & PCRE_EXTRA) != 0) switch(c)
690 *errorcodeptr = ERR3;
704 /*************************************************
706 *************************************************/
708 /* This function is called after \P or \p has been encountered, provided that
709 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
710 pointing at the P or p. On exit, it is pointing at the final character of the
714 ptrptr points to the pattern position pointer
715 negptr points to a boolean that is set TRUE for negation else FALSE
716 dptr points to an int that is set to the detailed property value
717 errorcodeptr points to the error code variable
719 Returns: type value from ucp_type_table, or -1 for an invalid type
723 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
726 const uschar *ptr = *ptrptr;
730 if (c == 0) goto ERROR_RETURN;
734 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for negation. */
737 if (c == CHAR_LEFT_CURLY_BRACKET)
739 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
744 for (i = 0; i < (int)sizeof(name) - 1; i++)
747 if (c == 0) goto ERROR_RETURN;
748 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
751 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
755 /* Otherwise there is just one following character */
765 /* Search for a recognized property name using binary chop */
768 top = _pcre_utt_size;
772 i = (bot + top) >> 1;
773 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
776 *dptr = _pcre_utt[i].value;
777 return _pcre_utt[i].type;
779 if (c > 0) bot = i + 1; else top = i;
782 *errorcodeptr = ERR47;
787 *errorcodeptr = ERR46;
796 /*************************************************
797 * Check for counted repeat *
798 *************************************************/
800 /* This function is called when a '{' is encountered in a place where it might
801 start a quantifier. It looks ahead to see if it really is a quantifier or not.
802 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
803 where the ddds are digits.
806 p pointer to the first char after '{'
808 Returns: TRUE or FALSE
812 is_counted_repeat(const uschar *p)
814 if (g_ascii_isdigit(*p++) == 0) return FALSE;
815 while (g_ascii_isdigit(*p) != 0) p++;
816 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
818 if (*p++ != CHAR_COMMA) return FALSE;
819 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
821 if (g_ascii_isdigit(*p++) == 0) return FALSE;
822 while (g_ascii_isdigit(*p) != 0) p++;
824 return (*p == CHAR_RIGHT_CURLY_BRACKET);
829 /*************************************************
830 * Read repeat counts *
831 *************************************************/
833 /* Read an item of the form {n,m} and return the values. This is called only
834 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
835 so the syntax is guaranteed to be correct, but we need to check the values.
838 p pointer to first char after '{'
839 minp pointer to int for min
840 maxp pointer to int for max
841 returned as -1 if no max
842 errorcodeptr points to error code variable
844 Returns: pointer to '}' on success;
845 current ptr on error, with errorcodeptr set non-zero
848 static const uschar *
849 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
854 /* Read the minimum value and do a paranoid check: a negative value indicates
855 an integer overflow. */
857 while (g_ascii_isdigit(*p) != 0) min = min * 10 + *p++ - CHAR_0;
858 if (min < 0 || min > 65535)
860 *errorcodeptr = ERR5;
864 /* Read the maximum value if there is one, and again do a paranoid check on its size.
865 Also, max must not be less than min. */
867 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
869 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
872 while(g_ascii_isdigit(*p) != 0) max = max * 10 + *p++ - CHAR_0;
873 if (max < 0 || max > 65535)
875 *errorcodeptr = ERR5;
880 *errorcodeptr = ERR4;
886 /* Fill in the required variables, and pass back the pointer to the terminating '}'. */
896 /*************************************************
897 * Subroutine for finding forward reference *
898 *************************************************/
900 /* This recursive function is called only from find_parens() below. The
901 top-level call starts at the beginning of the pattern. All other calls must
902 start at a parenthesis. It scans along a pattern's text looking for capturing
903 subpatterns, and counting them. If it finds a named pattern that matches the
904 name it is given, it returns its number. Alternatively, if the name is NULL, it
905 returns when it reaches a given numbered subpattern. We know that if (?P< is
906 encountered, the name will be terminated by '>' because that is checked in the
907 first pass. Recursion is used to keep track of subpatterns that reset the
908 capturing group numbers - the (?| feature.
911 ptrptr address of the current character pointer (updated)
912 cd compile background data
913 name name to seek, or NULL if seeking a numbered subpattern
914 lorn name length, or subpattern number if name is NULL
915 xmode TRUE if we are in /x mode
916 count pointer to the current capturing subpattern number (updated)
918 Returns: the number of the named subpattern, or -1 if not found
922 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
923 BOOL xmode, int *count)
925 uschar *ptr = *ptrptr;
926 int start_count = *count;
927 int hwm_count = start_count;
928 BOOL dup_parens = FALSE;
930 /* If the first character is a parenthesis, check on the type of group we are
931 dealing with. The very first call may not start with a parenthesis. */
933 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
935 if (ptr[1] == CHAR_QUESTION_MARK &&
936 ptr[2] == CHAR_VERTICAL_LINE)
942 /* Handle a normal, unnamed capturing parenthesis */
944 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
947 if (name == NULL && *count == lorn) return *count;
951 /* Handle a condition. If it is an assertion, just carry on so that it
952 is processed as normal. If not, skip to the closing parenthesis of the
953 condition (there can't be any nested parens). */
955 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
958 if (ptr[1] != CHAR_QUESTION_MARK)
960 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
961 if (*ptr != 0) ptr++;
965 /* We have either (? or (* and not a condition */
970 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
972 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
974 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
975 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
978 const uschar *thisname;
980 if (name == NULL && *count == lorn) return *count;
982 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
984 while (*ptr != term) ptr++;
985 if (name != NULL && lorn == ptr - thisname &&
986 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
993 /* Past any initial parenthesis handling, scan for parentheses or vertical bars. */
996 for (; *ptr != 0; ptr++)
998 /* Skip over backslashed characters and also entire \Q...\E */
1000 if (*ptr == CHAR_BACKSLASH)
1002 if (*(++ptr) == 0) goto FAIL_EXIT;
1003 if (*ptr == CHAR_Q) for (;;)
1005 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1006 if (*ptr == 0) goto FAIL_EXIT;
1007 if (*(++ptr) == CHAR_E) break;
1012 /* Skip over character classes; this logic must be similar to the way they
1013 are handled for real. If the first character is '^', skip it. Also, if the
1014 first few characters (either before or after ^) are \Q\E or \E we skip them
1015 too. This makes for compatibility with Perl. Note the use of STR macros to
1016 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1018 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1020 BOOL negate_class = FALSE;
1023 if (ptr[1] == CHAR_BACKSLASH)
1025 if (ptr[2] == CHAR_E)
1027 else if (strncmp((const char *)ptr+2,
1028 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1033 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1035 negate_class = TRUE;
1041 /* If the next character is ']', it is a data character that must be
1042 skipped, except in JavaScript compatibility mode. */
1044 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1045 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1048 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1050 if (*ptr == 0) return -1;
1051 if (*ptr == CHAR_BACKSLASH)
1053 if (*(++ptr) == 0) goto FAIL_EXIT;
1054 if (*ptr == CHAR_Q) for (;;)
1056 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1057 if (*ptr == 0) goto FAIL_EXIT;
1058 if (*(++ptr) == CHAR_E) break;
1066 /* Skip comments in /x mode */
1068 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1070 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1071 if (*ptr == 0) goto FAIL_EXIT;
1075 /* Check for the special metacharacters */
1077 if (*ptr == CHAR_LEFT_PARENTHESIS)
1079 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1080 if (rc > 0) return rc;
1081 if (*ptr == 0) goto FAIL_EXIT;
1084 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1086 if (dup_parens && *count < hwm_count) *count = hwm_count;
1091 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1093 if (*count > hwm_count) hwm_count = *count;
1094 *count = start_count;
1106 /*************************************************
1107 * Find forward referenced subpattern *
1108 *************************************************/
1110 /* This function scans along a pattern's text looking for capturing
1111 subpatterns, and counting them. If it finds a named pattern that matches the
1112 name it is given, it returns its number. Alternatively, if the name is NULL, it
1113 returns when it reaches a given numbered subpattern. This is used for forward
1114 references to subpatterns. We used to be able to start this scan from the
1115 current compiling point, using the current count value from cd->bracount, and
1116 do it all in a single loop, but the addition of the possibility of duplicate
1117 subpattern numbers means that we have to scan from the very start, in order to
1118 take account of such duplicates, and to use a recursive function to keep track
1119 of the different types of group.
1122 cd compile background data
1123 name name to seek, or NULL if seeking a numbered subpattern
1124 lorn name length, or subpattern number if name is NULL
1125 xmode TRUE if we are in /x mode
1127 Returns: the number of the found subpattern, or -1 if not found
1131 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1133 uschar *ptr = (uschar *)cd->start_pattern;
1137 /* If the pattern does not start with an opening parenthesis, the first call
1138 to find_parens_sub() will scan right to the end (if necessary). However, if it
1139 does start with a parenthesis, find_parens_sub() will return when it hits the
1140 matching closing parens. That is why we have to have a loop. */
1144 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1145 if (rc > 0 || *ptr++ == 0) break;
1154 /*************************************************
1155 * Find first significant op code *
1156 *************************************************/
1158 /* This is called by several functions that scan a compiled expression looking
1159 for a fixed first character, or an anchoring op code etc. It skips over things
1160 that do not influence this. For some calls, a change of option is important.
1161 For some calls, it makes sense to skip negative forward and all backward
1162 assertions, and also the \b assertion; for others it does not.
1165 code pointer to the start of the group
1166 options pointer to external options
1167 optbit the option bit whose changing is significant, or
1169 skipassert TRUE if certain assertions are to be skipped
1171 Returns: pointer to the first significant opcode
1174 static const uschar*
1175 first_significant_code(const uschar *code, int *options, int optbit,
1183 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1184 *options = (int)code[1];
1190 case OP_ASSERTBACK_NOT:
1191 if (!skipassert) return code;
1192 do code += GET(code, 1); while (*code == OP_ALT);
1193 code += _pcre_OP_lengths[*code];
1196 case OP_WORD_BOUNDARY:
1197 case OP_NOT_WORD_BOUNDARY:
1198 if (!skipassert) return code;
1207 code += _pcre_OP_lengths[*code];
1214 /* Control never reaches here */
1220 /*************************************************
1221 * Find the fixed length of a branch *
1222 *************************************************/
1224 /* Scan a branch and compute the fixed length of subject that will match it,
1225 if the length is fixed. This is needed for dealing with backward assertions.
1226 In UTF8 mode, the result is in characters rather than bytes. The branch is
1227 temporarily terminated with OP_END when this function is called.
1229 This function is called when a backward assertion is encountered, so that if it
1230 fails, the error message can point to the correct place in the pattern.
1231 However, we cannot do this when the assertion contains subroutine calls,
1232 because they can be forward references. We solve this by remembering this case
1233 and doing the check at the end; a flag specifies which mode we are running in.
1236 code points to the start of the pattern (the bracket)
1237 options the compiling options
1238 atend TRUE if called when the pattern is complete
1239 cd the "compile data" structure
1241 Returns: the fixed length,
1242 or -1 if there is no fixed length,
1243 or -2 if \C was encountered
1244 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1248 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1252 register int branchlength = 0;
1253 register uschar *cc = code + 1 + LINK_SIZE;
1255 /* Scan along the opcodes for this branch. If we get to the end of the
1256 branch, check the length against that of the other branches. */
1262 register int op = *cc;
1269 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1270 if (d < 0) return d;
1272 do cc += GET(cc, 1); while (*cc == OP_ALT);
1273 cc += 1 + LINK_SIZE;
1276 /* Reached end of a branch; if it's a ket it is the end of a nested
1277 call. If it's ALT it is an alternation in a nested call. If it is
1278 END it's the end of the outer call. All can be handled by the same code. */
1285 if (length < 0) length = branchlength;
1286 else if (length != branchlength) return -1;
1287 if (*cc != OP_ALT) return length;
1288 cc += 1 + LINK_SIZE;
1292 /* A true recursion implies not fixed length, but a subroutine call may
1293 be OK. If the subroutine is a forward reference, we can't deal with
1294 it until the end of the pattern, so return -3. */
1297 if (!atend) return -3;
1298 cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1299 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1300 if (cc > cs && cc < ce) return -1; /* Recursion */
1301 d = find_fixedlength(cs + 2, options, atend, cd);
1302 if (d < 0) return d;
1304 cc += 1 + LINK_SIZE;
1307 /* Skip over assertive subpatterns */
1312 case OP_ASSERTBACK_NOT:
1313 do cc += GET(cc, 1); while (*cc == OP_ALT);
1316 /* Skip over things that don't match chars */
1333 case OP_NOT_WORD_BOUNDARY:
1334 case OP_WORD_BOUNDARY:
1335 cc += _pcre_OP_lengths[*cc];
1338 /* Handle literal characters */
1346 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1347 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1351 /* Handle exact repetitions. The count is already in characters, but we
1352 need to skip over a multibyte character in UTF8 mode. */
1355 branchlength += GET2(cc,1);
1358 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1359 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1364 branchlength += GET2(cc,1);
1365 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1369 /* Handle single-char matchers */
1378 case OP_NOT_WHITESPACE:
1380 case OP_NOT_WORDCHAR:
1388 /* The single-byte matcher isn't allowed */
1393 /* Check a class for variable quantification */
1397 cc += GET(cc, 1) - 33;
1415 if (GET2(cc,1) != GET2(cc,3)) return -1;
1416 branchlength += GET2(cc,1);
1425 /* Anything else is variable length */
1431 /* Control never gets here */
1437 /*************************************************
1438 * Scan compiled regex for specific bracket *
1439 *************************************************/
1441 /* This little function scans through a compiled pattern until it finds a
1442 capturing bracket with the given number, or, if the number is negative, an
1443 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1444 so that it can be called from pcre_study() when finding the minimum matching
1448 code points to start of expression
1449 utf8 TRUE in UTF-8 mode
1450 number the required bracket number or negative to find a lookbehind
1452 Returns: pointer to the opcode for the bracket, or NULL if not found
1456 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1460 register int c = *code;
1461 if (c == OP_END) return NULL;
1463 /* XCLASS is used for classes that cannot be represented just by a bit
1464 map. This includes negated single high-valued characters. The length in
1465 the table is zero; the actual length is stored in the compiled code. */
1467 if (c == OP_XCLASS) code += GET(code, 1);
1469 /* Handle lookbehind */
1471 else if (c == OP_REVERSE)
1473 if (number < 0) return (uschar *)code;
1474 code += _pcre_OP_lengths[c];
1477 /* Handle capturing bracket */
1479 else if (c == OP_CBRA)
1481 int n = GET2(code, 1+LINK_SIZE);
1482 if (n == number) return (uschar *)code;
1483 code += _pcre_OP_lengths[c];
1486 /* Otherwise, we can get the item's length from the table, except that for
1487 repeated character types, we have to test for \p and \P, which have an extra
1488 two bytes of parameters. */
1495 case OP_TYPEMINSTAR:
1497 case OP_TYPEMINPLUS:
1499 case OP_TYPEMINQUERY:
1500 case OP_TYPEPOSSTAR:
1501 case OP_TYPEPOSPLUS:
1502 case OP_TYPEPOSQUERY:
1503 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1507 case OP_TYPEMINUPTO:
1509 case OP_TYPEPOSUPTO:
1510 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1514 /* Add in the fixed length from the table */
1516 code += _pcre_OP_lengths[c];
1518 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1519 a multi-byte character. The length in the table is a minimum, so we have to
1520 arrange to skip the extra bytes. */
1540 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1544 (void)(utf8); /* Keep compiler happy by referencing function argument */
1552 /*************************************************
1553 * Scan compiled regex for recursion reference *
1554 *************************************************/
1556 /* This little function scans through a compiled pattern until it finds an
1557 instance of OP_RECURSE.
1560 code points to start of expression
1561 utf8 TRUE in UTF-8 mode
1563 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1566 static const uschar *
1567 find_recurse(const uschar *code, BOOL utf8)
1571 register int c = *code;
1572 if (c == OP_END) return NULL;
1573 if (c == OP_RECURSE) return code;
1575 /* XCLASS is used for classes that cannot be represented just by a bit
1576 map. This includes negated single high-valued characters. The length in
1577 the table is zero; the actual length is stored in the compiled code. */
1579 if (c == OP_XCLASS) code += GET(code, 1);
1581 /* Otherwise, we can get the item's length from the table, except that for
1582 repeated character types, we have to test for \p and \P, which have an extra
1583 two bytes of parameters. */
1590 case OP_TYPEMINSTAR:
1592 case OP_TYPEMINPLUS:
1594 case OP_TYPEMINQUERY:
1595 case OP_TYPEPOSSTAR:
1596 case OP_TYPEPOSPLUS:
1597 case OP_TYPEPOSQUERY:
1598 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1601 case OP_TYPEPOSUPTO:
1603 case OP_TYPEMINUPTO:
1605 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1609 /* Add in the fixed length from the table */
1611 code += _pcre_OP_lengths[c];
1613 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1614 by a multi-byte character. The length in the table is a minimum, so we have
1615 to arrange to skip the extra bytes. */
1635 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1639 (void)(utf8); /* Keep compiler happy by referencing function argument */
1647 /*************************************************
1648 * Scan compiled branch for non-emptiness *
1649 *************************************************/
1651 /* This function scans through a branch of a compiled pattern to see whether it
1652 can match the empty string or not. It is called from could_be_empty()
1653 below and from compile_branch() when checking for an unlimited repeat of a
1654 group that can match nothing. Note that first_significant_code() skips over
1655 backward and negative forward assertions when its final argument is TRUE. If we
1656 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1657 bracket whose current branch will already have been scanned.
1660 code points to start of search
1661 endcode points to where to stop
1662 utf8 TRUE if in UTF8 mode
1663 cd contains pointers to tables etc.
1665 Returns: TRUE if what is matched could be empty
1669 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1673 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1675 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1677 const uschar *ccode;
1681 /* Skip over forward assertions; the other assertions are skipped by
1682 first_significant_code() with a TRUE final argument. */
1686 do code += GET(code, 1); while (*code == OP_ALT);
1691 /* Groups with zero repeats can of course be empty; skip them. */
1693 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1695 code += _pcre_OP_lengths[c];
1696 do code += GET(code, 1); while (*code == OP_ALT);
1701 /* For a recursion/subroutine call, if its end has been reached, which
1702 implies a subroutine call, we can scan it. */
1704 if (c == OP_RECURSE)
1706 BOOL empty_branch = FALSE;
1707 const uschar *scode = cd->start_code + GET(code, 1);
1708 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1711 if (could_be_empty_branch(scode, endcode, utf8, cd))
1713 empty_branch = TRUE;
1716 scode += GET(scode, 1);
1718 while (*scode == OP_ALT);
1719 if (!empty_branch) return FALSE; /* All branches are non-empty */
1723 /* For other groups, scan the branches. */
1725 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1728 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1730 /* If a conditional group has only one branch, there is a second, implied,
1731 empty branch, so just skip over the conditional, because it could be empty.
1732 Otherwise, scan the individual branches of the group. */
1734 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1735 code += GET(code, 1);
1738 empty_branch = FALSE;
1741 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1742 empty_branch = TRUE;
1743 code += GET(code, 1);
1745 while (*code == OP_ALT);
1746 if (!empty_branch) return FALSE; /* All branches are non-empty */
1753 /* Handle the other opcodes */
1757 /* Check for quantifiers after a class. XCLASS is used for classes that
1758 cannot be represented just by a bit map. This includes negated single
1759 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1760 actual length is stored in the compiled code, so we must update "code"
1765 ccode = code += GET(code, 1);
1766 goto CHECK_CLASS_REPEAT;
1779 case OP_CRSTAR: /* These could be empty; continue */
1785 default: /* Non-repeat => class must match */
1786 case OP_CRPLUS: /* These repeats aren't empty */
1792 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1797 /* Opcodes that must match a character */
1804 case OP_NOT_WHITESPACE:
1806 case OP_NOT_WORDCHAR:
1823 case OP_TYPEMINPLUS:
1824 case OP_TYPEPOSPLUS:
1828 /* These are going to continue, as they may be empty, but we have to
1829 fudge the length for the \p and \P cases. */
1832 case OP_TYPEMINSTAR:
1833 case OP_TYPEPOSSTAR:
1835 case OP_TYPEMINQUERY:
1836 case OP_TYPEPOSQUERY:
1837 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1840 /* Same for these */
1843 case OP_TYPEMINUPTO:
1844 case OP_TYPEPOSUPTO:
1845 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1856 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1857 MINUPTO, and POSUPTO may be followed by a multibyte character */
1866 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1872 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1876 /* None of the remaining opcodes are required to match a character. */
1888 /*************************************************
1889 * Scan compiled regex for non-emptiness *
1890 *************************************************/
1892 /* This function is called to check for left recursive calls. We want to check
1893 the current branch of the current pattern to see if it could match the empty
1894 string. If it could, we must look outwards for branches at other levels,
1895 stopping when we pass beyond the bracket which is the subject of the recursion.
1898 code points to start of the recursion
1899 endcode points to where to stop (current RECURSE item)
1900 bcptr points to the chain of current (unclosed) branch starts
1901 utf8 TRUE if in UTF-8 mode
1902 cd pointers to tables etc
1904 Returns: TRUE if what is matched could be empty
1908 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1909 BOOL utf8, compile_data *cd)
1911 while (bcptr != NULL && bcptr->current_branch >= code)
1913 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
1915 bcptr = bcptr->outer;
1922 /*************************************************
1923 * Check for POSIX class syntax *
1924 *************************************************/
1926 /* This function is called when the sequence "[:" or "[." or "[=" is
1927 encountered in a character class. It checks whether this is followed by a
1928 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1929 reach an unescaped ']' without the special preceding character, return FALSE.
1931 Originally, this function only recognized a sequence of letters between the
1932 terminators, but it seems that Perl recognizes any sequence of characters,
1933 though of course unknown POSIX names are subsequently rejected. Perl gives an
1934 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1935 didn't consider this to be a POSIX class. Likewise for [:1234:].
1937 The problem in trying to be exactly like Perl is in the handling of escapes. We
1938 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1939 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1940 below handles the special case of \], but does not try to do any other escape
1941 processing. This makes it different from Perl for cases such as [:l\ower:]
1942 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1943 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1947 ptr pointer to the initial [
1948 endptr where to return the end pointer
1950 Returns: TRUE or FALSE
1954 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1956 int terminator; /* Don't combine these lines; the Solaris cc */
1957 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1958 for (++ptr; *ptr != 0; ptr++)
1960 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
1962 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
1963 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
1976 /*************************************************
1977 * Check POSIX class name *
1978 *************************************************/
1980 /* This function is called to check the name given in a POSIX-style class entry
1984 ptr points to the first letter
1985 len the length of the name
1987 Returns: a value representing the name, or -1 if unknown
1991 check_posix_name(const uschar *ptr, int len)
1993 const char *pn = posix_names;
1994 register int yield = 0;
1995 while (posix_name_lengths[yield] != 0)
1997 if (len == posix_name_lengths[yield] &&
1998 strncmp((const char *)ptr, pn, len) == 0) return yield;
1999 pn += posix_name_lengths[yield] + 1;
2006 /*************************************************
2007 * Adjust OP_RECURSE items in repeated group *
2008 *************************************************/
2010 /* OP_RECURSE items contain an offset from the start of the regex to the group
2011 that is referenced. This means that groups can be replicated for fixed
2012 repetition simply by copying (because the recursion is allowed to refer to
2013 earlier groups that are outside the current group). However, when a group is
2014 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2015 inserted before it, after it has been compiled. This means that any OP_RECURSE
2016 items within it that refer to the group itself or any contained groups have to
2017 have their offsets adjusted. That is one of the jobs of this function. Before it
2018 is called, the partially compiled regex must be temporarily terminated with
2021 This function has been extended with the possibility of forward references for
2022 recursions and subroutine calls. It must also check the list of such references
2023 for the group we are dealing with. If it finds that one of the recursions in
2024 the current group is on this list, it adjusts the offset in the list, not the
2025 value in the reference (which is a group number).
2028 group points to the start of the group
2029 adjust the amount by which the group is to be moved
2030 utf8 TRUE in UTF-8 mode
2031 cd contains pointers to tables etc.
2032 save_hwm the hwm forward reference pointer at the start of the group
2038 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2041 uschar *ptr = group;
2043 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2048 /* See if this recursion is on the forward reference list. If so, adjust the
2051 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2053 offset = GET(hc, 0);
2054 if (cd->start_code + offset == ptr + 1)
2056 PUT(hc, 0, offset + adjust);
2061 /* Otherwise, adjust the recursion offset if it's after the start of this
2066 offset = GET(ptr, 1);
2067 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2070 ptr += 1 + LINK_SIZE;
2076 /*************************************************
2077 * Insert an automatic callout point *
2078 *************************************************/
2080 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2081 callout points before each pattern item.
2084 code current code pointer
2085 ptr current pattern pointer
2086 cd pointers to tables etc
2088 Returns: new code pointer
2092 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2094 *code++ = OP_CALLOUT;
2096 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2097 PUT(code, LINK_SIZE, 0); /* Default length */
2098 return code + 2*LINK_SIZE;
2103 /*************************************************
2104 * Complete a callout item *
2105 *************************************************/
2107 /* A callout item contains the length of the next item in the pattern, which
2108 we can't fill in till after we have reached the relevant point. This is used
2109 for both automatic and manual callouts.
2112 previous_callout points to previous callout item
2113 ptr current pattern pointer
2114 cd pointers to tables etc
2120 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2122 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2123 PUT(previous_callout, 2 + LINK_SIZE, length);
2129 /*************************************************
2130 * Get othercase range *
2131 *************************************************/
2133 /* This function is passed the start and end of a class range, in UTF-8 mode
2134 with UCP support. It searches up the characters, looking for internal ranges of
2135 characters in the "other" case. Each call returns the next one, updating the
2139 cptr points to starting character value; updated
2141 ocptr where to put start of othercase range
2142 odptr where to put end of othercase range
2144 Yield: TRUE when range returned; FALSE when no more
2148 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2149 unsigned int *odptr)
2151 unsigned int c, othercase, next;
2153 for (c = *cptr; c <= d; c++)
2154 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2156 if (c > d) return FALSE;
2159 next = othercase + 1;
2161 for (++c; c <= d; c++)
2163 if (UCD_OTHERCASE(c) != next) break;
2172 #endif /* SUPPORT_UCP */
2176 /*************************************************
2177 * Check if auto-possessifying is possible *
2178 *************************************************/
2180 /* This function is called for unlimited repeats of certain items, to see
2181 whether the next thing could possibly match the repeated item. If not, it makes
2182 sense to automatically possessify the repeated item.
2185 op_code the repeated op code
2186 this data for this item, depends on the opcode
2187 utf8 TRUE in UTF-8 mode
2188 utf8_char used for utf8 character bytes, NULL if not relevant
2189 ptr next character in pattern
2190 options options bits
2191 cd contains pointers to tables etc.
2193 Returns: TRUE if possessifying is wanted
2197 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2198 const uschar *ptr, int options, compile_data *cd)
2202 /* Skip whitespace and comments in extended mode */
2204 if ((options & PCRE_EXTENDED) != 0)
2208 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2209 if (*ptr == CHAR_NUMBER_SIGN)
2211 while (*(++ptr) != 0)
2212 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2218 /* If the next item is one that we can handle, get its value. A non-negative
2219 value is a character, a negative value is an escape value. */
2221 if (*ptr == CHAR_BACKSLASH)
2223 int temperrorcode = 0;
2224 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2225 if (temperrorcode != 0) return FALSE;
2226 ptr++; /* Point after the escape sequence */
2229 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2232 if (utf8) { GETCHARINC(next, ptr); } else
2239 /* Skip whitespace and comments in extended mode */
2241 if ((options & PCRE_EXTENDED) != 0)
2245 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2246 if (*ptr == CHAR_NUMBER_SIGN)
2248 while (*(++ptr) != 0)
2249 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2255 /* If the next thing is itself optional, we have to give up. */
2257 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2258 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2261 /* Now compare the next item with the previous opcode. If the previous is a
2262 positive single character match, "item" either contains the character or, if
2263 "item" is greater than 127 in utf8 mode, the character's bytes are in
2267 /* Handle cases when the next item is a character. */
2269 if (next >= 0) switch(op_code)
2273 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2275 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2277 return item != next;
2279 /* For CHARNC (caseless character) we must check the other case. If we have
2280 Unicode property support, we can use it to test the other case of
2281 high-valued characters. */
2285 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2287 if (item == next) return FALSE;
2291 unsigned int othercase;
2292 if (next < 128) othercase = cd->fcc[next]; else
2294 othercase = UCD_OTHERCASE((unsigned int)next);
2296 othercase = NOTACHAR;
2298 return (unsigned int)item != othercase;
2301 #endif /* SUPPORT_UTF8 */
2302 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2304 /* For OP_NOT, "item" must be a single-byte character. */
2307 if (item == next) return TRUE;
2308 if ((options & PCRE_CASELESS) == 0) return FALSE;
2312 unsigned int othercase;
2313 if (next < 128) othercase = cd->fcc[next]; else
2315 othercase = UCD_OTHERCASE(next);
2317 othercase = NOTACHAR;
2319 return (unsigned int)item == othercase;
2322 #endif /* SUPPORT_UTF8 */
2323 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2326 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2329 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2332 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2334 case OP_NOT_WHITESPACE:
2335 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2338 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2340 case OP_NOT_WORDCHAR:
2341 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2366 return op_code != OP_HSPACE;
2368 return op_code == OP_HSPACE;
2382 return op_code != OP_VSPACE;
2384 return op_code == OP_VSPACE;
2392 /* Handle the case when the next item is \d, \s, etc. */
2399 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2404 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2407 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2410 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2413 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2416 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2419 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2444 return -next != ESC_h;
2446 return -next == ESC_h;
2460 return -next != ESC_v;
2462 return -next == ESC_v;
2470 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2471 next == -ESC_h || next == -ESC_v;
2474 return next == -ESC_d;
2477 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2479 case OP_NOT_WHITESPACE:
2480 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2483 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2486 return next == -ESC_h;
2488 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2490 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2493 return next == -ESC_v;
2496 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2498 case OP_NOT_WORDCHAR:
2499 return next == -ESC_w || next == -ESC_d;
2505 /* Control does not reach here */
2510 /*************************************************
2511 * Compile one branch *
2512 *************************************************/
2514 /* Scan the pattern, compiling it into a vector. If the options are
2515 changed during the branch, the pointer is used to change the external options
2516 bits. This function is used during the pre-compile phase when we are trying
2517 to find out the amount of memory needed, as well as during the real compile
2518 phase. The value of lengthptr distinguishes the two phases.
2521 optionsptr pointer to the option bits
2522 codeptr points to the pointer to the current code point
2523 ptrptr points to the current pattern pointer
2524 errorcodeptr points to error code variable
2525 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2526 reqbyteptr set to the last literal character required, else < 0
2527 bcptr points to current branch chain
2528 cd contains pointers to tables etc.
2529 lengthptr NULL during the real compile phase
2530 points to length accumulator during pre-compile phase
2532 Returns: TRUE on success
2533 FALSE, with *errorcodeptr set non-zero on error
2537 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2538 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2539 compile_data *cd, int *lengthptr)
2541 int repeat_type, op_type;
2542 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2544 int greedy_default, greedy_non_default;
2545 int firstbyte, reqbyte;
2546 int zeroreqbyte, zerofirstbyte;
2547 int req_caseopt, reqvary, tempreqvary;
2548 int options = *optionsptr;
2549 int after_manual_callout = 0;
2550 int length_prevgroup = 0;
2552 register uschar *code = *codeptr;
2553 uschar *last_code = code;
2554 uschar *orig_code = code;
2556 BOOL inescq = FALSE;
2557 BOOL groupsetfirstbyte = FALSE;
2558 const uschar *ptr = *ptrptr;
2559 const uschar *tempptr;
2560 uschar *previous = NULL;
2561 uschar *previous_callout = NULL;
2562 uschar *save_hwm = NULL;
2563 uschar classbits[32];
2567 BOOL utf8 = (options & PCRE_UTF8) != 0;
2568 uschar *class_utf8data;
2569 uschar *class_utf8data_base;
2570 uschar utf8_char[6];
2573 uschar *utf8_char = NULL;
2577 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2580 /* Set up the default and non-default settings for greediness */
2582 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2583 greedy_non_default = greedy_default ^ 1;
2585 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2586 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2587 matches a non-fixed first char; reqbyte just remains unset if we never
2590 When we hit a repeat whose minimum is zero, we may have to adjust these values
2591 to take the zero repeat into account. This is implemented by setting them to
2592 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2593 item types that can be repeated set these backoff variables appropriately. */
2595 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2597 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2598 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2599 value > 255. It is added into the firstbyte or reqbyte variables to record the
2600 case status of the value. This is used only for ASCII characters. */
2602 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2604 /* Switch on next character until the end of the branch */
2609 BOOL should_flip_negation;
2610 BOOL possessive_quantifier;
2613 BOOL reset_bracount;
2614 int class_charcount;
2626 /* Get next byte in the pattern */
2630 /* If we are in the pre-compile phase, accumulate the length used for the
2631 previous cycle of this loop. */
2633 if (lengthptr != NULL)
2636 if (code > cd->hwm) cd->hwm = code; /* High water info */
2638 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
2640 *errorcodeptr = ERR52;
2644 /* There is at least one situation where code goes backwards: this is the
2645 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2646 the class is simply eliminated. However, it is created first, so we have to
2647 allow memory for it. Therefore, don't ever reduce the length at this point.
2650 if (code < last_code) code = last_code;
2652 /* Paranoid check for integer overflow */
2654 if (OFLOW_MAX - *lengthptr < code - last_code)
2656 *errorcodeptr = ERR20;
2660 *lengthptr += code - last_code;
2661 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2663 /* If "previous" is set and it is not at the start of the work space, move
2664 it back to there, in order to avoid filling up the work space. Otherwise,
2665 if "previous" is NULL, reset the current code pointer to the start. */
2667 if (previous != NULL)
2669 if (previous > orig_code)
2671 memmove(orig_code, previous, code - previous);
2672 code -= previous - orig_code;
2673 previous = orig_code;
2676 else code = orig_code;
2678 /* Remember where this code item starts so we can pick up the length
2684 /* In the real compile phase, just check the workspace used by the forward
2687 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
2689 *errorcodeptr = ERR52;
2693 /* If in \Q...\E, check for the end; if not, we have a literal */
2695 if (inescq && c != 0)
2697 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2705 if (previous_callout != NULL)
2707 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2708 complete_callout(previous_callout, ptr, cd);
2709 previous_callout = NULL;
2711 if ((options & PCRE_AUTO_CALLOUT) != 0)
2713 previous_callout = code;
2714 code = auto_callout(code, ptr, cd);
2720 /* Fill in length of a previous callout, except when the next thing is
2724 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2725 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2727 if (!is_quantifier && previous_callout != NULL &&
2728 after_manual_callout-- <= 0)
2730 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2731 complete_callout(previous_callout, ptr, cd);
2732 previous_callout = NULL;
2735 /* In extended mode, skip white space and comments */
2737 if ((options & PCRE_EXTENDED) != 0)
2739 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2740 if (c == CHAR_NUMBER_SIGN)
2742 while (*(++ptr) != 0)
2744 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2746 if (*ptr != 0) continue;
2748 /* Else fall through to handle end of string */
2753 /* No auto callout for quantifiers. */
2755 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2757 previous_callout = code;
2758 code = auto_callout(code, ptr, cd);
2763 /* ===================================================================*/
2764 case 0: /* The branch terminates at string end */
2765 case CHAR_VERTICAL_LINE: /* or | or ) */
2766 case CHAR_RIGHT_PARENTHESIS:
2767 *firstbyteptr = firstbyte;
2768 *reqbyteptr = reqbyte;
2771 if (lengthptr != NULL)
2773 if (OFLOW_MAX - *lengthptr < code - last_code)
2775 *errorcodeptr = ERR20;
2778 *lengthptr += code - last_code; /* To include callout length */
2779 DPRINTF((">> end branch\n"));
2784 /* ===================================================================*/
2785 /* Handle single-character metacharacters. In multiline mode, ^ disables
2786 the setting of any following char as a first character. */
2788 case CHAR_CIRCUMFLEX_ACCENT:
2789 if ((options & PCRE_MULTILINE) != 0)
2791 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2797 case CHAR_DOLLAR_SIGN:
2802 /* There can never be a first char if '.' is first, whatever happens about
2803 repeats. The value of reqbyte doesn't change either. */
2806 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2807 zerofirstbyte = firstbyte;
2808 zeroreqbyte = reqbyte;
2810 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2814 /* ===================================================================*/
2815 /* Character classes. If the included characters are all < 256, we build a
2816 32-byte bitmap of the permitted characters, except in the special case
2817 where there is only one such character. For negated classes, we build the
2818 map as usual, then invert it at the end. However, we use a different opcode
2819 so that data characters > 255 can be handled correctly.
2821 If the class contains characters outside the 0-255 range, a different
2822 opcode is compiled. It may optionally have a bit map for characters < 256,
2823 but those above are explicitly listed afterwards. A flag byte tells
2824 whether the bitmap is present, and whether this is a negated class or not.
2826 In JavaScript compatibility mode, an isolated ']' causes an error. In
2827 default (Perl) mode, it is treated as a data character. */
2829 case CHAR_RIGHT_SQUARE_BRACKET:
2830 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2832 *errorcodeptr = ERR64;
2837 case CHAR_LEFT_SQUARE_BRACKET:
2840 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2841 they are encountered at the top level, so we'll do that too. */
2843 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2844 ptr[1] == CHAR_EQUALS_SIGN) &&
2845 check_posix_syntax(ptr, &tempptr))
2847 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2851 /* If the first character is '^', set the negation flag and skip it. Also,
2852 if the first few characters (either before or after ^) are \Q\E or \E we
2853 skip them too. This makes for compatibility with Perl. */
2855 negate_class = FALSE;
2859 if (c == CHAR_BACKSLASH)
2861 if (ptr[1] == CHAR_E)
2863 else if (strncmp((const char *)ptr+1,
2864 STR_Q STR_BACKSLASH STR_E, 3) == 0)
2869 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2870 negate_class = TRUE;
2874 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2875 an initial ']' is taken as a data character -- the code below handles
2876 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2877 [^] must match any character, so generate OP_ALLANY. */
2879 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2880 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2882 *code++ = negate_class? OP_ALLANY : OP_FAIL;
2883 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2884 zerofirstbyte = firstbyte;
2888 /* If a class contains a negative special such as \S, we need to flip the
2889 negation flag at the end, so that support for characters > 255 works
2890 correctly (they are all included in the class). */
2892 should_flip_negation = FALSE;
2894 /* Keep a count of chars with values < 256 so that we can optimize the case
2895 of just a single character (as long as it's < 256). However, for higher
2896 valued UTF-8 characters, we don't yet do any optimization. */
2898 class_charcount = 0;
2899 class_lastchar = -1;
2901 /* Initialize the 32-char bit map to all zeros. We build the map in a
2902 temporary bit of memory, in case the class contains only 1 character (less
2903 than 256), because in that case the compiled code doesn't use the bit map.
2906 memset(classbits, 0, 32 * sizeof(uschar));
2909 class_utf8 = FALSE; /* No chars >= 256 */
2910 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2911 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2914 /* Process characters until ] is reached. By writing this as a "do" it
2915 means that an initial ] is taken as a data character. At the start of the
2916 loop, c contains the first byte of the character. */
2920 const uschar *oldptr;
2923 if (utf8 && c > 127)
2924 { /* Braces are required because the */
2925 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2928 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2929 data and reset the pointer. This is so that very large classes that
2930 contain a zillion UTF-8 characters no longer overwrite the work space
2931 (which is on the stack). */
2933 if (lengthptr != NULL)
2935 *lengthptr += class_utf8data - class_utf8data_base;
2936 class_utf8data = class_utf8data_base;
2941 /* Inside \Q...\E everything is literal except \E */
2945 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
2947 inescq = FALSE; /* Reset literal state */
2948 ptr++; /* Skip the 'E' */
2949 continue; /* Carry on with next */
2951 goto CHECK_RANGE; /* Could be range if \E follows */
2954 /* Handle POSIX class names. Perl allows a negation extension of the
2955 form [:^name:]. A square bracket that doesn't match the syntax is
2956 treated as a literal. We also recognize the POSIX constructions
2957 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2960 if (c == CHAR_LEFT_SQUARE_BRACKET &&
2961 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2962 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
2964 BOOL local_negate = FALSE;
2965 int posix_class, taboffset, tabopt;
2966 register const uschar *cbits = cd->cbits;
2969 if (ptr[1] != CHAR_COLON)
2971 *errorcodeptr = ERR31;
2976 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2978 local_negate = TRUE;
2979 should_flip_negation = TRUE; /* Note negative special */
2983 posix_class = check_posix_name(ptr, tempptr - ptr);
2984 if (posix_class < 0)
2986 *errorcodeptr = ERR30;
2990 /* If matching is caseless, upper and lower are converted to
2991 alpha. This relies on the fact that the class table starts with
2992 alpha, lower, upper as the first 3 entries. */
2994 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2997 /* We build the bit map for the POSIX class in a chunk of local store
2998 because we may be adding and subtracting from it, and we don't want to
2999 subtract bits that may be in the main map already. At the end we or the
3000 result into the bit map that is being built. */
3004 /* Copy in the first table (always present) */
3006 memcpy(pbits, cbits + posix_class_maps[posix_class],
3007 32 * sizeof(uschar));
3009 /* If there is a second table, add or remove it as required. */
3011 taboffset = posix_class_maps[posix_class + 1];
3012 tabopt = posix_class_maps[posix_class + 2];
3017 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3019 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3022 /* Now see if we need to remove any special characters. An option
3023 value of 1 removes vertical space and 2 removes underscore. */
3025 if (tabopt < 0) tabopt = -tabopt;
3026 if (tabopt == 1) pbits[1] &= ~0x3c;
3027 else if (tabopt == 2) pbits[11] &= 0x7f;
3029 /* Add the POSIX table or its complement into the main table that is
3030 being built and we are done. */
3033 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3035 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3038 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3039 continue; /* End of POSIX syntax handling */
3042 /* Backslash may introduce a single character, or it may introduce one
3043 of the specials, which just set a flag. The sequence \b is a special
3044 case. Inside a class (and only there) it is treated as backspace.
3045 Elsewhere it marks a word boundary. Other escapes have preset maps ready
3046 to 'or' into the one we are building. We assume they have more than one
3047 character in them, so set class_charcount bigger than one. */
3049 if (c == CHAR_BACKSLASH)
3051 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3052 if (*errorcodeptr != 0) goto FAILED;
3054 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3055 else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3056 else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3057 else if (-c == ESC_Q) /* Handle start of quoted string */
3059 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3061 ptr += 2; /* avoid empty string */
3066 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3070 register const uschar *cbits = cd->cbits;
3071 class_charcount += 2; /* Greater than 1 is what matters */
3073 /* Save time by not doing this in the pre-compile phase. */
3075 if (lengthptr == NULL) switch (-c)
3078 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3082 should_flip_negation = TRUE;
3083 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3087 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3091 should_flip_negation = TRUE;
3092 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3096 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3097 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3101 should_flip_negation = TRUE;
3102 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3103 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3106 default: /* Not recognized; fall through */
3107 break; /* Need "default" setting to stop compiler warning. */
3110 /* In the pre-compile phase, just do the recognition. */
3112 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3113 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3115 /* We need to deal with \H, \h, \V, and \v in both phases because
3116 they use extra memory. */
3120 SETBIT(classbits, 0x09); /* VT */
3121 SETBIT(classbits, 0x20); /* SPACE */
3122 SETBIT(classbits, 0xa0); /* NSBP */
3127 *class_utf8data++ = XCL_SINGLE;
3128 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3129 *class_utf8data++ = XCL_SINGLE;
3130 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3131 *class_utf8data++ = XCL_RANGE;
3132 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3133 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3134 *class_utf8data++ = XCL_SINGLE;
3135 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3136 *class_utf8data++ = XCL_SINGLE;
3137 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3138 *class_utf8data++ = XCL_SINGLE;
3139 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3147 for (c = 0; c < 32; c++)
3152 case 0x09/8: x ^= 1 << (0x09%8); break;
3153 case 0x20/8: x ^= 1 << (0x20%8); break;
3154 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3164 *class_utf8data++ = XCL_RANGE;
3165 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3166 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3167 *class_utf8data++ = XCL_RANGE;
3168 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3169 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3170 *class_utf8data++ = XCL_RANGE;
3171 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3172 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3173 *class_utf8data++ = XCL_RANGE;
3174 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3175 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3176 *class_utf8data++ = XCL_RANGE;
3177 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3178 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3179 *class_utf8data++ = XCL_RANGE;
3180 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3181 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3182 *class_utf8data++ = XCL_RANGE;
3183 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3184 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3192 SETBIT(classbits, 0x0a); /* LF */
3193 SETBIT(classbits, 0x0b); /* VT */
3194 SETBIT(classbits, 0x0c); /* FF */
3195 SETBIT(classbits, 0x0d); /* CR */
3196 SETBIT(classbits, 0x85); /* NEL */
3201 *class_utf8data++ = XCL_RANGE;
3202 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3203 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3211 for (c = 0; c < 32; c++)
3216 case 0x0a/8: x ^= 1 << (0x0a%8);
3221 case 0x85/8: x ^= 1 << (0x85%8); break;
3231 *class_utf8data++ = XCL_RANGE;
3232 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3233 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3234 *class_utf8data++ = XCL_RANGE;
3235 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3236 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3242 /* We need to deal with \P and \p in both phases. */
3245 if (-c == ESC_p || -c == ESC_P)
3249 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3250 if (ptype < 0) goto FAILED;
3252 *class_utf8data++ = ((-c == ESC_p) != negated)?
3253 XCL_PROP : XCL_NOTPROP;
3254 *class_utf8data++ = ptype;
3255 *class_utf8data++ = pdata;
3256 class_charcount -= 2; /* Not a < 256 character */
3260 /* Unrecognized escapes are faulted if PCRE is running in its
3261 strict mode. By default, for compatibility with Perl, they are
3262 treated as literals. */
3264 if ((options & PCRE_EXTRA) != 0)
3266 *errorcodeptr = ERR7;
3270 class_charcount -= 2; /* Undo the default count from above */
3271 c = *ptr; /* Get the final character and fall through */
3274 /* Fall through if we have a single character (c >= 0). This may be
3275 greater than 256 in UTF-8 mode. */
3277 } /* End of backslash handling */
3279 /* A single character may be followed by '-' to form a range. However,
3280 Perl does not permit ']' to be the end of the range. A '-' character
3281 at the end is treated as a literal. Perl ignores orphaned \E sequences
3282 entirely. The code for handling \Q and \E is messy. */
3285 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3293 /* Remember \r or \n */
3295 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3297 /* Check for range */
3299 if (!inescq && ptr[1] == CHAR_MINUS)
3303 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3305 /* If we hit \Q (not followed by \E) at this point, go into escaped
3308 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3311 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3312 { ptr += 2; continue; }
3317 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3320 goto LONE_SINGLE_CHARACTER;
3325 { /* Braces are required because the */
3326 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3330 d = *ptr; /* Not UTF-8 mode */
3332 /* The second part of a range can be a single-character escape, but
3333 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3334 in such circumstances. */
3336 if (!inescq && d == CHAR_BACKSLASH)
3338 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3339 if (*errorcodeptr != 0) goto FAILED;
3341 /* \b is backspace; \X is literal X; \R is literal R; any other
3342 special means the '-' was literal */
3346 if (d == -ESC_b) d = CHAR_BS;
3347 else if (d == -ESC_X) d = CHAR_X;
3348 else if (d == -ESC_R) d = CHAR_R; else
3351 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3356 /* Check that the two values are in the correct order. Optimize
3357 one-character ranges */
3361 *errorcodeptr = ERR8;
3365 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3367 /* Remember \r or \n */
3369 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3371 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3372 matching, we have to use an XCLASS with extra data items. Caseless
3373 matching for characters > 127 is available only if UCP support is
3377 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3381 /* With UCP support, we can find the other case equivalents of
3382 the relevant characters. There may be several ranges. Optimize how
3383 they fit with the basic range. */
3386 if ((options & PCRE_CASELESS) != 0)
3388 unsigned int occ, ocd;
3389 unsigned int cc = c;
3390 unsigned int origd = d;
3391 while (get_othercase_range(&cc, origd, &occ, &ocd))
3393 if (occ >= (unsigned int)c &&
3394 ocd <= (unsigned int)d)
3395 continue; /* Skip embedded ranges */
3397 if (occ < (unsigned int)c &&
3398 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3399 { /* if there is overlap, */
3400 c = occ; /* noting that if occ < c */
3401 continue; /* we can't have ocd > d */
3402 } /* because a subrange is */
3403 if (ocd > (unsigned int)d &&
3404 occ <= (unsigned int)d + 1) /* always shorter than */
3405 { /* the basic range. */
3412 *class_utf8data++ = XCL_SINGLE;
3416 *class_utf8data++ = XCL_RANGE;
3417 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3419 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3422 #endif /* SUPPORT_UCP */
3424 /* Now record the original range, possibly modified for UCP caseless
3425 overlapping ranges. */
3427 *class_utf8data++ = XCL_RANGE;
3428 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3429 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3431 /* With UCP support, we are done. Without UCP support, there is no
3432 caseless matching for UTF-8 characters > 127; we can use the bit map
3433 for the smaller ones. */
3436 continue; /* With next character in the class */
3438 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3440 /* Adjust upper limit and fall through to set up the map */
3444 #endif /* SUPPORT_UCP */
3446 #endif /* SUPPORT_UTF8 */
3448 /* We use the bit map for all cases when not in UTF-8 mode; else
3449 ranges that lie entirely within 0-127 when there is UCP support; else
3450 for partial ranges without UCP support. */
3452 class_charcount += d - c + 1;
3455 /* We can save a bit of time by skipping this in the pre-compile. */
3457 if (lengthptr == NULL) for (; c <= d; c++)
3459 classbits[c/8] |= (1 << (c&7));
3460 if ((options & PCRE_CASELESS) != 0)
3462 int uc = cd->fcc[c]; /* flip case */
3463 classbits[uc/8] |= (1 << (uc&7));
3467 continue; /* Go get the next char in the class */
3470 /* Handle a lone single character - we can get here for a normal
3471 non-escape char, or after \ that introduces a single character or for an
3472 apparent range that isn't. */
3474 LONE_SINGLE_CHARACTER:
3476 /* Handle a character that cannot go in the bit map */
3479 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3482 *class_utf8data++ = XCL_SINGLE;
3483 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3486 if ((options & PCRE_CASELESS) != 0)
3488 unsigned int othercase;
3489 if ((othercase = UCD_OTHERCASE(c)) != c)
3491 *class_utf8data++ = XCL_SINGLE;
3492 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3495 #endif /* SUPPORT_UCP */
3499 #endif /* SUPPORT_UTF8 */
3501 /* Handle a single-byte character */
3503 classbits[c/8] |= (1 << (c&7));
3504 if ((options & PCRE_CASELESS) != 0)
3506 c = cd->fcc[c]; /* flip case */
3507 classbits[c/8] |= (1 << (c&7));
3514 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3516 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3518 if (c == 0) /* Missing terminating ']' */
3520 *errorcodeptr = ERR6;
3525 /* This code has been disabled because it would mean that \s counts as
3526 an explicit \r or \n reference, and that's not really what is wanted. Now
3527 we set the flag only if there is a literal "\r" or "\n" in the class. */
3530 /* Remember whether \r or \n are in this class */
3534 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3538 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3543 /* If class_charcount is 1, we saw precisely one character whose value is
3544 less than 256. As long as there were no characters >= 128 and there was no
3545 use of \p or \P, in other words, no use of any XCLASS features, we can
3548 In UTF-8 mode, we can optimize the negative case only if there were no
3549 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3550 operate on single-bytes only. This is an historical hangover. Maybe one day
3551 we can tidy these opcodes to handle multi-byte characters.
3553 The optimization throws away the bit map. We turn the item into a
3554 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3555 that OP_NOT does not support multibyte characters. In the positive case, it
3556 can cause firstbyte to be set. Otherwise, there can be no first char if
3557 this item is first, whatever repeat count may follow. In the case of
3558 reqbyte, save the previous value for reinstating. */
3561 if (class_charcount == 1 && !class_utf8 &&
3562 (!utf8 || !negate_class || class_lastchar < 128))
3564 if (class_charcount == 1)
3567 zeroreqbyte = reqbyte;
3569 /* The OP_NOT opcode works on one-byte characters only. */
3573 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3574 zerofirstbyte = firstbyte;
3576 *code++ = class_lastchar;
3580 /* For a single, positive character, get the value into mcbuffer, and
3581 then we can handle this with the normal one-character code. */
3584 if (utf8 && class_lastchar > 127)
3585 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3589 mcbuffer[0] = class_lastchar;
3593 } /* End of 1-char optimization */
3595 /* The general case - not the one-char optimization. If this is the first
3596 thing in the branch, there can be no first char setting, whatever the
3597 repeat count. Any reqbyte setting must remain unchanged after any kind of
3600 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3601 zerofirstbyte = firstbyte;
3602 zeroreqbyte = reqbyte;
3604 /* If there are characters with values > 255, we have to compile an
3605 extended class, with its own opcode, unless there was a negated special
3606 such as \S in the class, because in that case all characters > 255 are in
3607 the class, so any that were explicitly given as well can be ignored. If
3608 (when there are explicit characters > 255 that must be listed) there are no
3609 characters < 256, we can omit the bitmap in the actual compiled code. */
3612 if (class_utf8 && !should_flip_negation)
3614 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3615 *code++ = OP_XCLASS;
3617 *code = negate_class? XCL_NOT : 0;
3619 /* If the map is required, move up the extra data to make room for it;
3620 otherwise just move the code pointer to the end of the extra data. */
3622 if (class_charcount > 0)
3625 memmove(code + 32, code, class_utf8data - code);
3626 memcpy(code, classbits, 32);
3627 code = class_utf8data + 32;
3629 else code = class_utf8data;
3631 /* Now fill in the complete length of the item */
3633 PUT(previous, 1, code - previous);
3634 break; /* End of class handling */
3638 /* If there are no characters > 255, set the opcode to OP_CLASS or
3639 OP_NCLASS, depending on whether the whole class was negated and whether
3640 there were negative specials such as \S in the class. Then copy the 32-byte
3641 map into the code vector, negating it if necessary. */
3643 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3646 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3647 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3651 memcpy(code, classbits, 32);
3657 /* ===================================================================*/
3658 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3659 has been tested above. */
3661 case CHAR_LEFT_CURLY_BRACKET:
3662 if (!is_quantifier) goto NORMAL_CHAR;
3663 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3664 if (*errorcodeptr != 0) goto FAILED;
3677 case CHAR_QUESTION_MARK:
3682 if (previous == NULL)
3684 *errorcodeptr = ERR9;
3688 if (repeat_min == 0)
3690 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3691 reqbyte = zeroreqbyte; /* Ditto */
3694 /* Remember whether this is a variable length repeat */
3696 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3698 op_type = 0; /* Default single-char op codes */
3699 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3701 /* Save start of previous item, in case we have to move it up to make space
3702 for an inserted OP_ONCE for the additional '+' extension. */
3704 tempcode = previous;
3706 /* If the next character is '+', we have a possessive quantifier. This
3707 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3708 If the next character is '?' this is a minimizing repeat, by default,
3709 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3710 repeat type to the non-default. */
3712 if (ptr[1] == CHAR_PLUS)
3714 repeat_type = 0; /* Force greedy */
3715 possessive_quantifier = TRUE;
3718 else if (ptr[1] == CHAR_QUESTION_MARK)
3720 repeat_type = greedy_non_default;
3723 else repeat_type = greedy_default;
3725 /* If previous was a character match, abolish the item and generate a
3726 repeat item instead. If a char item has a minimum of more than one, ensure
3727 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3728 the first thing in a branch because the x will have gone into firstbyte
3731 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3733 /* Deal with UTF-8 characters that take up more than one byte. It's
3734 easier to write this out separately than try to macrify it. Use c to
3735 hold the length of the character in bytes, plus 0x80 to flag that it's a
3736 length rather than a small character. */
3739 if (utf8 && (code[-1] & 0x80) != 0)
3741 uschar *lastchar = code - 1;
3742 while((*lastchar & 0xc0) == 0x80) lastchar--;
3743 c = code - lastchar; /* Length of UTF-8 character */
3744 memcpy(utf8_char, lastchar, c); /* Save the char */
3745 c |= 0x80; /* Flag c as a length */
3750 /* Handle the case of a single byte - either with no UTF8 support, or
3751 with UTF-8 disabled, or for a UTF-8 character < 128. */
3755 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3758 /* If the repetition is unlimited, it pays to see if the next thing on
3759 the line is something that cannot possibly match this character. If so,
3760 automatically possessifying this item gains some performance in the case
3761 where the match fails. */
3763 if (!possessive_quantifier &&
3765 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3768 repeat_type = 0; /* Force greedy */
3769 possessive_quantifier = TRUE;
3772 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3775 /* If previous was a single negated character ([^a] or similar), we use
3776 one of the special opcodes, replacing it. The code is shared with single-
3777 character repeats by setting op_type to add a suitable offset into
3778 repeat_type. We can also test for auto-possessification. OP_NOT is
3779 currently used only for single-byte chars. */
3781 else if (*previous == OP_NOT)
3783 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3785 if (!possessive_quantifier &&
3787 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3789 repeat_type = 0; /* Force greedy */
3790 possessive_quantifier = TRUE;
3792 goto OUTPUT_SINGLE_REPEAT;
3795 /* If previous was a character type match (\d or similar), abolish it and
3796 create a suitable repeat item. The code is shared with single-character
3797 repeats by setting op_type to add a suitable offset into repeat_type. Note
3798 that the Unicode property types will be present only when SUPPORT_UCP is
3799 defined, but we don't wrap the little bits of code here because it just
3800 makes it horribly messy. */
3802 else if (*previous < OP_EODN)
3805 int prop_type, prop_value;
3806 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3809 if (!possessive_quantifier &&
3811 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3813 repeat_type = 0; /* Force greedy */
3814 possessive_quantifier = TRUE;
3817 OUTPUT_SINGLE_REPEAT:
3818 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3820 prop_type = previous[1];
3821 prop_value = previous[2];
3823 else prop_type = prop_value = -1;
3826 code = previous; /* Usually overwrite previous item */
3828 /* If the maximum is zero then the minimum must also be zero; Perl allows
3829 this case, so we do too - by simply omitting the item altogether. */
3831 if (repeat_max == 0) goto END_REPEAT;
3833 /*--------------------------------------------------------------------*/
3834 /* This code is obsolete from release 8.00; the restriction was finally
3837 /* All real repeats make it impossible to handle partial matching (maybe
3838 one day we will be able to remove this restriction). */
3840 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3841 /*--------------------------------------------------------------------*/
3843 /* Combine the op_type with the repeat_type */
3845 repeat_type += op_type;
3847 /* A minimum of zero is handled either as the special case * or ?, or as
3848 an UPTO, with the maximum given. */
3850 if (repeat_min == 0)
3852 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3853 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3856 *code++ = OP_UPTO + repeat_type;
3857 PUT2INC(code, 0, repeat_max);
3861 /* A repeat minimum of 1 is optimized into some special cases. If the
3862 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3863 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3864 one less than the maximum. */
3866 else if (repeat_min == 1)
3868 if (repeat_max == -1)
3869 *code++ = OP_PLUS + repeat_type;
3872 code = oldcode; /* leave previous item in place */
3873 if (repeat_max == 1) goto END_REPEAT;
3874 *code++ = OP_UPTO + repeat_type;
3875 PUT2INC(code, 0, repeat_max - 1);
3879 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3880 handled as an EXACT followed by an UPTO. */
3884 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3885 PUT2INC(code, 0, repeat_min);
3887 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3888 we have to insert the character for the previous code. For a repeated
3889 Unicode property match, there are two extra bytes that define the
3890 required property. In UTF-8 mode, long characters have their length in
3891 c, with the 0x80 bit as a flag. */
3896 if (utf8 && c >= 128)
3898 memcpy(code, utf8_char, c & 7);
3907 *code++ = prop_type;
3908 *code++ = prop_value;
3911 *code++ = OP_STAR + repeat_type;
3914 /* Else insert an UPTO if the max is greater than the min, again
3915 preceded by the character, for the previously inserted code. If the
3916 UPTO is just for 1 instance, we can use QUERY instead. */
3918 else if (repeat_max != repeat_min)
3921 if (utf8 && c >= 128)
3923 memcpy(code, utf8_char, c & 7);
3931 *code++ = prop_type;
3932 *code++ = prop_value;
3934 repeat_max -= repeat_min;
3936 if (repeat_max == 1)
3938 *code++ = OP_QUERY + repeat_type;
3942 *code++ = OP_UPTO + repeat_type;
3943 PUT2INC(code, 0, repeat_max);
3948 /* The character or character type itself comes last in all cases. */
3951 if (utf8 && c >= 128)
3953 memcpy(code, utf8_char, c & 7);
3960 /* For a repeated Unicode property match, there are two extra bytes that
3961 define the required property. */
3966 *code++ = prop_type;
3967 *code++ = prop_value;
3972 /* If previous was a character class or a back reference, we put the repeat
3973 stuff after it, but just skip the item if the repeat was {0,0}. */
3975 else if (*previous == OP_CLASS ||
3976 *previous == OP_NCLASS ||
3978 *previous == OP_XCLASS ||
3980 *previous == OP_REF)
3982 if (repeat_max == 0)
3988 /*--------------------------------------------------------------------*/
3989 /* This code is obsolete from release 8.00; the restriction was finally
3992 /* All real repeats make it impossible to handle partial matching (maybe
3993 one day we will be able to remove this restriction). */
3995 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3996 /*--------------------------------------------------------------------*/
3998 if (repeat_min == 0 && repeat_max == -1)
3999 *code++ = OP_CRSTAR + repeat_type;
4000 else if (repeat_min == 1 && repeat_max == -1)
4001 *code++ = OP_CRPLUS + repeat_type;
4002 else if (repeat_min == 0 && repeat_max == 1)
4003 *code++ = OP_CRQUERY + repeat_type;
4006 *code++ = OP_CRRANGE + repeat_type;
4007 PUT2INC(code, 0, repeat_min);
4008 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4009 PUT2INC(code, 0, repeat_max);
4013 /* If previous was a bracket group, we may have to replicate it in certain
4016 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4017 *previous == OP_ONCE || *previous == OP_COND)
4021 int len = code - previous;
4022 uschar *bralink = NULL;
4024 /* Repeating a DEFINE group is pointless */
4026 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4028 *errorcodeptr = ERR55;
4032 /* If the maximum repeat count is unlimited, find the end of the bracket
4033 by scanning through from the start, and compute the offset back to it
4034 from the current code pointer. There may be an OP_OPT setting following
4035 the final KET, so we can't find the end just by going back from the code
4038 if (repeat_max == -1)
4040 register uschar *ket = previous;
4041 do ket += GET(ket, 1); while (*ket != OP_KET);
4042 ketoffset = code - ket;
4045 /* The case of a zero minimum is special because of the need to stick
4046 OP_BRAZERO in front of it, and because the group appears once in the
4047 data, whereas in other cases it appears the minimum number of times. For
4048 this reason, it is simplest to treat this case separately, as otherwise
4049 the code gets far too messy. There are several special subcases when the
4052 if (repeat_min == 0)
4054 /* If the maximum is also zero, we used to just omit the group from the
4055 output altogether, like this:
4057 ** if (repeat_max == 0)
4063 However, that fails when a group is referenced as a subroutine from
4064 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4065 so that it is skipped on execution. As we don't have a list of which
4066 groups are referenced, we cannot do this selectively.
4068 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4069 and do no more at this point. However, we do need to adjust any
4070 OP_RECURSE calls inside the group that refer to the group itself or any
4071 internal or forward referenced group, because the offset is from the
4072 start of the whole regex. Temporarily terminate the pattern while doing
4075 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4078 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4079 memmove(previous+1, previous, len);
4081 if (repeat_max == 0)
4083 *previous++ = OP_SKIPZERO;
4086 *previous++ = OP_BRAZERO + repeat_type;
4089 /* If the maximum is greater than 1 and limited, we have to replicate
4090 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4091 The first one has to be handled carefully because it's the original
4092 copy, which has to be moved up. The remainder can be handled by code
4093 that is common with the non-zero minimum case below. We have to
4094 adjust the value of repeat_max, since one less copy is required. Once
4095 again, we may have to adjust any OP_RECURSE calls inside the group. */
4101 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4102 memmove(previous + 2 + LINK_SIZE, previous, len);
4103 code += 2 + LINK_SIZE;
4104 *previous++ = OP_BRAZERO + repeat_type;
4105 *previous++ = OP_BRA;
4107 /* We chain together the bracket offset fields that have to be
4108 filled in later when the ends of the brackets are reached. */
4110 offset = (bralink == NULL)? 0 : previous - bralink;
4112 PUTINC(previous, 0, offset);
4118 /* If the minimum is greater than zero, replicate the group as many
4119 times as necessary, and adjust the maximum to the number of subsequent
4120 copies that we need. If we set a first char from the group, and didn't
4121 set a required char, copy the latter from the former. If there are any
4122 forward reference subroutine calls in the group, there will be entries on
4123 the workspace list; replicate these with an appropriate increment. */
4129 /* In the pre-compile phase, we don't actually do the replication. We
4130 just adjust the length as if we had. Do some paranoid checks for
4131 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4132 integer type when available, otherwise double. */
4134 if (lengthptr != NULL)
4136 int delta = (repeat_min - 1)*length_prevgroup;
4137 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4138 (INT64_OR_DOUBLE)length_prevgroup >
4139 (INT64_OR_DOUBLE)INT_MAX ||
4140 OFLOW_MAX - *lengthptr < delta)
4142 *errorcodeptr = ERR20;
4145 *lengthptr += delta;
4148 /* This is compiling for real */
4152 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4153 for (i = 1; i < repeat_min; i++)
4156 uschar *this_hwm = cd->hwm;
4157 memcpy(code, previous, len);
4158 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4160 PUT(cd->hwm, 0, GET(hc, 0) + len);
4161 cd->hwm += LINK_SIZE;
4163 save_hwm = this_hwm;
4169 if (repeat_max > 0) repeat_max -= repeat_min;
4172 /* This code is common to both the zero and non-zero minimum cases. If
4173 the maximum is limited, it replicates the group in a nested fashion,
4174 remembering the bracket starts on a stack. In the case of a zero minimum,
4175 the first one was set up above. In all cases the repeat_max now specifies
4176 the number of additional copies needed. Again, we must remember to
4177 replicate entries on the forward reference list. */
4179 if (repeat_max >= 0)
4181 /* In the pre-compile phase, we don't actually do the replication. We
4182 just adjust the length as if we had. For each repetition we must add 1
4183 to the length for BRAZERO and for all but the last repetition we must
4184 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4185 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4186 a 64-bit integer type when available, otherwise double. */
4188 if (lengthptr != NULL && repeat_max > 0)
4190 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4191 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4192 if ((INT64_OR_DOUBLE)repeat_max *
4193 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4194 > (INT64_OR_DOUBLE)INT_MAX ||
4195 OFLOW_MAX - *lengthptr < delta)
4197 *errorcodeptr = ERR20;
4200 *lengthptr += delta;
4203 /* This is compiling for real */
4205 else for (i = repeat_max - 1; i >= 0; i--)
4208 uschar *this_hwm = cd->hwm;
4210 *code++ = OP_BRAZERO + repeat_type;
4212 /* All but the final copy start a new nesting, maintaining the
4213 chain of brackets outstanding. */
4219 offset = (bralink == NULL)? 0 : code - bralink;
4221 PUTINC(code, 0, offset);
4224 memcpy(code, previous, len);
4225 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4227 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4228 cd->hwm += LINK_SIZE;
4230 save_hwm = this_hwm;
4234 /* Now chain through the pending brackets, and fill in their length
4235 fields (which are holding the chain links pro tem). */
4237 while (bralink != NULL)
4240 int offset = code - bralink + 1;
4241 uschar *bra = code - offset;
4242 oldlinkoffset = GET(bra, 1);
4243 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4245 PUTINC(code, 0, offset);
4246 PUT(bra, 1, offset);
4250 /* If the maximum is unlimited, set a repeater in the final copy. We
4251 can't just offset backwards from the current code point, because we
4252 don't know if there's been an options resetting after the ket. The
4253 correct offset was computed above.
4255 Then, when we are doing the actual compile phase, check to see whether
4256 this group is a non-atomic one that could match an empty string. If so,
4257 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4258 that runtime checking can be done. [This check is also applied to
4259 atomic groups at runtime, but in a different way.] */
4263 uschar *ketcode = code - ketoffset;
4264 uschar *bracode = ketcode - GET(ketcode, 1);
4265 *ketcode = OP_KETRMAX + repeat_type;
4266 if (lengthptr == NULL && *bracode != OP_ONCE)
4268 uschar *scode = bracode;
4271 if (could_be_empty_branch(scode, ketcode, utf8, cd))
4273 *bracode += OP_SBRA - OP_BRA;
4276 scode += GET(scode, 1);
4278 while (*scode == OP_ALT);
4283 /* If previous is OP_FAIL, it was generated by an empty class [] in
4284 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4285 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4286 error above. We can just ignore the repeat in JS case. */
4288 else if (*previous == OP_FAIL) goto END_REPEAT;
4290 /* Else there's some kind of shambles */
4294 *errorcodeptr = ERR11;
4298 /* If the character following a repeat is '+', or if certain optimization
4299 tests above succeeded, possessive_quantifier is TRUE. For some of the
4300 simpler opcodes, there is a special alternative opcode for this. For
4301 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4302 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4303 but the special opcodes can optimize it a bit. The repeated item starts at
4304 tempcode, not at previous, which might be the first part of a string whose
4305 (former) last char we repeated.
4307 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4308 an 'upto' may follow. We skip over an 'exact' item, and then test the
4309 length of what remains before proceeding. */
4311 if (possessive_quantifier)
4315 if (*tempcode == OP_TYPEEXACT)
4316 tempcode += _pcre_OP_lengths[*tempcode] +
4317 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4319 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4321 tempcode += _pcre_OP_lengths[*tempcode];
4323 if (utf8 && tempcode[-1] >= 0xc0)
4324 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4328 len = code - tempcode;
4329 if (len > 0) switch (*tempcode)
4331 case OP_STAR: *tempcode = OP_POSSTAR; break;
4332 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4333 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4334 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4336 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4337 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4338 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4339 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4341 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4342 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4343 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4344 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4346 /* Because we are moving code along, we must ensure that any
4347 pending recursive references are updated. */
4351 adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4352 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4353 code += 1 + LINK_SIZE;
4354 len += 1 + LINK_SIZE;
4355 tempcode[0] = OP_ONCE;
4357 PUTINC(code, 0, len);
4358 PUT(tempcode, 1, len);
4363 /* In all cases we no longer have a previous item. We also set the
4364 "follows varying string" flag for subsequently encountered reqbytes if
4365 it isn't already set and we have just passed a varying length item. */
4369 cd->req_varyopt |= reqvary;
4373 /* ===================================================================*/
4374 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4375 lookbehind or option setting or condition or all the other extended
4376 parenthesis forms. */
4378 case CHAR_LEFT_PARENTHESIS:
4379 newoptions = options;
4383 reset_bracount = FALSE;
4385 /* First deal with various "verbs" that can be introduced by '*'. */
4387 if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4390 const char *vn = verbnames;
4391 const uschar *name = ++ptr;
4393 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4394 if (*ptr == CHAR_COLON)
4396 *errorcodeptr = ERR59; /* Not supported */
4399 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4401 *errorcodeptr = ERR60;
4404 namelen = ptr - name;
4405 for (i = 0; i < verbcount; i++)
4407 if (namelen == verbs[i].len &&
4408 strncmp((char *)name, vn, namelen) == 0)
4410 /* Check for open captures before ACCEPT */
4412 if (verbs[i].op == OP_ACCEPT)
4415 cd->had_accept = TRUE;
4416 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4419 PUT2INC(code, 0, oc->number);
4422 *code++ = verbs[i].op;
4425 vn += verbs[i].len + 1;
4427 if (i < verbcount) continue;
4428 *errorcodeptr = ERR60;
4432 /* Deal with the extended parentheses; all are introduced by '?', and the
4433 appearance of any of them means that this is not a capturing group. */
4435 else if (*ptr == CHAR_QUESTION_MARK)
4437 int i, set, unset, namelen;
4444 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4446 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4449 *errorcodeptr = ERR18;
4455 /* ------------------------------------------------------------ */
4456 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4457 reset_bracount = TRUE;
4460 /* ------------------------------------------------------------ */
4461 case CHAR_COLON: /* Non-capturing bracket */
4467 /* ------------------------------------------------------------ */
4468 case CHAR_LEFT_PARENTHESIS:
4469 bravalue = OP_COND; /* Conditional group */
4471 /* A condition can be an assertion, a number (referring to a numbered
4472 group), a name (referring to a named group), or 'R', referring to
4473 recursion. R<digits> and R&name are also permitted for recursion tests.
4475 There are several syntaxes for testing a named group: (?(name)) is used
4476 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4478 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4479 be the recursive thing or the name 'R' (and similarly for 'R' followed
4480 by digits), and (b) a number could be a name that consists of digits.
4481 In both cases, we look for a name first; if not found, we try the other
4484 /* For conditions that are assertions, check the syntax, and then exit
4485 the switch. This will take control down to where bracketed groups,
4486 including assertions, are processed. */
4488 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4489 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4492 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4493 below), and all need to skip 3 bytes at the start of the group. */
4495 code[1+LINK_SIZE] = OP_CREF;
4499 /* Check for a test for recursion in a named group. */
4501 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4505 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4508 /* Check for a test for a named group's having been set, using the Perl
4509 syntax (?(<name>) or (?('name') */
4511 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4513 terminator = CHAR_GREATER_THAN_SIGN;
4516 else if (ptr[1] == CHAR_APOSTROPHE)
4518 terminator = CHAR_APOSTROPHE;
4524 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4527 /* We now expect to read a name; anything else is an error */
4529 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4531 ptr += 1; /* To get the right offset */
4532 *errorcodeptr = ERR28;
4536 /* Read the name, but also get it as a number if it's all digits */
4540 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4543 recno = (g_ascii_isdigit(*ptr) != 0)?
4544 recno * 10 + *ptr - CHAR_0 : -1;
4547 namelen = ptr - name;
4549 if ((terminator > 0 && *ptr++ != terminator) ||
4550 *ptr++ != CHAR_RIGHT_PARENTHESIS)
4552 ptr--; /* Error offset */
4553 *errorcodeptr = ERR26;
4557 /* Do no further checking in the pre-compile phase. */
4559 if (lengthptr != NULL) break;
4561 /* In the real compile we do the work of looking for the actual
4562 reference. If the string started with "+" or "-" we require the rest to
4563 be digits, in which case recno will be set. */
4569 *errorcodeptr = ERR58;
4572 recno = (refsign == CHAR_MINUS)?
4573 cd->bracount - recno + 1 : recno +cd->bracount;
4574 if (recno <= 0 || recno > cd->final_bracount)
4576 *errorcodeptr = ERR15;
4579 PUT2(code, 2+LINK_SIZE, recno);
4583 /* Otherwise (did not start with "+" or "-"), start by looking for the
4584 name. If we find a name, add one to the opcode to change OP_CREF or
4585 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4586 except they record that the reference was originally to a name. The
4587 information is used to check duplicate names. */
4589 slot = cd->name_table;
4590 for (i = 0; i < cd->names_found; i++)
4592 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4593 slot += cd->name_entry_size;
4596 /* Found a previous named subpattern */
4598 if (i < cd->names_found)
4600 recno = GET2(slot, 0);
4601 PUT2(code, 2+LINK_SIZE, recno);
4602 code[1+LINK_SIZE]++;
4605 /* Search the pattern for a forward reference */
4607 else if ((i = find_parens(cd, name, namelen,
4608 (options & PCRE_EXTENDED) != 0)) > 0)
4610 PUT2(code, 2+LINK_SIZE, i);
4611 code[1+LINK_SIZE]++;
4614 /* If terminator == 0 it means that the name followed directly after
4615 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4616 some further alternatives to try. For the cases where terminator != 0
4617 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4618 now checked all the possibilities, so give an error. */
4620 else if (terminator != 0)
4622 *errorcodeptr = ERR15;
4626 /* Check for (?(R) for recursion. Allow digits after R to specify a
4627 specific group number. */
4629 else if (*name == CHAR_R)
4632 for (i = 1; i < namelen; i++)
4634 if (g_ascii_isdigit(name[i]) == 0)
4636 *errorcodeptr = ERR15;
4639 recno = recno * 10 + name[i] - CHAR_0;
4641 if (recno == 0) recno = RREF_ANY;
4642 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4643 PUT2(code, 2+LINK_SIZE, recno);
4646 /* Similarly, check for the (?(DEFINE) "condition", which is always
4649 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4651 code[1+LINK_SIZE] = OP_DEF;
4655 /* Check for the "name" actually being a subpattern number. We are
4656 in the second pass here, so final_bracount is set. */
4658 else if (recno > 0 && recno <= cd->final_bracount)
4660 PUT2(code, 2+LINK_SIZE, recno);
4663 /* Either an unidentified subpattern, or a reference to (?(0) */
4667 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4673 /* ------------------------------------------------------------ */
4674 case CHAR_EQUALS_SIGN: /* Positive lookahead */
4675 bravalue = OP_ASSERT;
4680 /* ------------------------------------------------------------ */
4681 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4683 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4689 bravalue = OP_ASSERT_NOT;
4693 /* ------------------------------------------------------------ */
4694 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4697 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4698 bravalue = OP_ASSERTBACK;
4702 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4703 bravalue = OP_ASSERTBACK_NOT;
4707 default: /* Could be name define, else bad */
4708 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4709 ptr++; /* Correct offset for error */
4710 *errorcodeptr = ERR24;
4716 /* ------------------------------------------------------------ */
4717 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4723 /* ------------------------------------------------------------ */
4724 case CHAR_C: /* Callout - may be followed by digits; */
4725 previous_callout = code; /* Save for later completion */
4726 after_manual_callout = 1; /* Skip one item before completing */
4727 *code++ = OP_CALLOUT;
4730 while (g_ascii_isdigit(*(++ptr)) != 0)
4731 n = n * 10 + *ptr - CHAR_0;
4732 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4734 *errorcodeptr = ERR39;
4739 *errorcodeptr = ERR38;
4743 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4744 PUT(code, LINK_SIZE, 0); /* Default length */
4745 code += 2 * LINK_SIZE;
4751 /* ------------------------------------------------------------ */
4752 case CHAR_P: /* Python-style named subpattern handling */
4753 if (*(++ptr) == CHAR_EQUALS_SIGN ||
4754 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4756 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4757 terminator = CHAR_RIGHT_PARENTHESIS;
4758 goto NAMED_REF_OR_RECURSE;
4760 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4762 *errorcodeptr = ERR41;
4765 /* Fall through to handle (?P< as (?< is handled */
4768 /* ------------------------------------------------------------ */
4769 DEFINE_NAME: /* Come here from (?< handling */
4770 case CHAR_APOSTROPHE:
4772 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4773 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4776 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4777 namelen = ptr - name;
4779 /* In the pre-compile phase, just do a syntax check. */
4781 if (lengthptr != NULL)
4783 if (*ptr != terminator)
4785 *errorcodeptr = ERR42;
4788 if (cd->names_found >= MAX_NAME_COUNT)
4790 *errorcodeptr = ERR49;
4793 if (namelen + 3 > cd->name_entry_size)
4795 cd->name_entry_size = namelen + 3;
4796 if (namelen > MAX_NAME_SIZE)
4798 *errorcodeptr = ERR48;
4804 /* In the real compile, create the entry in the table, maintaining
4805 alphabetical order. Duplicate names for different numbers are
4806 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4807 number are always OK. (An existing number can be re-used if (?|
4808 appears in the pattern.) In either event, a duplicate name results in
4809 a duplicate entry in the table, even if the number is the same. This
4810 is because the number of names, and hence the table size, is computed
4811 in the pre-compile, and it affects various numbers and pointers which
4812 would all have to be modified, and the compiled code moved down, if
4813 duplicates with the same number were omitted from the table. This
4814 doesn't seem worth the hassle. However, *different* names for the
4815 same number are not permitted. */
4819 BOOL dupname = FALSE;
4820 slot = cd->name_table;
4822 for (i = 0; i < cd->names_found; i++)
4824 int crc = memcmp(name, slot+2, namelen);
4827 if (slot[2+namelen] == 0)
4829 if (GET2(slot, 0) != cd->bracount + 1 &&
4830 (options & PCRE_DUPNAMES) == 0)
4832 *errorcodeptr = ERR43;
4835 else dupname = TRUE;
4837 else crc = -1; /* Current name is a substring */
4840 /* Make space in the table and break the loop for an earlier
4841 name. For a duplicate or later name, carry on. We do this for
4842 duplicates so that in the simple case (when (?| is not used) they
4843 are in order of their numbers. */
4847 memmove(slot + cd->name_entry_size, slot,
4848 (cd->names_found - i) * cd->name_entry_size);
4852 /* Continue the loop for a later or duplicate name */
4854 slot += cd->name_entry_size;
4857 /* For non-duplicate names, check for a duplicate number before
4858 adding the new name. */
4862 uschar *cslot = cd->name_table;
4863 for (i = 0; i < cd->names_found; i++)
4867 if (GET2(cslot, 0) == cd->bracount + 1)
4869 *errorcodeptr = ERR65;
4874 cslot += cd->name_entry_size;
4878 PUT2(slot, 0, cd->bracount + 1);
4879 memcpy(slot + 2, name, namelen);
4880 slot[2+namelen] = 0;
4884 /* In both pre-compile and compile, count the number of names we've
4888 ptr++; /* Move past > or ' */
4889 goto NUMBERED_GROUP;
4892 /* ------------------------------------------------------------ */
4893 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
4894 terminator = CHAR_RIGHT_PARENTHESIS;
4898 /* We come here from the Python syntax above that handles both
4899 references (?P=name) and recursion (?P>name), as well as falling
4900 through from the Perl recursion syntax (?&name). We also come here from
4901 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4902 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4904 NAMED_REF_OR_RECURSE:
4906 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4907 namelen = ptr - name;
4909 /* In the pre-compile phase, do a syntax check and set a dummy
4910 reference number. */
4912 if (lengthptr != NULL)
4916 *errorcodeptr = ERR62;
4919 if (*ptr != terminator)
4921 *errorcodeptr = ERR42;
4924 if (namelen > MAX_NAME_SIZE)
4926 *errorcodeptr = ERR48;
4932 /* In the real compile, seek the name in the table. We check the name
4933 first, and then check that we have reached the end of the name in the
4934 table. That way, if the name is longer than any in the table,
4935 the comparison will fail without reading beyond the table entry. */
4939 slot = cd->name_table;
4940 for (i = 0; i < cd->names_found; i++)
4942 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4943 slot[2+namelen] == 0)
4945 slot += cd->name_entry_size;
4948 if (i < cd->names_found) /* Back reference */
4950 recno = GET2(slot, 0);
4952 else if ((recno = /* Forward back reference */
4953 find_parens(cd, name, namelen,
4954 (options & PCRE_EXTENDED) != 0)) <= 0)
4956 *errorcodeptr = ERR15;
4961 /* In both phases, we can now go to the code that handles numerical
4962 recursion or backreferences. */
4964 if (is_recurse) goto HANDLE_RECURSION;
4965 else goto HANDLE_REFERENCE;
4968 /* ------------------------------------------------------------ */
4969 case CHAR_R: /* Recursion */
4970 ptr++; /* Same as (?0) */
4974 /* ------------------------------------------------------------ */
4975 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
4976 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4977 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4979 const uschar *called;
4980 terminator = CHAR_RIGHT_PARENTHESIS;
4982 /* Come here from the \g<...> and \g'...' code (Oniguruma
4983 compatibility). However, the syntax has been checked to ensure that
4984 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4985 be called on this path, nor with the jump to OTHER_CHAR_AFTER_QUERY
4988 HANDLE_NUMERICAL_RECURSION:
4990 if ((refsign = *ptr) == CHAR_PLUS)
4993 if (g_ascii_isdigit(*ptr) == 0)
4995 *errorcodeptr = ERR63;
4999 else if (refsign == CHAR_MINUS)
5001 if (g_ascii_isdigit(ptr[1]) == 0)
5002 goto OTHER_CHAR_AFTER_QUERY;
5007 while(g_ascii_isdigit(*ptr) != 0)
5008 recno = recno * 10 + *ptr++ - CHAR_0;
5010 if (*ptr != terminator)
5012 *errorcodeptr = ERR29;
5016 if (refsign == CHAR_MINUS)
5020 *errorcodeptr = ERR58;
5023 recno = cd->bracount - recno + 1;
5026 *errorcodeptr = ERR15;
5030 else if (refsign == CHAR_PLUS)
5034 *errorcodeptr = ERR58;
5037 recno += cd->bracount;
5040 /* Come here from code above that handles a named recursion */
5045 called = cd->start_code;
5047 /* When we are actually compiling, find the bracket that is being
5048 referenced. Temporarily end the regex in case it doesn't exist before
5049 this point. If we end up with a forward reference, first check that
5050 the bracket does occur later so we can give the error (and position)
5051 now. Then remember this forward reference in the workspace so it can
5052 be filled in at the end. */
5054 if (lengthptr == NULL)
5058 called = _pcre_find_bracket(cd->start_code, utf8, recno);
5060 /* Forward reference */
5064 if (find_parens(cd, NULL, recno,
5065 (options & PCRE_EXTENDED) != 0) < 0)
5067 *errorcodeptr = ERR15;
5071 /* Fudge the value of "called" so that when it is inserted as an
5072 offset below, what is actually inserted is the reference number
5075 called = cd->start_code + recno;
5076 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5079 /* If not a forward reference, and the subpattern is still open,
5080 this is a recursive call. We check to see if this is a left
5081 recursion that could loop for ever, and diagnose that case. */
5083 else if (GET(called, 1) == 0 &&
5084 could_be_empty(called, code, bcptr, utf8, cd))
5086 *errorcodeptr = ERR40;
5091 /* Insert the recursion/subroutine item, automatically wrapped inside
5092 "once" brackets. Set up a "previous group" length so that a
5093 subsequent quantifier will work. */
5096 PUT(code, 1, 2 + 2*LINK_SIZE);
5097 code += 1 + LINK_SIZE;
5100 PUT(code, 1, called - cd->start_code);
5101 code += 1 + LINK_SIZE;
5104 PUT(code, 1, 2 + 2*LINK_SIZE);
5105 code += 1 + LINK_SIZE;
5107 length_prevgroup = 3 + 3*LINK_SIZE;
5110 /* Can't determine a first byte now */
5112 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5116 /* ------------------------------------------------------------ */
5117 default: /* Other characters: check option setting */
5118 OTHER_CHAR_AFTER_QUERY:
5122 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5126 case CHAR_MINUS: optset = &unset; break;
5128 case CHAR_J: /* Record that it changed in the external options */
5129 *optset |= PCRE_DUPNAMES;
5130 cd->external_flags |= PCRE_JCHANGED;
5133 case CHAR_i: *optset |= PCRE_CASELESS; break;
5134 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5135 case CHAR_s: *optset |= PCRE_DOTALL; break;
5136 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5137 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5138 case CHAR_X: *optset |= PCRE_EXTRA; break;
5140 default: *errorcodeptr = ERR12;
5141 ptr--; /* Correct the offset */
5146 /* Set up the changed option bits, but don't change anything yet. */
5148 newoptions = (options | set) & (~unset);
5150 /* If the options ended with ')' this is not the start of a nested
5151 group with option changes, so the options change at this level. If this
5152 item is right at the start of the pattern, the options can be
5153 abstracted and made external in the pre-compile phase, and ignored in
5154 the compile phase. This can be helpful when matching -- for instance in
5155 caseless checking of required bytes.
5157 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5158 definitely *not* at the start of the pattern because something has been
5159 compiled. In the pre-compile phase, however, the code pointer can have
5160 that value after the start, because it gets reset as code is discarded
5161 during the pre-compile. However, this can happen only at top level - if
5162 we are within parentheses, the starting BRA will still be present. At
5163 any parenthesis level, the length value can be used to test if anything
5164 has been compiled at that level. Thus, a test for both these conditions
5165 is necessary to ensure we correctly detect the start of the pattern in
5168 If we are not at the pattern start, compile code to change the ims
5169 options if this setting actually changes any of them, and reset the
5170 greedy defaults and the case value for firstbyte and reqbyte. */
5172 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5174 if (code == cd->start_code + 1 + LINK_SIZE &&
5175 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5177 cd->external_options = newoptions;
5181 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5184 *code++ = newoptions & PCRE_IMS;
5186 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5187 greedy_non_default = greedy_default ^ 1;
5188 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5191 /* Change options at this level, and pass them back for use
5192 in subsequent branches. When not at the start of the pattern, this
5193 information is also necessary so that a resetting item can be
5194 compiled at the end of a group (if we are in a group). */
5196 *optionsptr = options = newoptions;
5197 previous = NULL; /* This item can't be repeated */
5198 continue; /* It is complete */
5201 /* If the options ended with ':' we are heading into a nested group
5202 with possible change of options. Such groups are non-capturing and are
5203 not assertions of any kind. All we need to do is skip over the ':';
5204 the newoptions value is handled below. */
5208 } /* End of switch for character following (? */
5209 } /* End of (? handling */
5211 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5212 all unadorned brackets become non-capturing and behave like (?:...)
5215 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5220 /* Else we have a capturing group. */
5226 PUT2(code, 1+LINK_SIZE, cd->bracount);
5230 /* Process nested bracketed regex. Assertions may not be repeated, but
5231 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5232 non-register variable in order to be able to pass its address because some
5233 compilers complain otherwise. Pass in a new setting for the ims options if
5234 they have changed. */
5236 previous = (bravalue >= OP_ONCE)? code : NULL;
5239 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5240 length_prevgroup = 0; /* Initialize for pre-compile phase */
5243 newoptions, /* The complete new option state */
5244 options & PCRE_IMS, /* The previous ims option state */
5245 &tempcode, /* Where to put code (updated) */
5246 &ptr, /* Input pointer (updated) */
5247 errorcodeptr, /* Where to put an error message */
5248 (bravalue == OP_ASSERTBACK ||
5249 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5250 reset_bracount, /* True if (?| group */
5251 skipbytes, /* Skip over bracket number */
5252 &subfirstbyte, /* For possible first char */
5253 &subreqbyte, /* For possible last char */
5254 bcptr, /* Current branch chain */
5255 cd, /* Tables block */
5256 (lengthptr == NULL)? NULL : /* Actual compile phase */
5257 &length_prevgroup /* Pre-compile phase */
5261 /* At the end of compiling, code is still pointing to the start of the
5262 group, while tempcode has been updated to point past the end of the group
5263 and any option resetting that may follow it. The pattern pointer (ptr)
5264 is on the bracket. */
5266 /* If this is a conditional bracket, check that there are no more than
5267 two branches in the group, or just one if it's a DEFINE group. We do this
5268 in the real compile phase, not in the pre-pass, where the whole group may
5269 not be available. */
5271 if (bravalue == OP_COND && lengthptr == NULL)
5280 while (*tc != OP_KET);
5282 /* A DEFINE group is never obeyed inline (the "condition" is always
5283 false). It must have only one branch. */
5285 if (code[LINK_SIZE+1] == OP_DEF)
5289 *errorcodeptr = ERR54;
5292 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5295 /* A "normal" conditional group. If there is just one branch, we must not
5296 make use of its firstbyte or reqbyte, because this is equivalent to an
5297 empty second branch. */
5303 *errorcodeptr = ERR27;
5306 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5310 /* Error if hit end of pattern */
5312 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5314 *errorcodeptr = ERR14;
5318 /* In the pre-compile phase, update the length by the length of the group,
5319 less the brackets at either end. Then reduce the compiled code to just a
5320 set of non-capturing brackets so that it doesn't use much memory if it is
5321 duplicated by a quantifier.*/
5323 if (lengthptr != NULL)
5325 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5327 *errorcodeptr = ERR20;
5330 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5332 PUTINC(code, 0, 1 + LINK_SIZE);
5334 PUTINC(code, 0, 1 + LINK_SIZE);
5335 break; /* No need to waste time with special character handling */
5338 /* Otherwise update the main code pointer to the end of the group. */
5342 /* For a DEFINE group, required and first character settings are not
5345 if (bravalue == OP_DEF) break;
5347 /* Handle updating of the required and first characters for other types of
5348 group. Update for normal brackets of all kinds, and conditions with two
5349 branches (see code above). If the bracket is followed by a quantifier with
5350 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5351 zerofirstbyte outside the main loop so that they can be accessed for the
5354 zeroreqbyte = reqbyte;
5355 zerofirstbyte = firstbyte;
5356 groupsetfirstbyte = FALSE;
5358 if (bravalue >= OP_ONCE)
5360 /* If we have not yet set a firstbyte in this branch, take it from the
5361 subpattern, remembering that it was set here so that a repeat of more
5362 than one can replicate it as reqbyte if necessary. If the subpattern has
5363 no firstbyte, set "none" for the whole branch. In both cases, a zero
5364 repeat forces firstbyte to "none". */
5366 if (firstbyte == REQ_UNSET)
5368 if (subfirstbyte >= 0)
5370 firstbyte = subfirstbyte;
5371 groupsetfirstbyte = TRUE;
5373 else firstbyte = REQ_NONE;
5374 zerofirstbyte = REQ_NONE;
5377 /* If firstbyte was previously set, convert the subpattern's firstbyte
5378 into reqbyte if there wasn't one, using the vary flag that was in
5379 existence beforehand. */
5381 else if (subfirstbyte >= 0 && subreqbyte < 0)
5382 subreqbyte = subfirstbyte | tempreqvary;
5384 /* If the subpattern set a required byte (or set a first byte that isn't
5385 really the first byte - see above), set it. */
5387 if (subreqbyte >= 0) reqbyte = subreqbyte;
5390 /* For a forward assertion, we take the reqbyte, if set. This can be
5391 helpful if the pattern that follows the assertion doesn't set a different
5392 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5393 for an assertion, however, because it leads to an incorrect effect for patterns
5394 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5395 of a firstbyte. This is overcome by a scan at the end if there's no
5396 firstbyte, looking for an asserted first char. */
5398 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5399 break; /* End of processing '(' */
5402 /* ===================================================================*/
5403 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5404 are arranged to be the negation of the corresponding OP_values. For the
5405 back references, the values are ESC_REF plus the reference number. Only
5406 back references and those types that consume a character may be repeated.
5407 We can test for values between ESC_b and ESC_Z for the latter; this may
5408 have to change if any new ones are ever created. */
5410 case CHAR_BACKSLASH:
5412 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5413 if (*errorcodeptr != 0) goto FAILED;
5417 if (-c == ESC_Q) /* Handle start of quoted string */
5419 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5420 ptr += 2; /* avoid empty string */
5425 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5427 /* For metasequences that actually match a character, we disable the
5428 setting of a first character if it hasn't already been set. */
5430 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5431 firstbyte = REQ_NONE;
5433 /* Set values to reset to if this is followed by a zero repeat. */
5435 zerofirstbyte = firstbyte;
5436 zeroreqbyte = reqbyte;
5438 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5439 is a subroutine call by number (Oniguruma syntax). In fact, the value
5440 -ESC_g is returned only for these cases. So we don't need to check for <
5441 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5442 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5443 that is a synonym for a named back reference). */
5448 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5449 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5450 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5452 /* These two statements stop the compiler from warning about possibly
5453 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5454 fact, because we actually check for a number below, the paths that
5455 would actually be in error are never taken. */
5458 reset_bracount = FALSE;
5460 /* Test for a name */
5462 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5464 BOOL isnumber = TRUE;
5465 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5467 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5468 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5470 if (*p != terminator)
5472 *errorcodeptr = ERR57;
5478 goto HANDLE_NUMERICAL_RECURSION;
5481 goto NAMED_REF_OR_RECURSE;
5484 /* Test a signed number in angle brackets or quotes. */
5487 while (g_ascii_isdigit(*p) != 0) p++;
5488 if (*p != terminator)
5490 *errorcodeptr = ERR57;
5494 goto HANDLE_NUMERICAL_RECURSION;
5497 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5498 We also support \k{name} (.NET syntax) */
5500 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5501 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5504 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5505 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5506 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5507 goto NAMED_REF_OR_RECURSE;
5510 /* Back references are handled specially; must disable firstbyte if
5511 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5517 recno = -c - ESC_REF;
5519 HANDLE_REFERENCE: /* Come here from named backref handling */
5520 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5523 PUT2INC(code, 0, recno);
5524 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5525 if (recno > cd->top_backref) cd->top_backref = recno;
5527 /* Check to see if this back reference is recursive, that is, it
5528 is inside the group that it references. A flag is set so that the
5529 group can be made atomic. */
5531 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5533 if (oc->number == recno)
5541 /* So are Unicode property matches, if supported. */
5544 else if (-c == ESC_P || -c == ESC_p)
5548 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5549 if (ptype < 0) goto FAILED;
5551 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5557 /* If Unicode properties are not supported, \X, \P, and \p are not
5560 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5562 *errorcodeptr = ERR45;
5567 /* For the rest (including \X when Unicode properties are supported), we
5568 can obtain the OP value by negating the escape value. */
5572 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5578 /* We have a data character whose value is in c. In UTF-8 mode it may have
5579 a value > 127. We set its representation in the length/buffer, and then
5580 handle it as a data character. */
5583 if (utf8 && c > 127)
5584 mclength = _pcre_ord2utf8(c, mcbuffer);
5595 /* ===================================================================*/
5596 /* Handle a literal character. It is guaranteed not to be whitespace or #
5597 when the extended flag is set. If we are in UTF-8 mode, it may be a
5598 multi-byte literal character. */
5606 if (utf8 && c >= 0xc0)
5608 while ((ptr[1] & 0xc0) == 0x80)
5609 mcbuffer[mclength++] = *(++ptr);
5613 /* At this point we have the character's bytes in mcbuffer, and the length
5614 in mclength. When not in UTF-8 mode, the length is always 1. */
5618 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5619 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5621 /* Remember if \r or \n were seen */
5623 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5624 cd->external_flags |= PCRE_HASCRORLF;
5626 /* Set the first and required bytes appropriately. If no previous first
5627 byte, set it from this character, but revert to none on a zero repeat.
5628 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5631 if (firstbyte == REQ_UNSET)
5633 zerofirstbyte = REQ_NONE;
5634 zeroreqbyte = reqbyte;
5636 /* If the character is more than one byte long, we can set firstbyte
5637 only if it is not to be matched caselessly. */
5639 if (mclength == 1 || req_caseopt == 0)
5641 firstbyte = mcbuffer[0] | req_caseopt;
5642 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5644 else firstbyte = reqbyte = REQ_NONE;
5647 /* firstbyte was previously set; we can set reqbyte only if the length is
5648 1 or the matching is caseful. */
5652 zerofirstbyte = firstbyte;
5653 zeroreqbyte = reqbyte;
5654 if (mclength == 1 || req_caseopt == 0)
5655 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5658 break; /* End of literal character handling */
5660 } /* end of big loop */
5663 /* Control never reaches here by falling through, only by a goto for all the
5664 error states. Pass back the position in the pattern so that it can be displayed
5665 to the user for diagnosing the error. */
5675 /*************************************************
5676 * Compile sequence of alternatives *
5677 *************************************************/
5679 /* On entry, ptr is pointing past the bracket character, but on return it
5680 points to the closing bracket, or vertical bar, or end of string. The code
5681 variable is pointing at the byte into which the BRA operator has been stored.
5682 If the ims options are changed at the start (for a (?ims: group) or during any
5683 branch, we need to insert an OP_OPT item at the start of every following branch
5684 to ensure they get set correctly at run time, and also pass the new options
5685 into every subsequent branch compile.
5687 This function is used during the pre-compile phase when we are trying to find
5688 out the amount of memory needed, as well as during the real compile phase. The
5689 value of lengthptr distinguishes the two phases.
5692 options option bits, including any changes for this subpattern
5693 oldims previous settings of ims option bits
5694 codeptr -> the address of the current code pointer
5695 ptrptr -> the address of the current pattern pointer
5696 errorcodeptr -> pointer to error code variable
5697 lookbehind TRUE if this is a lookbehind assertion
5698 reset_bracount TRUE to reset the count for each branch
5699 skipbytes skip this many bytes at start (for brackets and OP_COND)
5700 firstbyteptr place to put the first required character, or a negative number
5701 reqbyteptr place to put the last required character, or a negative number
5702 bcptr pointer to the chain of currently open branches
5703 cd points to the data block with tables pointers etc.
5704 lengthptr NULL during the real compile phase
5705 points to length accumulator during pre-compile phase
5707 Returns: TRUE on success
/* compile_regex(): compile every alternative of one bracketed group by calling
compile_branch() once per branch, linking the branches together and passing
first/required byte information back through firstbyteptr and reqbyteptr. A
NULL lengthptr selects the real compile phase; otherwise the pre-compile
phase only accumulates a length.
NOTE(review): the final parameter (lengthptr, used below) and the opening
brace do not appear in this listing - confirm against the full source. */
5711 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5712 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5713 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5716 const uschar *ptr = *ptrptr; /* Local copy of the pattern pointer */
5717 uschar *code = *codeptr; /* Local copy of the output code pointer */
5718 uschar *last_branch = code; /* Start of the most recent branch (BRA or ALT) */
5719 uschar *start_bracket = code; /* Start of the whole group, used for the KET offset */
5720 uschar *reverse_count = NULL; /* Where the OP_REVERSE length goes for a lookbehind */
5721 open_capitem capitem; /* Entry linked into cd->open_caps for a capturing group */
5723 int firstbyte, reqbyte; /* Values accumulated for the whole group */
5724 int branchfirstbyte, branchreqbyte; /* Values set by compile_branch() for one branch */
5728 int old_external_options = cd->external_options; /* To detect a leading option setting */
5732 bc.current_branch = code;
5734 firstbyte = reqbyte = REQ_UNSET;
5736 /* Accumulate the length for use in the pre-compile phase. Start with the
5737 length of the BRA and KET and any extra bytes that are required at the
5738 beginning. We accumulate in a local variable to save frequent testing of
5739 lengthptr for NULL. We cannot do this by looking at the value of code at the
5740 start and end of each alternative, because compiled items are discarded during
5741 the pre-compile phase so that the work space is not exceeded. */
5743 length = 2 + 2*LINK_SIZE + skipbytes;
5745 /* WARNING: If the above line is changed for any reason, you must also change
5746 the code that abstracts option settings at the start of the pattern and makes
5747 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5748 pre-compile phase to find out whether anything has yet been compiled or not. */
5750 /* If this is a capturing subpattern, add to the chain of open capturing items
5751 so that we can detect them if (*ACCEPT) is encountered. This is also used to
5752 detect groups that contain recursive back references to themselves. */
5754 if (*code == OP_CBRA)
5756 capnumber = GET2(code, 1 + LINK_SIZE);
5757 capitem.number = capnumber;
5758 capitem.next = cd->open_caps;
5759 capitem.flag = FALSE;
5760 cd->open_caps = &capitem;
5763 /* The offset is set to zero to mark that this bracket is still open */
5766 code += 1 + LINK_SIZE + skipbytes;
5768 /* Loop for each alternative branch */
5770 orig_bracount = max_bracount = cd->bracount;
5773 /* For a (?| group, reset the capturing bracket count so that each branch
5774 uses the same numbers. */
5776 if (reset_bracount) cd->bracount = orig_bracount;
5778 /* Handle a change of ims options at the start of the branch */
5780 if ((options & PCRE_IMS) != oldims)
5783 *code++ = options & PCRE_IMS;
5787 /* Set up dummy OP_REVERSE if lookbehind assertion */
5791 *code++ = OP_REVERSE;
5792 reverse_count = code;
5794 length += 1 + LINK_SIZE;
5797 /* Now compile the branch; in the pre-compile phase its length gets added
5800 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5801 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5807 /* If the external options have changed during this branch, it means that we
5808 are at the top level, and a leading option setting has been encountered. We
5809 need to re-set the original option values to take account of this so that,
5810 during the pre-compile phase, we know to allow for a re-set at the start of
5811 subsequent branches. */
5813 if (old_external_options != cd->external_options)
5814 oldims = cd->external_options & PCRE_IMS;
5816 /* Keep the highest bracket count in case (?| was used and some branch
5817 has fewer than the rest. */
5819 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5821 /* In the real compile phase, there is some post-processing to be done. */
5823 if (lengthptr == NULL)
5825 /* If this is the first branch, the firstbyte and reqbyte values for the
5826 branch become the values for the regex. */
5828 if (*last_branch != OP_ALT)
5830 firstbyte = branchfirstbyte;
5831 reqbyte = branchreqbyte;
5834 /* If this is not the first branch, the first char and reqbyte have to
5835 match the values from all the previous branches, except that if the
5836 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5837 and we set REQ_VARY for the regex. */
5841 /* If we previously had a firstbyte, but it doesn't match the new branch,
5842 we have to abandon the firstbyte for the regex, but if there was
5843 previously no reqbyte, it takes on the value of the old firstbyte. */
5845 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5847 if (reqbyte < 0) reqbyte = firstbyte;
5848 firstbyte = REQ_NONE;
5851 /* If we (now or from before) have no firstbyte, a firstbyte from the
5852 branch becomes a reqbyte if there isn't a branch reqbyte. */
5854 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5855 branchreqbyte = branchfirstbyte;
5857 /* Now ensure that the reqbytes match (ignoring the REQ_VARY bit) */
5859 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5861 else reqbyte |= branchreqbyte; /* Same byte: "or" in REQ_VARY from the branch */
5864 /* If lookbehind, check that this branch matches a fixed-length string, and
5865 put the length into the OP_REVERSE item. Temporarily mark the end of the
5866 branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5867 because there may be forward references that we can't check here. Set a
5868 flag to cause another lookbehind check at the end. Why not do it all at the
5869 end? Because common, erroneous checks are picked up here and the offset of
5870 the problem can be shown. */
5876 fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5877 DPRINTF(("fixed length = %d\n", fixed_length));
5878 if (fixed_length == -3) /* Branch contains OP_RECURSE: defer the check */
5880 cd->check_lookbehind = TRUE;
5882 else if (fixed_length < 0)
5884 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5888 else { PUT(reverse_count, 0, fixed_length); }
5892 /* Reached end of expression, either ')' or end of pattern. In the real
5893 compile phase, go back through the alternative branches and reverse the chain
5894 of offsets, with the field in the BRA item now becoming an offset to the
5895 first alternative. If there are no alternatives, it points to the end of the
5896 group. The length in the terminating ket is always the length of the whole
5897 bracketed item. If any of the ims options were changed inside the group,
5898 compile a resetting op-code following, except at the very end of the pattern.
5899 Return leaving the pointer at the terminating char. */
5901 if (*ptr != CHAR_VERTICAL_LINE)
5903 if (lengthptr == NULL)
5905 int branch_length = code - last_branch;
5908 int prev_length = GET(last_branch, 1); /* Backward link stored while open */
5909 PUT(last_branch, 1, branch_length); /* Replace it with a forward offset */
5910 branch_length = prev_length;
5911 last_branch -= branch_length; /* Step back to the previous branch */
5913 while (branch_length > 0);
5916 /* Fill in the ket */
5919 PUT(code, 1, code - start_bracket);
5920 code += 1 + LINK_SIZE;
5922 /* If it was a capturing subpattern, check to see if it contained any
5923 recursive back references. If so, we must wrap it in atomic brackets.
5924 In any event, remove the block from the chain. */
5928 if (cd->open_caps->flag)
5930 memmove(start_bracket + 1 + LINK_SIZE, start_bracket, /* Make room for OP_ONCE + link */
5931 code - start_bracket);
5932 *start_bracket = OP_ONCE;
5933 code += 1 + LINK_SIZE;
5934 PUT(start_bracket, 1, code - start_bracket);
5936 PUT(code, 1, code - start_bracket);
5937 code += 1 + LINK_SIZE;
5938 length += 2 + 2*LINK_SIZE;
5940 cd->open_caps = cd->open_caps->next;
5943 /* Reset options if needed. */
5945 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
5952 /* Retain the highest bracket number, in case resetting was used. */
5954 cd->bracount = max_bracount;
5956 /* Set values to pass back */
5960 *firstbyteptr = firstbyte;
5961 *reqbyteptr = reqbyte;
5962 if (lengthptr != NULL)
5964 if (OFLOW_MAX - *lengthptr < length) /* Adding length would exceed OFLOW_MAX */
5966 *errorcodeptr = ERR20;
5969 *lengthptr += length;
5974 /* Another branch follows. In the pre-compile phase, we can move the code
5975 pointer back to where it was for the start of the first branch. (That is,
5976 pretend that each branch is the only one.)
5978 In the real compile phase, insert an ALT node. Its length field points back
5979 to the previous branch while the bracket remains open. At the end the chain
5980 is reversed. It's done like this so that the start of the bracket has a
5981 zero offset until it is closed, making it possible to detect recursion. */
5983 if (lengthptr != NULL)
5985 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5986 length += 1 + LINK_SIZE;
5991 PUT(code, 1, code - last_branch);
5992 bc.current_branch = last_branch = code;
5993 code += 1 + LINK_SIZE;
5998 /* Control never reaches here */
6004 /*************************************************
6005 * Check for anchored expression *
6006 *************************************************/
6008 /* Try to find out if this is an anchored regular expression. Consider each
6009 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6010 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6011 it's anchored. However, if this is a multiline pattern, then only OP_SOD
6012 counts, since OP_CIRC can match in the middle.
6014 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6015 This is the code for \G, which means "match at start of match position, taking
6016 into account the match offset".
6018 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6019 because that will try the rest of the pattern at all possible matching points,
6020 so there is no point trying again.... er ....
6022 .... except when the .* appears inside capturing parentheses, and there is a
6023 subsequent back reference to those parentheses. We haven't enough information
6024 to catch that case precisely.
6026 At first, the best we could do was to detect when .* was in capturing brackets
6027 and the highest back reference was greater than or equal to that level.
6028 However, by keeping a bitmap of the first 31 back references, we can catch some
6029 of the more common cases more precisely.
6032 code points to start of expression (the bracket)
6033 options points to the options setting
6034 bracket_map a bitmap of which brackets we are inside while testing; this
6035 handles up to substring 31; after that we just have to take
6036 the less precise approach
6037 backref_map the back reference bitmap
6039 Returns: TRUE or FALSE
/* is_anchored(): examine the first opcode of every alternative of the group
at "code", recursing into brackets found there, and return FALSE as soon as
one alternative does not start with an anchoring item. */
6043 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6044 unsigned int backref_map)
6047 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6048 options, PCRE_MULTILINE, FALSE);
6049 register int op = *scode; /* Opcode to examine for this alternative */
6051 /* Non-capturing brackets */
6055 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6058 /* Capturing brackets: add this group's bit to the map before recursing */
6060 else if (op == OP_CBRA)
6062 int n = GET2(scode, 1+LINK_SIZE); /* Capturing group number */
6063 int new_map = bracket_map | ((n < 32)? (1 << n) : 1); /* Groups >= 32 share bit 0. NOTE(review): 1 << 31 shifts into the sign bit of int; consider 1u << n */
6064 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6067 /* Other brackets */
6069 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6071 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6074 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6075 it isn't in brackets that are or may be referenced. */
6077 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6078 op == OP_TYPEPOSSTAR))
6080 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0) /* Overlap: an enclosing group may be back-referenced */
6084 /* Check for explicit anchoring: OP_SOD and OP_SOM always count, OP_CIRC
only when PCRE_MULTILINE is not set */
6086 else if (op != OP_SOD && op != OP_SOM &&
6087 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6089 code += GET(code, 1); /* Follow the link to the next alternative */
6091 while (*code == OP_ALT); /* Loop for each alternative */
6097 /*************************************************
6098 * Check for starting with ^ or .* *
6099 *************************************************/
6101 /* This is called to find out if every branch starts with ^ or .* so that
6102 "first char" processing can be done to speed things up in multiline
6103 matching and for non-DOTALL patterns that start with .* (which must start at
6104 the beginning or after \n). As in the case of is_anchored() (see above), we
6105 have to take account of back references to capturing brackets that contain .*
6106 because in that case we can't make the assumption.
6109 code points to start of expression (the bracket)
6110 bracket_map a bitmap of which brackets we are inside while testing; this
6111 handles up to substring 31; after that we just have to take
6112 the less precise approach
6113 backref_map the back reference bitmap
6115 Returns: TRUE or FALSE
/* is_startline(): examine the first opcode of every alternative of the group
at "code", recursing into brackets found there, and return FALSE as soon as
one alternative does not start with OP_CIRC or an acceptable .* item. */
6119 is_startline(const uschar *code, unsigned int bracket_map,
6120 unsigned int backref_map)
6123 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6125 register int op = *scode; /* Opcode to examine for this alternative */
6127 /* If we are at the start of a conditional assertion group, *both* the
6128 conditional assertion *and* what follows the condition must satisfy the test
6129 for start of line. Other kinds of condition fail. Note that there may be an
6130 auto-callout at the start of a condition. */
6134 scode += 1 + LINK_SIZE;
6135 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT]; /* Skip the callout item */
6145 default: /* Assertion */
6146 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6147 do scode += GET(scode, 1); while (*scode == OP_ALT); /* Skip every branch of the assertion */
6148 scode += 1 + LINK_SIZE; /* Then step over the item that follows its last branch */
6151 scode = first_significant_code(scode, NULL, 0, FALSE);
6155 /* Non-capturing brackets */
6159 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6162 /* Capturing brackets: add this group's bit to the map before recursing */
6164 else if (op == OP_CBRA)
6166 int n = GET2(scode, 1+LINK_SIZE); /* Capturing group number */
6167 int new_map = bracket_map | ((n < 32)? (1 << n) : 1); /* Groups >= 32 share bit 0. NOTE(review): 1 << 31 shifts into the sign bit of int; consider 1u << n */
6168 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6171 /* Other brackets */
6173 else if (op == OP_ASSERT || op == OP_ONCE)
6175 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6178 /* .* means "start at start or after \n" if it isn't in brackets that
6179 may be referenced. Here the repeated item must be OP_ANY. */
6181 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6183 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6186 /* Check for explicit circumflex */
6188 else if (op != OP_CIRC) return FALSE;
6190 /* Move on to the next alternative */
6192 code += GET(code, 1);
6194 while (*code == OP_ALT); /* Loop for each alternative */
6200 /*************************************************
6201 * Check for asserted fixed first char *
6202 *************************************************/
6204 /* During compilation, the "first char" settings from forward assertions are
6205 discarded, because they can cause conflicts with actual literals that follow.
6206 However, if we end up without a first char setting for an unanchored pattern,
6207 it is worth scanning the regex to see if there is an initial asserted first
6208 char. If all branches start with the same asserted char, or with a bracket all
6209 of whose alternatives start with the same asserted char (recurse ad lib), then
6210 we return that char, otherwise -1.
6213 code points to start of expression (the bracket)
6214 options pointer to the options (used to check casing changes)
6215 inassert TRUE if in an assertion
6217 Returns: -1 or the fixed first char
/* Walks the alternatives of the bracket at code, keeping in c the one first
character recorded so far (c stays -1 until something has been recorded). The
scan gives up with -1 as soon as an alternative disagrees with c. */
6221 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6223 register int c = -1;
/* Look at the current alternative starting 1 + LINK_SIZE positions past code;
first_significant_code() picks the item that is actually examined. */
6226 const uschar *scode =
6227 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6228 register int op = *scode;
/* Recurse on the item at scode. The nested scan is told it is inside an
assertion only when this opcode is OP_ASSERT. */
6240 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
/* Adopt the first result seen; every later one has to be identical. */
6242 if (c < 0) c = d; else if (c != d) return -1;
6245 case OP_EXACT: /* Fall through */
/* Outside an assertion this path produces no result. */
6253 if (!inassert) return -1;
/* When the PCRE_CASELESS bit is set in *options, tag the recorded value with
REQ_CASELESS; otherwise the item's character (scode[1]) must equal c. */
6257 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6259 else if (c != scode[1]) return -1;
/* Step to the next alternative and repeat for as long as an OP_ALT follows. */
6263 code += GET(code, 1);
6265 while (*code == OP_ALT);
6271 /*************************************************
6272 * Compile a Regular Expression *
6273 *************************************************/
6275 /* This function takes a string and returns a pointer to a block of store
6276 holding a compiled version of the expression. The original API for this
6277 function had no error code return variable; it is retained for backwards
6278 compatibility. The new function is given a new name.
6281 pattern the regular expression
6282 options various option bits
6283 errorcodeptr pointer to error code variable (pcre_compile2() only)
6284 can be NULL if you don't want a code value
6285 errorptr pointer to pointer to error text
6286 erroroffset ptr offset in pattern where error was detected
6287 tables pointer to character tables or NULL
6289 Returns: pointer to compiled data block, or NULL on error,
6290 with errorptr and erroroffset set
6293 #ifdef NOT_USED_IN_GLIB
/* Original entry point, built only when NOT_USED_IN_GLIB is defined. It has
no error code argument, so it forwards to pcre_compile2() with NULL in the
errorcodeptr position and passes every other argument through unchanged. */
6295 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6296 pcre_compile(const char *pattern, int options, const char **errorptr,
6297 int *erroroffset, const unsigned char *tables)
6299 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
/* Compile pattern into a block obtained from pcre_malloc.

Outline of the body:
  1. Check the arguments and option bits, and consume any leading (*...)
     settings, recording in skipatstart how much of the pattern they occupy.
  2. Run compile_regex() once with the cworkspace buffer as the code area, to
     find the length of compiled code that is needed.
  3. Allocate room for the real_pcre header, the name table and the code, then
     run compile_regex() again, this time writing into that block.
  4. Fill in forward references, check back references, and set the lengths of
     lookbehinds that were flagged via cd->check_lookbehind.
  5. Work out the PCRE_ANCHORED, PCRE_FIRSTSET, PCRE_STARTLINE and
     PCRE_REQCHSET settings.

Failures go through the PCRE_EARLY_ERROR_RETURN labels, which store the message
in *errorptr, the numeric code in *errorcodeptr when that pointer is non-NULL,
and (first label only) the pattern offset in *erroroffset. */
6304 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6305 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6306 const char **errorptr, int *erroroffset, const unsigned char *tables)
6309 int length = 1; /* For final END opcode */
6310 int firstbyte, reqbyte, newline;
6312 int skipatstart = 0;
6313 BOOL utf8 = (options & PCRE_UTF8) != 0;
6316 const uschar *codestart;
6318 compile_data compile_block;
6319 compile_data *cd = &compile_block;
6321 /* This space is used for "compiling" into during the first phase, when we are
6322 computing the amount of memory that is needed. Compiled items are thrown away
6323 as soon as possible, so that a fairly large buffer should be sufficient for
6324 this purpose. The same space is used in the second phase for remembering where
6325 to fill in forward references to subpatterns. */
6327 uschar cworkspace[COMPILE_WORK_SIZE];
6329 /* Set this early so that early errors get offset 0. */
6331 ptr = (const uschar *)pattern;
6333 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6334 can do is just return NULL, but we can set a code value if there is a code
6337 if (errorptr == NULL)
6339 if (errorcodeptr != NULL) *errorcodeptr = 99;
6344 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6346 /* However, we can give a message for this error */
6348 if (erroroffset == NULL)
6351 goto PCRE_EARLY_ERROR_RETURN2;
6356 /* Set up pointers to the individual character tables */
6358 if (tables == NULL) tables = _pcre_default_tables;
6359 cd->lcc = tables + lcc_offset;
6360 cd->fcc = tables + fcc_offset;
6361 cd->cbits = tables + cbits_offset;
6362 cd->ctypes = tables + ctypes_offset;
6364 /* Check that all undefined public option bits are zero */
6366 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6369 goto PCRE_EARLY_ERROR_RETURN;
6372 /* Check for global one-time settings at the start of the pattern, and remember
6373 the offset for later. */
6375 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6376 ptr[skipatstart+1] == CHAR_ASTERISK)
6381 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6382 { skipatstart += 7; options |= PCRE_UTF8; continue; }
6384 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6385 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6386 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
6387 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6388 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
6389 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6390 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6391 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6392 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6393 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6395 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6396 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6397 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6398 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
/* Fold the setting just recognised into options: a newline choice replaces
the PCRE_NEWLINE_BITS field, a BSR choice replaces the two PCRE_BSR_ bits. */
6401 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6402 else if (newbsr != 0)
6403 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6407 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6410 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6411 (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6414 goto PCRE_EARLY_ERROR_RETURN2;
6420 goto PCRE_EARLY_ERROR_RETURN;
6424 /* Check validity of \R options. */
6426 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6429 case PCRE_BSR_ANYCRLF:
6430 case PCRE_BSR_UNICODE:
6432 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6435 /* Handle different types of newline. The three bits give seven cases. The
6436 current code allows for fixed one- or two-byte sequences, plus "any" and
6439 switch (options & PCRE_NEWLINE_BITS)
6441 case 0: newline = NEWLINE; break; /* Build-time default */
6442 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6443 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6444 case PCRE_NEWLINE_CR+
6445 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6446 case PCRE_NEWLINE_ANY: newline = -1; break;
6447 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6448 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6453 cd->nltype = NLTYPE_ANYCRLF;
6455 else if (newline < 0)
6457 cd->nltype = NLTYPE_ANY;
6461 cd->nltype = NLTYPE_FIXED;
6465 cd->nl[0] = (newline >> 8) & 255;
6466 cd->nl[1] = newline & 255;
6471 cd->nl[0] = newline;
6475 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6476 references to help in deciding whether (.*) can be treated as anchored or not.
6479 cd->top_backref = 0;
6480 cd->backref_map = 0;
6482 /* Reflect pattern for debugging output */
6484 DPRINTF(("------------------------------------------------------------------\n"));
6485 DPRINTF(("%s\n", pattern));
6487 /* Pretend to compile the pattern while actually just accumulating the length
6488 of memory required. This behaviour is triggered by passing a non-NULL final
6489 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6490 to compile parts of the pattern into; the compiled code is discarded when it is
6491 no longer needed, so hopefully this workspace will never overflow, though there
6492 is a test for its doing so. */
6494 cd->bracount = cd->final_bracount = 0;
6495 cd->names_found = 0;
6496 cd->name_entry_size = 0;
6497 cd->name_table = NULL;
6498 cd->start_workspace = cworkspace;
6499 cd->start_code = cworkspace;
6500 cd->hwm = cworkspace;
6501 cd->start_pattern = (const uschar *)pattern;
6502 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6503 cd->req_varyopt = 0;
6504 cd->external_options = options;
6505 cd->external_flags = 0;
6506 cd->open_caps = NULL;
6508 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6509 don't need to look at the result of the function here. The initial options have
6510 been put into the cd block so that they can be changed if an option setting is
6511 found within the regex right at the beginning. Bringing initial option settings
6512 outside can help speed up starting point checks. */
6517 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6518 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6520 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6522 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6523 cd->hwm - cworkspace));
6525 if (length > MAX_PATTERN_SIZE)
6528 goto PCRE_EARLY_ERROR_RETURN;
6531 /* Compute the size of data block needed and get it, either from malloc or
6532 externally provided function. Integer overflow should no longer be possible
6533 because nowadays we limit the maximum value of cd->names_found and
6534 cd->name_entry_size. */
6536 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6537 re = (real_pcre *)(pcre_malloc)(size);
6542 goto PCRE_EARLY_ERROR_RETURN;
6545 /* Put in the magic number, and save the sizes, initial options, internal
6546 flags, and character table pointer. NULL is used for the default character
6547 tables. The nullpad field is at the end; it's there to help in the case when a
6548 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6551 re->magic_number = MAGIC_NUMBER;
6553 re->options = cd->external_options;
6554 re->flags = cd->external_flags;
6558 re->name_table_offset = sizeof(real_pcre);
6559 re->name_entry_size = cd->name_entry_size;
6560 re->name_count = cd->names_found;
6562 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6565 /* The starting points of the name/number translation table and of the code are
6566 passed around in the compile data block. The start/end pattern and initial
6567 options are already set from the pre-compile phase, as is the name_entry_size
6568 field. Reset the bracket count and the names_found field. Also reset the hwm
6569 field; this time it's used for remembering forward references to subpatterns.
6572 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6574 cd->names_found = 0;
6575 cd->name_table = (uschar *)re + re->name_table_offset;
6576 codestart = cd->name_table + re->name_entry_size * re->name_count;
6577 cd->start_code = codestart;
6578 cd->hwm = cworkspace;
6579 cd->req_varyopt = 0;
6580 cd->had_accept = FALSE;
6581 cd->check_lookbehind = FALSE;
6582 cd->open_caps = NULL;
6584 /* Set up a starting, non-extracting bracket, then compile the expression. On
6585 error, errorcode will be set non-zero, so we don't need to look at the result
6586 of the function here. */
6588 ptr = (const uschar *)pattern + skipatstart;
6589 code = (uschar *)codestart;
6591 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6592 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6593 re->top_bracket = cd->bracount;
6594 re->top_backref = cd->top_backref;
6595 re->flags = cd->external_flags;
6597 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6599 /* If not reached end of pattern on success, there's an excess bracket. */
6601 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6603 /* Fill in the terminating state and check for disastrous overflow, but
6604 if debugging, leave the test till after things are printed out. */
6609 if (code - codestart > length) errorcode = ERR23;
6612 /* Fill in any forward references that are required. */
6614 while (errorcode == 0 && cd->hwm > cworkspace)
6617 const uschar *groupptr;
6618 cd->hwm -= LINK_SIZE;
6619 offset = GET(cd->hwm, 0);
6620 recno = GET(codestart, offset);
6621 groupptr = _pcre_find_bracket(codestart, utf8, recno);
6622 if (groupptr == NULL) errorcode = ERR53;
6623 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6626 /* Give an error if there's back reference to a non-existent capturing
6629 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6631 /* If there were any lookbehind assertions that contained OP_RECURSE
6632 (recursions or subroutine calls), a flag is set for them to be checked here,
6633 because they may contain forward references. Actual recursions can't be fixed
6634 length, but subroutine calls can. It is done like this so that those without
6635 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
6636 exceptional ones forgo this. We scan the pattern to check that they are fixed
6637 length, and set their lengths. */
6639 if (cd->check_lookbehind)
6641 uschar *cc = (uschar *)codestart;
6643 /* Loop, searching for OP_REVERSE items, and process those that do not have
6644 their length set. (Actually, it will also re-process any that have a length
6645 of zero, but that is a pathological case, and it does no harm.) When we find
6646 one, we temporarily terminate the branch it is in while we scan it. */
6648 for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
6650 cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
6652 if (GET(cc, 1) == 0)
6655 uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
6658 fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
6660 DPRINTF(("fixed length = %d\n", fixed_length));
6661 if (fixed_length < 0)
6663 errorcode = (fixed_length == -2)? ERR36 : ERR25;
6666 PUT(cc, 1, fixed_length);
6668 cc += 1 + LINK_SIZE;
6672 /* Failed to compile, or error while post-processing */
/* Error exit. The first label stores how far ptr had advanced into the
pattern; the second label is for callers that must not (or need not) touch
*erroroffset and goes straight to storing the message and optional code. */
6677 PCRE_EARLY_ERROR_RETURN:
6678 *erroroffset = ptr - (const uschar *)pattern;
6679 PCRE_EARLY_ERROR_RETURN2:
6680 *errorptr = find_error_text(errorcode);
6681 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6685 /* If the anchored option was not passed, set the flag if we can determine that
6686 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6687 as starting with .* when DOTALL is set).
6689 Otherwise, if we know what the first byte has to be, save it, because that
6690 speeds up unanchored matches no end. If not, see if we can set the
6691 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6692 start with ^, and also when all branches start with .* for non-DOTALL matches.
6695 if ((re->options & PCRE_ANCHORED) == 0)
6697 int temp_options = re->options; /* May get changed during these scans */
6698 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6699 re->options |= PCRE_ANCHORED;
6703 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6704 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6706 int ch = firstbyte & 255;
6707 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6708 cd->fcc[ch] == ch)? ch : firstbyte;
6709 re->flags |= PCRE_FIRSTSET;
6711 else if (is_startline(codestart, 0, cd->backref_map))
6712 re->flags |= PCRE_STARTLINE;
6716 /* For an anchored pattern, we use the "required byte" only if it follows a
6717 variable length item in the regex. Remove the caseless flag for non-caseable
6721 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6723 int ch = reqbyte & 255;
6724 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6725 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6726 re->flags |= PCRE_REQCHSET;
6729 /* Print out the compiled data if debugging is enabled. This is never the
6730 case when building a production library. */
6733 printf("Length = %d top_bracket = %d top_backref = %d\n",
6734 length, re->top_bracket, re->top_backref);
6736 printf("Options=%08x\n", re->options);
6738 if ((re->flags & PCRE_FIRSTSET) != 0)
6740 int ch = re->first_byte & 255;
6741 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6743 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6744 else printf("First char = \\x%02x%s\n", ch, caseless);
6747 if ((re->flags & PCRE_REQCHSET) != 0)
6749 int ch = re->req_byte & 255;
6750 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6752 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6753 else printf("Req char = \\x%02x%s\n", ch, caseless);
6756 pcre_printint(re, stdout, TRUE);
6758 /* This check is done here in the debugging case so that the code that
6759 was compiled can be seen. */
6761 if (code - codestart > length)
6764 *errorptr = find_error_text(ERR23);
6765 *erroroffset = ptr - (uschar *)pattern;
6766 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6769 #endif /* PCRE_DEBUG */
6774 /* End of pcre_compile.c */