1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2006 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
45 #define NLBLOCK cd /* Block containing newline information */
46 #define PSSTART start_pattern /* Field containing processed string start */
47 #define PSEND end_pattern /* Field containing processed string end */
50 #include "pcre_internal.h"
53 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54 used by pcretest. DEBUG is not defined when building a production library. */
57 #include "pcre_printint.src"
61 /*************************************************
62 * Code parameters and static tables *
63 *************************************************/
65 /* This value specifies the size of stack workspace that is used during the
66 first pre-compile phase that determines how much memory is required. The regex
67 is partly compiled into this space, but the compiled parts are discarded as
68 soon as they can be, so that hopefully there will never be an overrun. The code
69 does, however, check for an overrun. The largest amount I've seen used is 218,
70 so this number is very generous.
72 The same workspace is used during the second, actual compile phase for
73 remembering forward references to groups so that they can be filled in at the
74 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75 is 4 there is plenty of room. */
77 #define COMPILE_WORK_SIZE (4096)
80 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81 are simple data values; negative values are for special things like \d and so
82 on. Zero means further processing is needed (for things like \x), or the escape
85 #if !EBCDIC /* This is the "normal" table for ASCII systems */
86 static const short int escapes[] = {
87 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96 0, 0, -ESC_z /* x - z */
99 #else /* This is the "abnormal" table for EBCDIC systems */
100 static const short int escapes[] = {
101 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
128 /* Tables of names of POSIX character classes and their lengths. The list is
129 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 as this is assumed for handling case independence. */
132 static const char posix_names[] =
148 static const uschar posix_name_lengths[] = {
149 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
151 /* Table of class bit maps for each POSIX class. Each class is formed from a
152 base map, with an optional addition or removal of another map. Then, for some
153 classes, there is some additional tweaking: for [:blank:] the vertical space
154 characters are removed, and for [:alpha:] and [:alnum:] the underscore
155 character is removed. The triples in the table consist of the base map offset,
156 second map offset or -1 if no second map, and a non-negative value for map
157 addition or a negative value for map subtraction (if there are two maps). The
158 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
159 remove vertical space characters, 2 => remove underscore. */
161 static const int posix_class_maps[] = {
162 cbit_word, cbit_digit, -2, /* alpha */
163 cbit_lower, -1, 0, /* lower */
164 cbit_upper, -1, 0, /* upper */
165 cbit_word, -1, 2, /* alnum - word without underscore */
166 cbit_print, cbit_cntrl, 0, /* ascii */
167 cbit_space, -1, 1, /* blank - a GNU extension */
168 cbit_cntrl, -1, 0, /* cntrl */
169 cbit_digit, -1, 0, /* digit */
170 cbit_graph, -1, 0, /* graph */
171 cbit_print, -1, 0, /* print */
172 cbit_punct, -1, 0, /* punct */
173 cbit_space, -1, 0, /* space */
174 cbit_word, -1, 0, /* word - a Perl extension */
175 cbit_xdigit,-1, 0 /* xdigit */
179 #define STRING(a) # a
180 #define XSTRING(s) STRING(s)
182 /* The texts of compile-time error messages. These are "char *" because they
183 are passed to the outside world. Do not ever re-use any error number, because
184 they are documented. Always add a new error instead. Messages marked DEAD below
185 are no longer used. */
189 static const char error_texts[] =
191 "\\ at end of pattern\0"
192 "\\c at end of pattern\0"
193 "unrecognized character follows \\\0"
194 "numbers out of order in {} quantifier\0"
196 "number too big in {} quantifier\0"
197 "missing terminating ] for character class\0"
198 "invalid escape sequence in character class\0"
199 "range out of order in character class\0"
200 "nothing to repeat\0"
202 DEAD("operand of unlimited repeat could match the empty string")
203 "internal error: unexpected repeat\0"
204 "unrecognized character after (?\0"
205 "POSIX named classes are supported only within a class\0"
208 "reference to non-existent subpattern\0"
209 "erroffset passed as NULL\0"
210 "unknown option bit(s) set\0"
211 "missing ) after comment\0"
212 DEAD("parentheses nested too deeply")
214 "regular expression too large\0"
215 "failed to get memory\0"
216 "unmatched parentheses\0"
217 "internal error: code overflow\0"
218 "unrecognized character after (?<\0"
220 "lookbehind assertion is not fixed length\0"
221 "malformed number or name after (?(\0"
222 "conditional group contains more than two branches\0"
223 "assertion expected after (?(\0"
224 "(?R or (?digits must be followed by )\0"
226 "unknown POSIX class name\0"
227 "POSIX collating elements are not supported\0"
228 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
230 "character value in \\x{...} sequence is too large\0"
232 "invalid condition (?(0)\0"
233 "\\C not allowed in lookbehind assertion\0"
234 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
235 "number after (?C is > 255\0"
236 "closing ) for (?C expected\0"
238 "recursive call could loop indefinitely\0"
239 "unrecognized character after (?P\0"
240 "syntax error in subpattern name (missing terminator)\0"
241 "two named subpatterns have the same name\0"
242 "invalid UTF-8 string\0"
244 "support for \\P, \\p, and \\X has not been compiled\0"
245 "malformed \\P or \\p sequence\0"
246 "unknown property name after \\P or \\p\0"
247 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
248 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
250 "repeated subpattern is too long\0"
251 "octal value is greater than \\377 (not in UTF-8 mode)\0"
252 "internal error: overran compiling workspace\0"
253 "internal error: previously-checked referenced subpattern not found\0"
254 "DEFINE group contains more than one branch\0"
256 "repeating a DEFINE group is not allowed\0"
257 "inconsistent NEWLINE options\0"
258 "\\g is not followed by an (optionally braced) non-zero number";
260 static const int error_texts_offsets[] = {
322 /* Definition to allow mutual recursion */
325 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
326 int *, branch_chain *, compile_data *, int *);
330 /*************************************************
332 *************************************************/
334 /* This function is called when a \ has been encountered. It either returns a
335 positive value for a simple escape such as \n, or a negative value which
336 encodes one of the more complicated things such as \d. A backreference to group
337 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
338 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
339 ptr is pointing at the \. On exit, it is on the final character of the escape
343 ptrptr points to the pattern position pointer
344 errorcodeptr points to the errorcode variable
345 bracount number of previous extracting brackets
346 options the options bits
347 isclass TRUE if inside a character class
349 Returns: zero or positive => a data character
350 negative => a special escape sequence
351 on error, *errorcodeptr is set
355 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
356 int options, BOOL isclass)
358 BOOL utf8 = (options & PCRE_UTF8) != 0;
359 const uschar *ptr = *ptrptr + 1;
362 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
363 ptr--; /* Set pointer back to the last byte */
365 /* If backslash is at the end of the pattern, it's an error. */
367 if (c == 0) *errorcodeptr = ERR1;
369 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
370 a table. A non-zero result is something that can be returned immediately.
371 Otherwise further processing may be required. */
373 #if !EBCDIC /* ASCII coding */
374 else if (c < '0' || c > 'z') {} /* Not alphameric */
375 else if ((i = escapes[c - '0']) != 0) c = i;
377 #else /* EBCDIC coding */
378 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
379 else if ((i = escapes[c - 0x48]) != 0) c = i;
382 /* Escapes that need further processing, or are illegal. */
386 const uschar *oldptr;
387 BOOL braced, negated;
391 /* A number of Perl escapes are not handled by PCRE. We give an explicit
399 *errorcodeptr = ERR37;
402 /* \g must be followed by a number, either plain or braced. If positive, it
403 is an absolute backreference. If negative, it is a relative backreference.
404 This is a Perl 5.10 feature. */
419 else negated = FALSE;
422 while (g_ascii_isdigit(ptr[1]) != 0)
423 c = c * 10 + *(++ptr) - '0';
425 if (c == 0 || (braced && *(++ptr) != '}'))
427 *errorcodeptr = ERR57;
435 *errorcodeptr = ERR15;
438 c = bracount - (c - 1);
444 /* The handling of escape sequences consisting of a string of digits
445 starting with one that is not zero is not straightforward. By experiment,
446 the way Perl works seems to be as follows:
448 Outside a character class, the digits are read as a decimal number. If the
449 number is less than 10, or if there are that many previous extracting
450 left brackets, then it is a back reference. Otherwise, up to three octal
451 digits are read to form an escaped byte. Thus \123 is likely to be octal
452 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
453 value is greater than 377, the least significant 8 bits are taken. Inside a
454 character class, \ followed by a digit is always an octal number. */
456 case '1': case '2': case '3': case '4': case '5':
457 case '6': case '7': case '8': case '9':
463 while (g_ascii_isdigit(ptr[1]) != 0)
464 c = c * 10 + *(++ptr) - '0';
465 if (c < 10 || c <= bracount)
470 ptr = oldptr; /* Put the pointer back and fall through */
473 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
474 generates a binary zero byte and treats the digit as a following literal.
475 Thus we have to pull back the pointer by one. */
477 if ((c = *ptr) >= '8')
484 /* \0 always starts an octal number, but we may drop through to here with a
485 larger first octal digit. The original code used just to take the least
486 significant 8 bits of octal numbers (I think this is what early Perls used
487 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
488 than 3 octal digits. */
492 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
493 c = c * 8 + *(++ptr) - '0';
494 if (!utf8 && c > 255) *errorcodeptr = ERR51;
497 /* \x is complicated. \x{ddd} is a character number which can be greater
498 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
499 treated as a data character. */
504 const uschar *pt = ptr + 2;
508 while (g_ascii_isxdigit(*pt) != 0)
510 register int cc = *pt++;
511 if (c == 0 && cc == '0') continue; /* Leading zeroes */
514 #if !EBCDIC /* ASCII coding */
515 if (cc >= 'a') cc -= 32; /* Convert to upper case */
516 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
517 #else /* EBCDIC coding */
518 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
519 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
525 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
530 /* If the sequence of hex digits does not end with '}', then we don't
531 recognize this construct; fall through to the normal \x handling. */
534 /* Read just a single-byte hex-defined char */
537 while (i++ < 2 && g_ascii_isxdigit(ptr[1]) != 0)
539 int cc; /* Some compilers don't like ++ */
540 cc = *(++ptr); /* in initializers */
541 #if !EBCDIC /* ASCII coding */
542 if (cc >= 'a') cc -= 32; /* Convert to upper case */
543 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
544 #else /* EBCDIC coding */
545 if (cc <= 'z') cc += 64; /* Convert to upper case */
546 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
551 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
552 This coding is ASCII-specific, but then the whole concept of \cx is
553 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
559 *errorcodeptr = ERR2;
563 #if !EBCDIC /* ASCII coding */
564 if (c >= 'a' && c <= 'z') c -= 32;
566 #else /* EBCDIC coding */
567 if (c >= 'a' && c <= 'z') c += 64;
572 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
573 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
574 for Perl compatibility, it is a literal. This code looks a bit odd, but
575 there used to be some cases other than the default, and there may be again
576 in future, so I haven't "optimized" it. */
579 if ((options & PCRE_EXTRA) != 0) switch(c)
582 *errorcodeptr = ERR3;
596 /*************************************************
598 *************************************************/
600 /* This function is called after \P or \p has been encountered, provided that
601 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
602 pointing at the P or p. On exit, it is pointing at the final character of the
606 ptrptr points to the pattern position pointer
607 negptr points to a boolean that is set TRUE for negation else FALSE
608 dptr points to an int that is set to the detailed property value
609 errorcodeptr points to the error code variable
611 Returns: type value from the _pcre_utt table, or -1 for an invalid type
615 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
618 const uschar *ptr = *ptrptr;
622 if (c == 0) goto ERROR_RETURN;
626 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
636 for (i = 0; i < sizeof(name) - 1; i++)
639 if (c == 0) goto ERROR_RETURN;
643 if (c !='}') goto ERROR_RETURN;
647 /* Otherwise there is just one following character */
657 /* Search for a recognized property name using binary chop */
660 top = _pcre_utt_size;
664 i = (bot + top) >> 1;
665 c = strcmp(name, &_pcre_ucp_names[_pcre_utt[i].offset]);
668 *dptr = _pcre_utt[i].value;
669 return _pcre_utt[i].type;
671 if (c > 0) bot = i + 1; else top = i;
674 *errorcodeptr = ERR47;
679 *errorcodeptr = ERR46;
688 /*************************************************
689 * Check for counted repeat *
690 *************************************************/
692 /* This function is called when a '{' is encountered in a place where it might
693 start a quantifier. It looks ahead to see if it really is a quantifier or not.
694 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
695 where the ddds are digits.
698 p pointer to the first char after '{'
700 Returns: TRUE or FALSE
704 is_counted_repeat(const uschar *p)
706 if (g_ascii_isdigit(*p++) == 0) return FALSE;
707 while (g_ascii_isdigit(*p) != 0) p++;
708 if (*p == '}') return TRUE;
710 if (*p++ != ',') return FALSE;
711 if (*p == '}') return TRUE;
713 if (g_ascii_isdigit(*p++) == 0) return FALSE;
714 while (g_ascii_isdigit(*p) != 0) p++;
721 /*************************************************
722 * Read repeat counts *
723 *************************************************/
725 /* Read an item of the form {n,m} and return the values. This is called only
726 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
727 so the syntax is guaranteed to be correct, but we need to check the values.
730 p pointer to first char after '{'
731 minp pointer to int for min
732 maxp pointer to int for max
733 returned as -1 if no max
734 errorcodeptr points to error code variable
736 Returns: pointer to '}' on success;
737 current ptr on error, with errorcodeptr set non-zero
740 static const uschar *
741 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
746 /* Read the minimum value and do a paranoid check: a negative value indicates
747 an integer overflow. */
749 while (g_ascii_isdigit(*p) != 0) min = min * 10 + *p++ - '0';
750 if (min < 0 || min > 65535)
752 *errorcodeptr = ERR5;
756 /* Read the maximum value if there is one, and again do a paranoid check on its size.
757 Also, max must not be less than min. */
759 if (*p == '}') max = min; else
764 while(g_ascii_isdigit(*p) != 0) max = max * 10 + *p++ - '0';
765 if (max < 0 || max > 65535)
767 *errorcodeptr = ERR5;
772 *errorcodeptr = ERR4;
778 /* Fill in the required variables, and pass back the pointer to the terminating
788 /*************************************************
789 * Find forward referenced subpattern *
790 *************************************************/
792 /* This function scans along a pattern's text looking for capturing
793 subpatterns, and counting them. If it finds a named pattern that matches the
794 name it is given, it returns its number. Alternatively, if the name is NULL, it
795 returns when it reaches a given numbered subpattern. This is used for forward
796 references to subpatterns. We know that if (?P< is encountered, the name will
797 be terminated by '>' because that is checked in the first pass.
800 ptr current position in the pattern
801 count current count of capturing parens so far encountered
802 name name to seek, or NULL if seeking a numbered subpattern
803 lorn name length, or subpattern number if name is NULL
804 xmode TRUE if we are in /x mode
806 Returns: the number of the named or numbered subpattern sought, or -1 if not found
810 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
813 const uschar *thisname;
815 for (; *ptr != 0; ptr++)
819 /* Skip over backslashed characters and also entire \Q...\E */
823 if (*(++ptr) == 0) return -1;
824 if (*ptr == 'Q') for (;;)
826 while (*(++ptr) != 0 && *ptr != '\\');
827 if (*ptr == 0) return -1;
828 if (*(++ptr) == 'E') break;
833 /* Skip over character classes */
837 while (*(++ptr) != ']')
841 if (*(++ptr) == 0) return -1;
842 if (*ptr == 'Q') for (;;)
844 while (*(++ptr) != 0 && *ptr != '\\');
845 if (*ptr == 0) return -1;
846 if (*(++ptr) == 'E') break;
854 /* Skip comments in /x mode */
856 if (xmode && *ptr == '#')
858 while (*(++ptr) != 0 && *ptr != '\n');
859 if (*ptr == 0) return -1;
863 /* An opening parens must now be a real metacharacter */
865 if (*ptr != '(') continue;
869 if (name == NULL && count == lorn) return count;
874 if (*ptr == 'P') ptr++; /* Allow optional P */
876 /* We have to disambiguate (?<! and (?<= from (?<name> */
878 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
884 if (name == NULL && count == lorn) return count;
886 if (term == '<') term = '>';
888 while (*ptr != term) ptr++;
889 if (name != NULL && lorn == ptr - thisname &&
890 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
899 /*************************************************
900 * Find first significant op code *
901 *************************************************/
903 /* This is called by several functions that scan a compiled expression looking
904 for a fixed first character, or an anchoring op code etc. It skips over things
905 that do not influence this. For some calls, a change of option is important.
906 For some calls, it makes sense to skip negative forward and all backward
907 assertions, and also the \b assertion; for others it does not.
910 code pointer to the start of the group
911 options pointer to external options
912 optbit the option bit whose changing is significant, or
914 skipassert TRUE if certain assertions are to be skipped
916 Returns: pointer to the first significant opcode
920 first_significant_code(const uschar *code, int *options, int optbit,
928 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
929 *options = (int)code[1];
935 case OP_ASSERTBACK_NOT:
936 if (!skipassert) return code;
937 do code += GET(code, 1); while (*code == OP_ALT);
938 code += _pcre_OP_lengths[*code];
941 case OP_WORD_BOUNDARY:
942 case OP_NOT_WORD_BOUNDARY:
943 if (!skipassert) return code;
950 code += _pcre_OP_lengths[*code];
957 /* Control never reaches here */
963 /*************************************************
964 * Find the fixed length of a pattern *
965 *************************************************/
967 /* Scan a pattern and compute the fixed length of subject that will match it,
968 if the length is fixed. This is needed for dealing with backward assertions.
969 In UTF8 mode, the result is in characters rather than bytes.
972 code points to the start of the pattern (the bracket)
973 options the compiling options
975 Returns: the fixed length, or -1 if there is no fixed length,
976 or -2 if \C was encountered
980 find_fixedlength(uschar *code, int options)
984 register int branchlength = 0;
985 register uschar *cc = code + 1 + LINK_SIZE;
987 /* Scan along the opcodes for this branch. If we get to the end of the
988 branch, check the length against that of the other branches. */
993 register int op = *cc;
1001 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1002 if (d < 0) return d;
1004 do cc += GET(cc, 1); while (*cc == OP_ALT);
1005 cc += 1 + LINK_SIZE;
1008 /* Reached end of a branch; if it's a ket it is the end of a nested
1009 call. If it's ALT it is an alternation in a nested call. If it is
1010 END it's the end of the outer call. All can be handled by the same code. */
1017 if (length < 0) length = branchlength;
1018 else if (length != branchlength) return -1;
1019 if (*cc != OP_ALT) return length;
1020 cc += 1 + LINK_SIZE;
1024 /* Skip over assertive subpatterns */
1029 case OP_ASSERTBACK_NOT:
1030 do cc += GET(cc, 1); while (*cc == OP_ALT);
1033 /* Skip over things that don't match chars */
1047 case OP_NOT_WORD_BOUNDARY:
1048 case OP_WORD_BOUNDARY:
1049 cc += _pcre_OP_lengths[*cc];
1052 /* Handle literal characters */
1060 if ((options & PCRE_UTF8) != 0)
1062 while ((*cc & 0xc0) == 0x80) cc++;
1067 /* Handle exact repetitions. The count is already in characters, but we
1068 need to skip over a multibyte character in UTF8 mode. */
1071 branchlength += GET2(cc,1);
1074 if ((options & PCRE_UTF8) != 0)
1076 while((*cc & 0x80) == 0x80) cc++;
1082 branchlength += GET2(cc,1);
1086 /* Handle single-char matchers */
1095 case OP_NOT_WHITESPACE:
1097 case OP_NOT_WORDCHAR:
1104 /* The single-byte matcher isn't allowed */
1109 /* Check a class for variable quantification */
1113 cc += GET(cc, 1) - 33;
1131 if (GET2(cc,1) != GET2(cc,3)) return -1;
1132 branchlength += GET2(cc,1);
1141 /* Anything else is variable length */
1147 /* Control never gets here */
1153 /*************************************************
1154 * Scan compiled regex for numbered bracket *
1155 *************************************************/
1157 /* This little function scans through a compiled pattern until it finds a
1158 capturing bracket with the given number.
1161 code points to start of expression
1162 utf8 TRUE in UTF-8 mode
1163 number the required bracket number
1165 Returns: pointer to the opcode for the bracket, or NULL if not found
1168 static const uschar *
1169 find_bracket(const uschar *code, BOOL utf8, int number)
1173 register int c = *code;
1174 if (c == OP_END) return NULL;
1176 /* XCLASS is used for classes that cannot be represented just by a bit
1177 map. This includes negated single high-valued characters. The length in
1178 the table is zero; the actual length is stored in the compiled code. */
1180 if (c == OP_XCLASS) code += GET(code, 1);
1182 /* Handle capturing bracket */
1184 else if (c == OP_CBRA)
1186 int n = GET2(code, 1+LINK_SIZE);
1187 if (n == number) return (uschar *)code;
1188 code += _pcre_OP_lengths[c];
1191 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1192 a multi-byte character. The length in the table is a minimum, so we have to
1193 arrange to skip the extra bytes. */
1197 code += _pcre_OP_lengths[c];
1215 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1224 /*************************************************
1225 * Scan compiled regex for recursion reference *
1226 *************************************************/
1228 /* This little function scans through a compiled pattern until it finds an
1229 instance of OP_RECURSE.
1232 code points to start of expression
1233 utf8 TRUE in UTF-8 mode
1235 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1238 static const uschar *
1239 find_recurse(const uschar *code, BOOL utf8)
1243 register int c = *code;
1244 if (c == OP_END) return NULL;
1245 if (c == OP_RECURSE) return code;
1247 /* XCLASS is used for classes that cannot be represented just by a bit
1248 map. This includes negated single high-valued characters. The length in
1249 the table is zero; the actual length is stored in the compiled code. */
1251 if (c == OP_XCLASS) code += GET(code, 1);
1253 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1254 that are followed by a character may be followed by a multi-byte character.
1255 The length in the table is a minimum, so we have to arrange to skip the extra
1260 code += _pcre_OP_lengths[c];
1278 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1287 /*************************************************
1288 * Scan compiled branch for non-emptiness *
1289 *************************************************/
1291 /* This function scans through a branch of a compiled pattern to see whether it
1292 can match the empty string or not. It is called from could_be_empty()
1293 below and from compile_branch() when checking for an unlimited repeat of a
1294 group that can match nothing. Note that first_significant_code() skips over
1295 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1296 struck an inner bracket whose current branch will already have been scanned.
1299 code points to start of search
1300 endcode points to where to stop
1301 utf8 TRUE if in UTF8 mode
1303 Returns: TRUE if what is matched could be empty
1307 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1310 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1312 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1314 const uschar *ccode;
1318 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1321 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1323 /* Scan a closed bracket */
1325 empty_branch = FALSE;
1328 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1329 empty_branch = TRUE;
1330 code += GET(code, 1);
1332 while (*code == OP_ALT);
1333 if (!empty_branch) return FALSE; /* All branches are non-empty */
1335 /* Move past the KET and fudge things so that the increment in the "for"
1336 above has no effect. */
1339 code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1343 /* Handle the other opcodes */
1347 /* Check for quantifiers after a class */
1351 ccode = code + GET(code, 1);
1352 goto CHECK_CLASS_REPEAT;
1365 case OP_CRSTAR: /* These could be empty; continue */
1371 default: /* Non-repeat => class must match */
1372 case OP_CRPLUS: /* These repeats aren't empty */
1378 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1383 /* Opcodes that must match a character */
1390 case OP_NOT_WHITESPACE:
1392 case OP_NOT_WORDCHAR:
1408 case OP_TYPEMINPLUS:
1409 case OP_TYPEPOSPLUS:
1421 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1422 MINUPTO, and POSUPTO may be followed by a multibyte character */
1434 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1445 /*************************************************
1446 * Scan compiled regex for non-emptiness *
1447 *************************************************/
1449 /* This function is called to check for left recursive calls. We want to check
1450 the current branch of the current pattern to see if it could match the empty
1451 string. If it could, we must look outwards for branches at other levels,
1452 stopping when we pass beyond the bracket which is the subject of the recursion.
1455 code points to start of the recursion
1456 endcode points to where to stop (current RECURSE item)
1457 bcptr points to the chain of current (unclosed) branch starts
1458 utf8 TRUE if in UTF-8 mode
1460 Returns: TRUE if what is matched could be empty
/* Walk outwards along the chain of unclosed branches and test each one with
could_be_empty_branch(). The first branch that cannot match the empty string
makes the whole answer FALSE. */
1464 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
/* Stop at the end of the chain, or at the first branch whose start lies
before code. */
1467 while (bcptr != NULL && bcptr->current >= code)
/* One branch that must consume something settles the question. */
1469 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
/* Move to the enclosing branch. */
1470 bcptr = bcptr->outer;
1477 /*************************************************
1478 * Check for POSIX class syntax *
1479 *************************************************/
1481 /* This function is called when the sequence "[:" or "[." or "[=" is
1482 encountered in a character class. It checks whether this is followed by an
1483 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1487 ptr pointer to the initial [
1488 endptr where to return the end pointer
1489 cd pointer to compile data
1491 Returns: TRUE or FALSE
/* Syntax check only: on entry ptr is at the opening square bracket. The byte
that follows it is remembered as the terminator, an optional circumflex and a
run of letters are skipped, and the final test requires that same terminator
followed by a closing square bracket. */
1495 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1497 int terminator; /* Don't combine these lines; the Solaris cc */
1498 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
/* Step past the terminator byte, and past a circumflex if one follows. */
1499 if (*(++ptr) == '^') ptr++;
/* Skip every byte that the ctypes table classifies as a letter. */
1500 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
/* The letters must be closed by the same terminator and a square bracket. */
1501 if (*ptr == terminator && ptr[1] == ']')
1512 /*************************************************
1513 * Check POSIX class name *
1514 *************************************************/
1516 /* This function is called to check the name given in a POSIX-style class entry
1520 ptr points to the first letter
1521 len the length of the name
1523 Returns: a value representing the name, or -1 if unknown
/* Identify a POSIX class name. The name is not zero-terminated: ptr points
into the pattern text and only the first len bytes belong to the name, so the
comparison has to be bounded by len. posix_names is walked in step with
posix_name_lengths; each entry is skipped as its length plus one byte
(presumably the separating zero byte). */
1527 check_posix_name(const uschar *ptr, int len)
/* A zero entry in posix_name_lengths ends the table. */
1531 while (posix_name_lengths[yield] != 0)
/* Check the length first, then compare exactly len bytes. An unbounded
strcmp() would run on into the terminator that follows the name in the
pattern and could never report a match. */
1533 if (len == posix_name_lengths[yield] &&
1534 strncmp((const char *)ptr, posix_names + offset, len) == 0) return yield;
/* Advance to the start of the next name in posix_names. */
1535 offset += posix_name_lengths[yield] + 1;
1542 /*************************************************
1543 * Adjust OP_RECURSE items in repeated group *
1544 *************************************************/
1546 /* OP_RECURSE items contain an offset from the start of the regex to the group
1547 that is referenced. This means that groups can be replicated for fixed
1548 repetition simply by copying (because the recursion is allowed to refer to
1549 earlier groups that are outside the current group). However, when a group is
1550 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1551 it, after it has been compiled. This means that any OP_RECURSE items within it
1552 that refer to the group itself or any contained groups have to have their
1553 offsets adjusted. That is one of the jobs of this function. Before it is called,
1554 the partially compiled regex must be temporarily terminated with OP_END.
1556 This function has been extended with the possibility of forward references for
1557 recursions and subroutine calls. It must also check the list of such references
1558 for the group we are dealing with. If it finds that one of the recursions in
1559 the current group is on this list, it adjusts the offset in the list, not the
1560 value in the reference (which is a group number).
1563 group points to the start of the group
1564 adjust the amount by which the group is to be moved
1565 utf8 TRUE in UTF-8 mode
1566 cd contains pointers to tables etc.
1567 save_hwm the hwm forward reference pointer at the start of the group
/* Repeatedly ask find_recurse() for the next recursion item, starting at the
beginning of the group, until it returns NULL. For each item found, either
the matching entry in the forward reference list (from save_hwm up to
cd->hwm) or the offset stored in the item itself is increased by adjust. */
1573 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
/* Scanning starts at the first byte of the group. */
1576 uschar *ptr = group;
/* ptr is left pointing at each item found; the loop later steps over the
item (one byte plus LINK_SIZE) before searching again. */
1577 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1582 /* See if this recursion is on the forward reference list. If so, adjust the
1585 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1587 offset = GET(hc, 0);
1588 if (cd->start_code + offset == ptr + 1)
1590 PUT(hc, 0, offset + adjust);
1595 /* Otherwise, adjust the recursion offset if it's after the start of this
1600 offset = GET(ptr, 1);
1601 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1604 ptr += 1 + LINK_SIZE;
1610 /*************************************************
1611 * Insert an automatic callout point *
1612 *************************************************/
1614 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1615 callout points before each pattern item.
1618 code current code pointer
1619 ptr current pattern pointer
1620 cd pointers to tables etc
1622 Returns: new code pointer
/* Emit a callout item at code: the OP_CALLOUT opcode, then two LINK_SIZE
fields holding the offset of ptr within the pattern and a length of zero.
The zero length is a placeholder; complete_callout() stores the real value
later. */
1626 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1628 *code++ = OP_CALLOUT;
/* The offset is measured from cd->start_pattern. */
1630 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1631 PUT(code, LINK_SIZE, 0); /* Default length */
/* Skip both LINK_SIZE fields so that the caller continues after the item. */
1632 return code + 2*LINK_SIZE;
1637 /*************************************************
1638 * Complete a callout item *
1639 *************************************************/
1641 /* A callout item contains the length of the next item in the pattern, which
1642 we can't fill in till after we have reached the relevant point. This is used
1643 for both automatic and manual callouts.
1646 previous_callout points to previous callout item
1647 ptr current pattern pointer
1648 cd pointers to tables etc
/* Fill in the length field of an earlier callout item. The length is the
current pattern offset (ptr relative to cd->start_pattern) minus the value
read from offset 2 of the item, which is taken to be the pattern offset
recorded when the item was emitted - TODO confirm the layout for manual
callouts, whose emitting code is not part of this function. */
1654 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1656 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
/* The length lives in the LINK_SIZE field that follows the offset field. */
1657 PUT(previous_callout, 2 + LINK_SIZE, length);
1663 /*************************************************
1664 * Get othercase range *
1665 *************************************************/
1667 /* This function is passed the start and end of a class range, in UTF-8 mode
1668 with UCP support. It searches up the characters, looking for internal ranges of
1669 characters in the "other" case. Each call returns the next one, updating the
1673 cptr points to starting character value; updated
1675 ocptr where to put start of othercase range
1676 odptr where to put end of othercase range
1678 Yield: TRUE when range returned; FALSE when no more
/* Scan upwards from *cptr, no further than d, for the next run of characters
whose other-case values are consecutive. _pcre_ucp_othercase() yields
NOTACHAR for a character that has no other case. */
1682 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1683 unsigned int *odptr)
1685 unsigned int c, othercase, next;
/* Find the first character in the remaining range that has an other case. */
1687 for (c = *cptr; c <= d; c++)
1688 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
/* The scan ran past d without finding one: nothing more to report. */
1690 if (c > d) return FALSE;
/* next is the other-case value the following character must have for the
run to continue. */
1693 next = othercase + 1;
/* Extend the run for as long as each following character maps to the
expected value (next is presumably advanced inside the loop - verify). */
1695 for (++c; c <= d; c++)
1697 if (_pcre_ucp_othercase(c) != next) break;
1706 #endif /* SUPPORT_UCP */
1710 /*************************************************
1711 * Check if auto-possessifying is possible *
1712 *************************************************/
1714 /* This function is called for unlimited repeats of certain items, to see
1715 whether the next thing could possibly match the repeated item. If not, it makes
1716 sense to automatically possessify the repeated item.
1719 op_code the repeated op code
1720 this data for this item, depends on the opcode
1721 utf8 TRUE in UTF-8 mode
1722 utf8_char used for utf8 character bytes, NULL if not relevant
1723 ptr next character in pattern
1724 options options bits
1725 cd contains pointers to tables etc.
1727 Returns: TRUE if possessifying is wanted
/* Decide whether a repeated item can be made possessive by examining the
item that follows it in the pattern. The following item is reduced to a
single value, next: a character when non-negative, an escape when negative.
The function gives up with FALSE when that item yields an escape error or is
itself optional. */
1731 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1732 const uschar *ptr, int options, compile_data *cd)
1736 /* Skip whitespace and comments in extended mode */
1738 if ((options & PCRE_EXTENDED) != 0)
1742 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
/* Run to the end of the line (or of the pattern) and step over the newline
sequence, whose length is cd->nllen. */
1745 while (*(++ptr) != 0)
1746 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1752 /* If the next item is one that we can handle, get its value. A non-negative
1753 value is a character, a negative value is an escape value. */
/* The error code goes into a local variable, so a bad escape here only
disables the optimization instead of being reported to the caller. */
1757 int temperrorcode = 0;
1758 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1759 if (temperrorcode != 0) return FALSE;
1760 ptr++; /* Point after the escape sequence */
/* Anything that the ctypes table does not flag as a metacharacter is read
as a literal character. */
1763 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1766 if (utf8) { GETCHARINC(next, ptr); } else
1773 /* Skip whitespace and comments in extended mode */
1775 if ((options & PCRE_EXTENDED) != 0)
1779 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1782 while (*(++ptr) != 0)
1783 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1789 /* If the next thing is itself optional, we have to give up. */
/* Star, query, and a brace quantifier whose minimum is zero all allow the
next item to match nothing at all. */
1791 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1794 /* Now compare the next item with the previous opcode. If the previous is a
1795 positive single character match, "item" either contains the character or, if
1796 "item" is greater than 127 in utf8 mode, the character's bytes are in
1800 /* Handle cases when the next item is a character. */
1802 if (next >= 0) switch(op_code)
/* For values above 127 in UTF-8 mode the full character is decoded from
utf8_char into item before the comparison. */
1806 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1808 return item != next;
1810 /* For CHARNC (caseless character) we must check the other case. If we have
1811 Unicode property support, we can use it to test the other case of
1812 high-valued characters. */
1816 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1818 if (item == next) return FALSE;
1822 unsigned int othercase;
/* Characters below 128 take their other case from the cd->fcc table. */
1823 if (next < 128) othercase = cd->fcc[next]; else
1825 othercase = _pcre_ucp_othercase((unsigned int)next);
1827 othercase = NOTACHAR;
1829 return (unsigned int)item != othercase;
1832 #endif /* SUPPORT_UTF8 */
1833 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1835 /* For OP_NOT, "item" must be a single-byte character. */
1838 if (next < 0) return FALSE; /* Not a character */
/* Here the sense is inverted: the answer is TRUE when next is the excluded
character itself, or its other case under PCRE_CASELESS. */
1839 if (item == next) return TRUE;
1840 if ((options & PCRE_CASELESS) == 0) return FALSE;
1844 unsigned int othercase;
1845 if (next < 128) othercase = cd->fcc[next]; else
1847 othercase = _pcre_ucp_othercase(next);
1849 othercase = NOTACHAR;
1851 return (unsigned int)item == othercase;
1854 #endif /* SUPPORT_UTF8 */
1855 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
/* In the six character-type tests that follow, a value above 127 is never
looked up in cd->ctypes: the tests written with > 127 accept it outright and
the tests written with <= 127 reject it. */
1858 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1861 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1864 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1866 case OP_NOT_WHITESPACE:
1867 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1870 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1872 case OP_NOT_WORDCHAR:
1873 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1880 /* Handle the case when the next item is \d, \s, etc. */
/* From here on next is an escape value. The six tests after the decode
apply the same ctypes checks to item, the repeated character, instead of to
next. */
1887 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1892 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1895 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1898 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1901 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1904 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1907 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
/* The remaining tests compare next directly against specific escape values,
for example OP_NOT_WHITESPACE is paired with ESC_s only. */
1914 return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1917 return next == -ESC_d;
1920 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1922 case OP_NOT_WHITESPACE:
1923 return next == -ESC_s;
1926 return next == -ESC_W || next == -ESC_s;
1928 case OP_NOT_WORDCHAR:
1929 return next == -ESC_w || next == -ESC_d;
1935 /* Control does not reach here */
1940 /*************************************************
1941 * Compile one branch *
1942 *************************************************/
1944 /* Scan the pattern, compiling it into a vector. If the options are
1945 changed during the branch, the pointer is used to change the external options
1946 bits. This function is used during the pre-compile phase when we are trying
1947 to find out the amount of memory needed, as well as during the real compile
1948 phase. The value of lengthptr distinguishes the two phases.
1951 optionsptr pointer to the option bits
1952 codeptr points to the pointer to the current code point
1953 ptrptr points to the current pattern pointer
1954 errorcodeptr points to error code variable
1955 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1956 reqbyteptr set to the last literal character required, else < 0
1957 bcptr points to current branch chain
1958 cd contains pointers to tables etc.
1959 lengthptr NULL during the real compile phase
1960 points to length accumulator during pre-compile phase
1962 Returns: TRUE on success
1963 FALSE, with *errorcodeptr set non-zero on error
1967 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
1968 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
1969 compile_data *cd, int *lengthptr)
1971 int repeat_type, op_type;
1972 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1974 int greedy_default, greedy_non_default;
1975 int firstbyte, reqbyte;
1976 int zeroreqbyte, zerofirstbyte;
1977 int req_caseopt, reqvary, tempreqvary;
1978 int options = *optionsptr;
1979 int after_manual_callout = 0;
1980 int length_prevgroup = 0;
1982 register uschar *code = *codeptr;
1983 uschar *last_code = code;
1984 uschar *orig_code = code;
1986 BOOL inescq = FALSE;
1987 BOOL groupsetfirstbyte = FALSE;
1988 const uschar *ptr = *ptrptr;
1989 const uschar *tempptr;
1990 uschar *previous = NULL;
1991 uschar *previous_callout = NULL;
1992 uschar *save_hwm = NULL;
1993 uschar classbits[32];
1997 BOOL utf8 = (options & PCRE_UTF8) != 0;
1998 uschar *class_utf8data;
1999 uschar utf8_char[6];
2002 uschar *utf8_char = NULL;
2006 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2009 /* Set up the default and non-default settings for greediness */
2011 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2012 greedy_non_default = greedy_default ^ 1;
2014 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2015 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2016 matches a non-fixed char first char; reqbyte just remains unset if we never
2019 When we hit a repeat whose minimum is zero, we may have to adjust these values
2020 to take the zero repeat into account. This is implemented by setting them to
2021 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2022 item types that can be repeated set these backoff variables appropriately. */
2024 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2026 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2027 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2028 value > 255. It is added into the firstbyte or reqbyte variables to record the
2029 case status of the value. This is used only for ASCII characters. */
2031 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2033 /* Switch on next character until the end of the branch */
2038 BOOL possessive_quantifier;
2041 int class_charcount;
2052 /* Get next byte in the pattern */
2056 /* If we are in the pre-compile phase, accumulate the length used for the
2057 previous cycle of this loop. */
2059 if (lengthptr != NULL)
2062 if (code > cd->hwm) cd->hwm = code; /* High water info */
2064 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2066 *errorcodeptr = ERR52;
2070 /* There is at least one situation where code goes backwards: this is the
2071 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2072 the class is simply eliminated. However, it is created first, so we have to
2073 allow memory for it. Therefore, don't ever reduce the length at this point.
2076 if (code < last_code) code = last_code;
2077 *lengthptr += code - last_code;
2078 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2080 /* If "previous" is set and it is not at the start of the work space, move
2081 it back to there, in order to avoid filling up the work space. Otherwise,
2082 if "previous" is NULL, reset the current code pointer to the start. */
2084 if (previous != NULL)
2086 if (previous > orig_code)
2088 memmove(orig_code, previous, code - previous);
2089 code -= previous - orig_code;
2090 previous = orig_code;
2093 else code = orig_code;
2095 /* Remember where this code item starts so we can pick up the length
2101 /* In the real compile phase, just check the workspace used by the forward
2104 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2106 *errorcodeptr = ERR52;
2110 /* If in \Q...\E, check for the end; if not, we have a literal */
2112 if (inescq && c != 0)
2114 if (c == '\\' && ptr[1] == 'E')
2122 if (previous_callout != NULL)
2124 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2125 complete_callout(previous_callout, ptr, cd);
2126 previous_callout = NULL;
2128 if ((options & PCRE_AUTO_CALLOUT) != 0)
2130 previous_callout = code;
2131 code = auto_callout(code, ptr, cd);
2137 /* Fill in length of a previous callout, except when the next thing is
2140 is_quantifier = c == '*' || c == '+' || c == '?' ||
2141 (c == '{' && is_counted_repeat(ptr+1));
2143 if (!is_quantifier && previous_callout != NULL &&
2144 after_manual_callout-- <= 0)
2146 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2147 complete_callout(previous_callout, ptr, cd);
2148 previous_callout = NULL;
2151 /* In extended mode, skip white space and comments */
2153 if ((options & PCRE_EXTENDED) != 0)
2155 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2158 while (*(++ptr) != 0)
2160 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2162 if (*ptr != 0) continue;
2164 /* Else fall through to handle end of string */
2169 /* No auto callout for quantifiers. */
2171 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2173 previous_callout = code;
2174 code = auto_callout(code, ptr, cd);
2179 /* ===================================================================*/
2180 case 0: /* The branch terminates at string end */
2181 case '|': /* or | or ) */
2183 *firstbyteptr = firstbyte;
2184 *reqbyteptr = reqbyte;
2187 if (lengthptr != NULL)
2189 *lengthptr += code - last_code; /* To include callout length */
2190 DPRINTF((">> end branch\n"));
2195 /* ===================================================================*/
2196 /* Handle single-character metacharacters. In multiline mode, ^ disables
2197 the setting of any following char as a first character. */
2200 if ((options & PCRE_MULTILINE) != 0)
2202 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2213 /* There can never be a first char if '.' is first, whatever happens about
2214 repeats. The value of reqbyte doesn't change either. */
2217 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2218 zerofirstbyte = firstbyte;
2219 zeroreqbyte = reqbyte;
2225 /* ===================================================================*/
2226 /* Character classes. If the included characters are all < 256, we build a
2227 32-byte bitmap of the permitted characters, except in the special case
2228 where there is only one such character. For negated classes, we build the
2229 map as usual, then invert it at the end. However, we use a different opcode
2230 so that data characters > 255 can be handled correctly.
2232 If the class contains characters outside the 0-255 range, a different
2233 opcode is compiled. It may optionally have a bit map for characters < 256,
2234 but those above are explicitly listed afterwards. A flag byte tells
2235 whether the bitmap is present, and whether this is a negated class or not.
2241 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2242 they are encountered at the top level, so we'll do that too. */
2244 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2245 check_posix_syntax(ptr, &tempptr, cd))
2247 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2251 /* If the first character is '^', set the negation flag and skip it. */
2253 if ((c = *(++ptr)) == '^')
2255 negate_class = TRUE;
2260 negate_class = FALSE;
2263 /* Keep a count of chars with values < 256 so that we can optimize the case
2264 of just a single character (as long as it's < 256). However, for higher
2265 valued UTF-8 characters, we don't yet do any optimization. */
2267 class_charcount = 0;
2268 class_lastchar = -1;
2270 /* Initialize the 32-char bit map to all zeros. We build the map in a
2271 temporary bit of memory, in case the class contains only 1 character (less
2272 than 256), because in that case the compiled code doesn't use the bit map.
2275 memset(classbits, 0, 32 * sizeof(uschar));
2278 class_utf8 = FALSE; /* No chars >= 256 */
2279 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2282 /* Process characters until ] is reached. By writing this as a "do" it
2283 means that an initial ] is taken as a data character. At the start of the
2284 loop, c contains the first byte of the character. */
2288 const uschar *oldptr;
2291 if (utf8 && c > 127)
2292 { /* Braces are required because the */
2293 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2297 /* Inside \Q...\E everything is literal except \E */
2301 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2303 inescq = FALSE; /* Reset literal state */
2304 ptr++; /* Skip the 'E' */
2305 continue; /* Carry on with next */
2307 goto CHECK_RANGE; /* Could be range if \E follows */
2310 /* Handle POSIX class names. Perl allows a negation extension of the
2311 form [:^name:]. A square bracket that doesn't match the syntax is
2312 treated as a literal. We also recognize the POSIX constructions
2313 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2317 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2318 check_posix_syntax(ptr, &tempptr, cd))
2320 BOOL local_negate = FALSE;
2321 int posix_class, taboffset, tabopt;
2322 register const uschar *cbits = cd->cbits;
2327 *errorcodeptr = ERR31;
2334 local_negate = TRUE;
2338 posix_class = check_posix_name(ptr, tempptr - ptr);
2339 if (posix_class < 0)
2341 *errorcodeptr = ERR30;
2345 /* If matching is caseless, upper and lower are converted to
2346 alpha. This relies on the fact that the class table starts with
2347 alpha, lower, upper as the first 3 entries. */
2349 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2352 /* We build the bit map for the POSIX class in a chunk of local store
2353 because we may be adding and subtracting from it, and we don't want to
2354 subtract bits that may be in the main map already. At the end we or the
2355 result into the bit map that is being built. */
2359 /* Copy in the first table (always present) */
2361 memcpy(pbits, cbits + posix_class_maps[posix_class],
2362 32 * sizeof(uschar));
2364 /* If there is a second table, add or remove it as required. */
2366 taboffset = posix_class_maps[posix_class + 1];
2367 tabopt = posix_class_maps[posix_class + 2];
2372 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2374 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2377 /* Now see if we need to remove any special characters. An option
2378 value of 1 removes vertical space and 2 removes underscore. */
2380 if (tabopt < 0) tabopt = -tabopt;
2381 if (tabopt == 1) pbits[1] &= ~0x3c;
2382 else if (tabopt == 2) pbits[11] &= 0x7f;
2384 /* Add the POSIX table or its complement into the main table that is
2385 being built and we are done. */
2388 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2390 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2393 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2394 continue; /* End of POSIX syntax handling */
2397 /* Backslash may introduce a single character, or it may introduce one
2398 of the specials, which just set a flag. The sequence \b is a special
2399 case. Inside a class (and only there) it is treated as backspace.
2400 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2401 to or into the one we are building. We assume they have more than one
2402 character in them, so set class_charcount bigger than one. */
2406 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2407 if (*errorcodeptr != 0) goto FAILED;
2409 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2410 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2411 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2412 else if (-c == ESC_Q) /* Handle start of quoted string */
2414 if (ptr[1] == '\\' && ptr[2] == 'E')
2416 ptr += 2; /* avoid empty string */
2424 register const uschar *cbits = cd->cbits;
2425 class_charcount += 2; /* Greater than 1 is what matters */
2427 /* Save time by not doing this in the pre-compile phase. */
2429 if (lengthptr == NULL) switch (-c)
2432 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2436 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2440 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2444 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2448 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2449 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2453 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2454 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2457 case ESC_E: /* Perl ignores an orphan \E */
2460 default: /* Not recognized; fall through */
2461 break; /* Need "default" setting to stop compiler warning. */
2464 /* In the pre-compile phase, just do the recognition. */
2466 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2467 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2469 /* We need to deal with \P and \p in both phases. */
2472 if (-c == ESC_p || -c == ESC_P)
2476 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2477 if (ptype < 0) goto FAILED;
2479 *class_utf8data++ = ((-c == ESC_p) != negated)?
2480 XCL_PROP : XCL_NOTPROP;
2481 *class_utf8data++ = ptype;
2482 *class_utf8data++ = pdata;
2483 class_charcount -= 2; /* Not a < 256 character */
2487 /* Unrecognized escapes are faulted if PCRE is running in its
2488 strict mode. By default, for compatibility with Perl, they are
2489 treated as literals. */
2491 if ((options & PCRE_EXTRA) != 0)
2493 *errorcodeptr = ERR7;
2497 class_charcount -= 2; /* Undo the default count from above */
2498 c = *ptr; /* Get the final character and fall through */
2501 /* Fall through if we have a single character (c >= 0). This may be
2502 greater than 256 in UTF-8 mode. */
2504 } /* End of backslash handling */
2506 /* A single character may be followed by '-' to form a range. However,
2507 Perl does not permit ']' to be the end of the range. A '-' character
2508 at the end is treated as a literal. Perl ignores orphaned \E sequences
2509 entirely. The code for handling \Q and \E is messy. */
2512 while (ptr[1] == '\\' && ptr[2] == 'E')
2520 if (!inescq && ptr[1] == '-')
2524 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2526 /* If we hit \Q (not followed by \E) at this point, go into escaped
2529 while (*ptr == '\\' && ptr[1] == 'Q')
2532 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2537 if (*ptr == 0 || (!inescq && *ptr == ']'))
2540 goto LONE_SINGLE_CHARACTER;
2545 { /* Braces are required because the */
2546 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2550 d = *ptr; /* Not UTF-8 mode */
2552 /* The second part of a range can be a single-character escape, but
2553 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2554 in such circumstances. */
2556 if (!inescq && d == '\\')
2558 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2559 if (*errorcodeptr != 0) goto FAILED;
2561 /* \b is backspace; \X is literal X; \R is literal R; any other
2562 special means the '-' was literal */
2566 if (d == -ESC_b) d = '\b';
2567 else if (d == -ESC_X) d = 'X';
2568 else if (d == -ESC_R) d = 'R'; else
2571 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2576 /* Check that the two values are in the correct order. Optimize
2577 one-character ranges */
2581 *errorcodeptr = ERR8;
2585 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2587 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2588 matching, we have to use an XCLASS with extra data items. Caseless
2589 matching for characters > 127 is available only if UCP support is
2593 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2597 /* With UCP support, we can find the other case equivalents of
2598 the relevant characters. There may be several ranges. Optimize how
2599 they fit with the basic range. */
2602 if ((options & PCRE_CASELESS) != 0)
2604 unsigned int occ, ocd;
2605 unsigned int cc = c;
2606 unsigned int origd = d;
2607 while (get_othercase_range(&cc, origd, &occ, &ocd))
2609 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2611 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2612 { /* if there is overlap, */
2613 c = occ; /* noting that if occ < c */
2614 continue; /* we can't have ocd > d */
2615 } /* because a subrange is */
2616 if (ocd > d && occ <= d + 1) /* always shorter than */
2617 { /* the basic range. */
2624 *class_utf8data++ = XCL_SINGLE;
2628 *class_utf8data++ = XCL_RANGE;
2629 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2631 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2634 #endif /* SUPPORT_UCP */
2636 /* Now record the original range, possibly modified for UCP caseless
2637 overlapping ranges. */
2639 *class_utf8data++ = XCL_RANGE;
2640 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2641 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2643 /* With UCP support, we are done. Without UCP support, there is no
2644 caseless matching for UTF-8 characters > 127; we can use the bit map
2645 for the smaller ones. */
2648 continue; /* With next character in the class */
2650 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2652 /* Adjust upper limit and fall through to set up the map */
2656 #endif /* SUPPORT_UCP */
2658 #endif /* SUPPORT_UTF8 */
2660 /* We use the bit map for all cases when not in UTF-8 mode; else
2661 ranges that lie entirely within 0-127 when there is UCP support; else
2662 for partial ranges without UCP support. */
2664 class_charcount += d - c + 1;
2667 /* We can save a bit of time by skipping this in the pre-compile. */
2669 if (lengthptr == NULL) for (; c <= d; c++)
2671 classbits[c/8] |= (1 << (c&7));
2672 if ((options & PCRE_CASELESS) != 0)
2674 int uc = cd->fcc[c]; /* flip case */
2675 classbits[uc/8] |= (1 << (uc&7));
2679 continue; /* Go get the next char in the class */
2682 /* Handle a lone single character - we can get here for a normal
2683 non-escape char, or after \ that introduces a single character or for an
2684 apparent range that isn't. */
2686 LONE_SINGLE_CHARACTER:
2688 /* Handle a character that cannot go in the bit map */
2691 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2694 *class_utf8data++ = XCL_SINGLE;
2695 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2698 if ((options & PCRE_CASELESS) != 0)
2700 unsigned int othercase;
2701 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2703 *class_utf8data++ = XCL_SINGLE;
2704 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2707 #endif /* SUPPORT_UCP */
2711 #endif /* SUPPORT_UTF8 */
2713 /* Handle a single-byte character */
2715 classbits[c/8] |= (1 << (c&7));
2716 if ((options & PCRE_CASELESS) != 0)
2718 c = cd->fcc[c]; /* flip case */
2719 classbits[c/8] |= (1 << (c&7));
2726 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2728 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2730 if (c == 0) /* Missing terminating ']' */
2732 *errorcodeptr = ERR6;
2736 /* If class_charcount is 1, we saw precisely one character whose value is
2737 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2738 can optimize the negative case only if there were no characters >= 128
2739 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2740 single-bytes only. This is an historical hangover. Maybe one day we can
2741 tidy these opcodes to handle multi-byte characters.
2743 The optimization throws away the bit map. We turn the item into a
2744 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2745 that OP_NOT does not support multibyte characters. In the positive case, it
2746 can cause firstbyte to be set. Otherwise, there can be no first char if
2747 this item is first, whatever repeat count may follow. In the case of
2748 reqbyte, save the previous value for reinstating. */
2751 if (class_charcount == 1 &&
2753 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2756 if (class_charcount == 1)
2759 zeroreqbyte = reqbyte;
2761 /* The OP_NOT opcode works on one-byte characters only. */
2765 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2766 zerofirstbyte = firstbyte;
2768 *code++ = class_lastchar;
2772 /* For a single, positive character, get the value into mcbuffer, and
2773 then we can handle this with the normal one-character code. */
2776 if (utf8 && class_lastchar > 127)
2777 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2781 mcbuffer[0] = class_lastchar;
2785 } /* End of 1-char optimization */
2787 /* The general case - not the one-char optimization. If this is the first
2788 thing in the branch, there can be no first char setting, whatever the
2789 repeat count. Any reqbyte setting must remain unchanged after any kind of
2792 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2793 zerofirstbyte = firstbyte;
2794 zeroreqbyte = reqbyte;
2796 /* If there are characters with values > 255, we have to compile an
2797 extended class, with its own opcode. If there are no characters < 256,
2798 we can omit the bitmap in the actual compiled code. */
2803 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2804 *code++ = OP_XCLASS;
2806 *code = negate_class? XCL_NOT : 0;
2808 /* If the map is required, move up the extra data to make room for it;
2809 otherwise just move the code pointer to the end of the extra data. */
2811 if (class_charcount > 0)
2814 memmove(code + 32, code, class_utf8data - code);
2815 memcpy(code, classbits, 32);
2816 code = class_utf8data + 32;
2818 else code = class_utf8data;
2820 /* Now fill in the complete length of the item */
2822 PUT(previous, 1, code - previous);
2823 break; /* End of class handling */
2827 /* If there are no characters > 255, negate the 32-byte map if necessary,
2828 and copy it into the code vector. If this is the first thing in the branch,
2829 there can be no first char setting, whatever the repeat count. Any reqbyte
2830 setting must remain unchanged after any kind of repeat. */
2834 *code++ = OP_NCLASS;
2835 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2836 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2841 memcpy(code, classbits, 32);
2847 /* ===================================================================*/
2848 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2849 has been tested above. */
2852 if (!is_quantifier) goto NORMAL_CHAR;
2853 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2854 if (*errorcodeptr != 0) goto FAILED;
2872 if (previous == NULL)
2874 *errorcodeptr = ERR9;
2878 if (repeat_min == 0)
2880 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2881 reqbyte = zeroreqbyte; /* Ditto */
2884 /* Remember whether this is a variable length repeat */
2886 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2888 op_type = 0; /* Default single-char op codes */
2889 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2891 /* Save start of previous item, in case we have to move it up to make space
2892 for an inserted OP_ONCE for the additional '+' extension. */
2894 tempcode = previous;
2896 /* If the next character is '+', we have a possessive quantifier. This
2897 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2898 If the next character is '?' this is a minimizing repeat, by default,
2899 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2900 repeat type to the non-default. */
2904 repeat_type = 0; /* Force greedy */
2905 possessive_quantifier = TRUE;
2908 else if (ptr[1] == '?')
2910 repeat_type = greedy_non_default;
2913 else repeat_type = greedy_default;
2915 /* If previous was a character match, abolish the item and generate a
2916 repeat item instead. If a char item has a minimum of more than one, ensure
2917 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2918 the first thing in a branch because the x will have gone into firstbyte
2921 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2923 /* Deal with UTF-8 characters that take up more than one byte. It's
2924 easier to write this out separately than try to macrify it. Use c to
2925 hold the length of the character in bytes, plus 0x80 to flag that it's a
2926 length rather than a small character. */
2929 if (utf8 && (code[-1] & 0x80) != 0)
2931 uschar *lastchar = code - 1;
2932 while((*lastchar & 0xc0) == 0x80) lastchar--;
2933 c = code - lastchar; /* Length of UTF-8 character */
2934 memcpy(utf8_char, lastchar, c); /* Save the char */
2935 c |= 0x80; /* Flag c as a length */
2940 /* Handle the case of a single byte - either with no UTF8 support, or
2941 with UTF-8 disabled, or for a UTF-8 character < 128. */
2945 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2948 /* If the repetition is unlimited, it pays to see if the next thing on
2949 the line is something that cannot possibly match this character. If so,
2950 automatically possessifying this item gains some performance in the case
2951 where the match fails. */
2953 if (!possessive_quantifier &&
2955 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
2958 repeat_type = 0; /* Force greedy */
2959 possessive_quantifier = TRUE;
2962 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2965 /* If previous was a single negated character ([^a] or similar), we use
2966 one of the special opcodes, replacing it. The code is shared with single-
2967 character repeats by setting op_type to add a suitable offset into
2968 repeat_type. We can also test for auto-possessification. OP_NOT is
2969 currently used only for single-byte chars. */
2971 else if (*previous == OP_NOT)
2973 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2975 if (!possessive_quantifier &&
2977 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
2979 repeat_type = 0; /* Force greedy */
2980 possessive_quantifier = TRUE;
2982 goto OUTPUT_SINGLE_REPEAT;
2985 /* If previous was a character type match (\d or similar), abolish it and
2986 create a suitable repeat item. The code is shared with single-character
2987 repeats by setting op_type to add a suitable offset into repeat_type. Note
2988 that the Unicode property types will be present only when SUPPORT_UCP is
2989 defined, but we don't wrap the little bits of code here because it just
2990 makes it horribly messy. */
2992 else if (*previous < OP_EODN)
2995 int prop_type, prop_value;
2996 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2999 if (!possessive_quantifier &&
3001 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3003 repeat_type = 0; /* Force greedy */
3004 possessive_quantifier = TRUE;
3007 OUTPUT_SINGLE_REPEAT:
3008 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3010 prop_type = previous[1];
3011 prop_value = previous[2];
3013 else prop_type = prop_value = -1;
3016 code = previous; /* Usually overwrite previous item */
3018 /* If the maximum is zero then the minimum must also be zero; Perl allows
3019 this case, so we do too - by simply omitting the item altogether. */
3021 if (repeat_max == 0) goto END_REPEAT;
3023 /* All real repeats make it impossible to handle partial matching (maybe
3024 one day we will be able to remove this restriction). */
3026 if (repeat_max != 1) cd->nopartial = TRUE;
3028 /* Combine the op_type with the repeat_type */
3030 repeat_type += op_type;
3032 /* A minimum of zero is handled either as the special case * or ?, or as
3033 an UPTO, with the maximum given. */
3035 if (repeat_min == 0)
3037 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3038 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3041 *code++ = OP_UPTO + repeat_type;
3042 PUT2INC(code, 0, repeat_max);
3046 /* A repeat minimum of 1 is optimized into some special cases. If the
3047 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3048 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3049 one less than the maximum. */
3051 else if (repeat_min == 1)
3053 if (repeat_max == -1)
3054 *code++ = OP_PLUS + repeat_type;
3057 code = oldcode; /* leave previous item in place */
3058 if (repeat_max == 1) goto END_REPEAT;
3059 *code++ = OP_UPTO + repeat_type;
3060 PUT2INC(code, 0, repeat_max - 1);
3064 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3065 handled as an EXACT followed by an UPTO. */
3069 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3070 PUT2INC(code, 0, repeat_min);
3072 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3073 we have to insert the character for the previous code. For a repeated
3074 Unicode property match, there are two extra bytes that define the
3075 required property. In UTF-8 mode, long characters have their length in
3076 c, with the 0x80 bit as a flag. */
3081 if (utf8 && c >= 128)
3083 memcpy(code, utf8_char, c & 7);
3092 *code++ = prop_type;
3093 *code++ = prop_value;
3096 *code++ = OP_STAR + repeat_type;
3099 /* Else insert an UPTO if the max is greater than the min, again
3100 preceded by the character, for the previously inserted code. If the
3101 UPTO is just for 1 instance, we can use QUERY instead. */
3103 else if (repeat_max != repeat_min)
3106 if (utf8 && c >= 128)
3108 memcpy(code, utf8_char, c & 7);
3116 *code++ = prop_type;
3117 *code++ = prop_value;
3119 repeat_max -= repeat_min;
3121 if (repeat_max == 1)
3123 *code++ = OP_QUERY + repeat_type;
3127 *code++ = OP_UPTO + repeat_type;
3128 PUT2INC(code, 0, repeat_max);
3133 /* The character or character type itself comes last in all cases. */
3136 if (utf8 && c >= 128)
3138 memcpy(code, utf8_char, c & 7);
3145 /* For a repeated Unicode property match, there are two extra bytes that
3146 define the required property. */
3151 *code++ = prop_type;
3152 *code++ = prop_value;
3157 /* If previous was a character class or a back reference, we put the repeat
3158 stuff after it, but just skip the item if the repeat was {0,0}. */
3160 else if (*previous == OP_CLASS ||
3161 *previous == OP_NCLASS ||
3163 *previous == OP_XCLASS ||
3165 *previous == OP_REF)
3167 if (repeat_max == 0)
3173 /* All real repeats make it impossible to handle partial matching (maybe
3174 one day we will be able to remove this restriction). */
3176 if (repeat_max != 1) cd->nopartial = TRUE;
3178 if (repeat_min == 0 && repeat_max == -1)
3179 *code++ = OP_CRSTAR + repeat_type;
3180 else if (repeat_min == 1 && repeat_max == -1)
3181 *code++ = OP_CRPLUS + repeat_type;
3182 else if (repeat_min == 0 && repeat_max == 1)
3183 *code++ = OP_CRQUERY + repeat_type;
3186 *code++ = OP_CRRANGE + repeat_type;
3187 PUT2INC(code, 0, repeat_min);
3188 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3189 PUT2INC(code, 0, repeat_max);
3193 /* If previous was a bracket group, we may have to replicate it in certain
3196 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3197 *previous == OP_ONCE || *previous == OP_COND)
3201 int len = code - previous;
3202 uschar *bralink = NULL;
3204 /* Repeating a DEFINE group is pointless */
3206 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3208 *errorcodeptr = ERR55;
3212 /* This is a paranoid check to stop integer overflow later on */
3214 if (len > MAX_DUPLENGTH)
3216 *errorcodeptr = ERR50;
3220 /* If the maximum repeat count is unlimited, find the end of the bracket
3221 by scanning through from the start, and compute the offset back to it
3222 from the current code pointer. There may be an OP_OPT setting following
3223 the final KET, so we can't find the end just by going back from the code
3226 if (repeat_max == -1)
3228 register uschar *ket = previous;
3229 do ket += GET(ket, 1); while (*ket != OP_KET);
3230 ketoffset = code - ket;
3233 /* The case of a zero minimum is special because of the need to stick
3234 OP_BRAZERO in front of it, and because the group appears once in the
3235 data, whereas in other cases it appears the minimum number of times. For
3236 this reason, it is simplest to treat this case separately, as otherwise
3237 the code gets far too messy. There are several special subcases when the
3240 if (repeat_min == 0)
3242 /* If the maximum is also zero, we just omit the group from the output
3245 if (repeat_max == 0)
3251 /* If the maximum is 1 or unlimited, we just have to stick in the
3252 BRAZERO and do no more at this point. However, we do need to adjust
3253 any OP_RECURSE calls inside the group that refer to the group itself or
3254 any internal or forward referenced group, because the offset is from
3255 the start of the whole regex. Temporarily terminate the pattern while
3258 if (repeat_max <= 1)
3261 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3262 memmove(previous+1, previous, len);
3264 *previous++ = OP_BRAZERO + repeat_type;
3267 /* If the maximum is greater than 1 and limited, we have to replicate
3268 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3269 The first one has to be handled carefully because it's the original
3270 copy, which has to be moved up. The remainder can be handled by code
3271 that is common with the non-zero minimum case below. We have to
3272 adjust the value of repeat_max, since one less copy is required. Once
3273 again, we may have to adjust any OP_RECURSE calls inside the group. */
3279 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3280 memmove(previous + 2 + LINK_SIZE, previous, len);
3281 code += 2 + LINK_SIZE;
3282 *previous++ = OP_BRAZERO + repeat_type;
3283 *previous++ = OP_BRA;
3285 /* We chain together the bracket offset fields that have to be
3286 filled in later when the ends of the brackets are reached. */
3288 offset = (bralink == NULL)? 0 : previous - bralink;
3290 PUTINC(previous, 0, offset);
3296 /* If the minimum is greater than zero, replicate the group as many
3297 times as necessary, and adjust the maximum to the number of subsequent
3298 copies that we need. If we set a first char from the group, and didn't
3299 set a required char, copy the latter from the former. If there are any
3300 forward reference subroutine calls in the group, there will be entries on
3301 the workspace list; replicate these with an appropriate increment. */
3307 /* In the pre-compile phase, we don't actually do the replication. We
3308 just adjust the length as if we had. */
3310 if (lengthptr != NULL)
3311 *lengthptr += (repeat_min - 1)*length_prevgroup;
3313 /* This is compiling for real */
3317 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3318 for (i = 1; i < repeat_min; i++)
3321 uschar *this_hwm = cd->hwm;
3322 memcpy(code, previous, len);
3323 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3325 PUT(cd->hwm, 0, GET(hc, 0) + len);
3326 cd->hwm += LINK_SIZE;
3328 save_hwm = this_hwm;
3334 if (repeat_max > 0) repeat_max -= repeat_min;
3337 /* This code is common to both the zero and non-zero minimum cases. If
3338 the maximum is limited, it replicates the group in a nested fashion,
3339 remembering the bracket starts on a stack. In the case of a zero minimum,
3340 the first one was set up above. In all cases the repeat_max now specifies
3341 the number of additional copies needed. Again, we must remember to
3342 replicate entries on the forward reference list. */
3344 if (repeat_max >= 0)
3346 /* In the pre-compile phase, we don't actually do the replication. We
3347 just adjust the length as if we had. For each repetition we must add 1
3348 to the length for BRAZERO and for all but the last repetition we must
3349 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3351 if (lengthptr != NULL && repeat_max > 0)
3352 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3353 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3355 /* This is compiling for real */
3357 else for (i = repeat_max - 1; i >= 0; i--)
3360 uschar *this_hwm = cd->hwm;
3362 *code++ = OP_BRAZERO + repeat_type;
3364 /* All but the final copy start a new nesting, maintaining the
3365 chain of brackets outstanding. */
3371 offset = (bralink == NULL)? 0 : code - bralink;
3373 PUTINC(code, 0, offset);
3376 memcpy(code, previous, len);
3377 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3379 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3380 cd->hwm += LINK_SIZE;
3382 save_hwm = this_hwm;
3386 /* Now chain through the pending brackets, and fill in their length
3387 fields (which are holding the chain links pro tem). */
3389 while (bralink != NULL)
3392 int offset = code - bralink + 1;
3393 uschar *bra = code - offset;
3394 oldlinkoffset = GET(bra, 1);
3395 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3397 PUTINC(code, 0, offset);
3398 PUT(bra, 1, offset);
3402 /* If the maximum is unlimited, set a repeater in the final copy. We
3403 can't just offset backwards from the current code point, because we
3404 don't know if there's been an options resetting after the ket. The
3405 correct offset was computed above.
3407 Then, when we are doing the actual compile phase, check to see whether
3408 this group is a non-atomic one that could match an empty string. If so,
3409 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3410 that runtime checking can be done. [This check is also applied to
3411 atomic groups at runtime, but in a different way.] */
3415 uschar *ketcode = code - ketoffset;
3416 uschar *bracode = ketcode - GET(ketcode, 1);
3417 *ketcode = OP_KETRMAX + repeat_type;
3418 if (lengthptr == NULL && *bracode != OP_ONCE)
3420 uschar *scode = bracode;
3423 if (could_be_empty_branch(scode, ketcode, utf8))
3425 *bracode += OP_SBRA - OP_BRA;
3428 scode += GET(scode, 1);
3430 while (*scode == OP_ALT);
3435 /* Else there's some kind of shambles */
3439 *errorcodeptr = ERR11;
3443 /* If the character following a repeat is '+', or if certain optimization
3444 tests above succeeded, possessive_quantifier is TRUE. For some of the
3445 simpler opcodes, there is a special alternative opcode for this. For
3446 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3447 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3448 but the special opcodes can optimize it a bit. The repeated item starts at
3449 tempcode, not at previous, which might be the first part of a string whose
3450 (former) last char we repeated.
3452 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3453 an 'upto' may follow. We skip over an 'exact' item, and then test the
3454 length of what remains before proceeding. */
3456 if (possessive_quantifier)
3459 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3460 *tempcode == OP_NOTEXACT)
3461 tempcode += _pcre_OP_lengths[*tempcode];
3462 len = code - tempcode;
3463 if (len > 0) switch (*tempcode)
3465 case OP_STAR: *tempcode = OP_POSSTAR; break;
3466 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3467 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3468 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3470 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3471 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3472 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3473 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3475 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3476 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3477 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3478 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3481 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3482 code += 1 + LINK_SIZE;
3483 len += 1 + LINK_SIZE;
3484 tempcode[0] = OP_ONCE;
3486 PUTINC(code, 0, len);
3487 PUT(tempcode, 1, len);
3492 /* In all cases we no longer have a previous item. We also set the
3493 "follows varying string" flag for subsequently encountered reqbytes if
3494 it isn't already set and we have just passed a varying length item. */
3498 cd->req_varyopt |= reqvary;
3502 /* ===================================================================*/
3503 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3504 lookbehind or option setting or condition or all the other extended
3505 parenthesis forms. First deal with the specials; all are introduced by ?,
3506 and the appearance of any of them means that this is not a capturing
3510 newoptions = options;
3515 if (*(++ptr) == '?')
3517 int i, set, unset, namelen;
3524 case '#': /* Comment; skip to ket */
3526 while (*ptr != 0 && *ptr != ')') ptr++;
3529 *errorcodeptr = ERR18;
3535 /* ------------------------------------------------------------ */
3536 case ':': /* Non-capturing bracket */
3542 /* ------------------------------------------------------------ */
3544 bravalue = OP_COND; /* Conditional group */
3546 /* A condition can be an assertion, a number (referring to a numbered
3547 group), a name (referring to a named group), or 'R', referring to
3548 recursion. R<digits> and R&name are also permitted for recursion tests.
3550 There are several syntaxes for testing a named group: (?(name)) is used
3551 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3553 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3554 be the recursive thing or the name 'R' (and similarly for 'R' followed
3555 by digits), and (b) a number could be a name that consists of digits.
3556 In both cases, we look for a name first; if not found, we try the other
3559 /* For conditions that are assertions, check the syntax, and then exit
3560 the switch. This will take control down to where bracketed groups,
3561 including assertions, are processed. */
3563 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3566 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3567 below), and all need to skip 3 bytes at the start of the group. */
3569 code[1+LINK_SIZE] = OP_CREF;
3572 /* Check for a test for recursion in a named group. */
3574 if (ptr[1] == 'R' && ptr[2] == '&')
3578 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3581 /* Check for a test for a named group's having been set, using the Perl
3582 syntax (?(<name>) or (?('name') */
3584 else if (ptr[1] == '<')
3589 else if (ptr[1] == '\'')
3594 else terminator = 0;
3596 /* We now expect to read a name; anything else is an error */
3598 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3600 ptr += 1; /* To get the right offset */
3601 *errorcodeptr = ERR28;
3605 /* Read the name, but also get it as a number if it's all digits */
3609 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3612 recno = (g_ascii_isdigit(*ptr) != 0)?
3613 recno * 10 + *ptr - '0' : -1;
3616 namelen = ptr - name;
3618 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3620 ptr--; /* Error offset */
3621 *errorcodeptr = ERR26;
3625 /* Do no further checking in the pre-compile phase. */
3627 if (lengthptr != NULL) break;
3629 /* In the real compile we do the work of looking for the actual
3632 slot = cd->name_table;
3633 for (i = 0; i < cd->names_found; i++)
3635 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3636 slot += cd->name_entry_size;
3639 /* Found a previous named subpattern */
3641 if (i < cd->names_found)
3643 recno = GET2(slot, 0);
3644 PUT2(code, 2+LINK_SIZE, recno);
3647 /* Search the pattern for a forward reference */
3649 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3650 (options & PCRE_EXTENDED) != 0)) > 0)
3652 PUT2(code, 2+LINK_SIZE, i);
3655 /* If terminator == 0 it means that the name followed directly after
3656 the opening parenthesis [e.g. (?(abc)...] and in this case there are
3657 some further alternatives to try. For the cases where terminator != 0
3658 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3659 now checked all the possibilities, so give an error. */
3661 else if (terminator != 0)
3663 *errorcodeptr = ERR15;
3667 /* Check for (?(R) for recursion. Allow digits after R to specify a
3668 specific group number. */
3670 else if (*name == 'R')
3673 for (i = 1; i < namelen; i++)
3675 if (g_ascii_isdigit(name[i]) == 0)
3677 *errorcodeptr = ERR15;
3680 recno = recno * 10 + name[i] - '0';
3682 if (recno == 0) recno = RREF_ANY;
3683 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3684 PUT2(code, 2+LINK_SIZE, recno);
3687 /* Similarly, check for the (?(DEFINE) "condition", which is always
3690 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3692 code[1+LINK_SIZE] = OP_DEF;
3696 /* Check for the "name" actually being a subpattern number. */
3700 PUT2(code, 2+LINK_SIZE, recno);
3703 /* Either an unidentified subpattern, or a reference to (?(0) */
3707 *errorcodeptr = (recno == 0)? ERR35: ERR15;
3713 /* ------------------------------------------------------------ */
3714 case '=': /* Positive lookahead */
3715 bravalue = OP_ASSERT;
3720 /* ------------------------------------------------------------ */
3721 case '!': /* Negative lookahead */
3722 bravalue = OP_ASSERT_NOT;
3727 /* ------------------------------------------------------------ */
3728 case '<': /* Lookbehind or named define */
3731 case '=': /* Positive lookbehind */
3732 bravalue = OP_ASSERTBACK;
3736 case '!': /* Negative lookbehind */
3737 bravalue = OP_ASSERTBACK_NOT;
3741 default: /* Could be name define, else bad */
3742 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3743 ptr++; /* Correct offset for error */
3744 *errorcodeptr = ERR24;
3750 /* ------------------------------------------------------------ */
3751 case '>': /* One-time brackets */
3757 /* ------------------------------------------------------------ */
3758 case 'C': /* Callout - may be followed by digits; */
3759 previous_callout = code; /* Save for later completion */
3760 after_manual_callout = 1; /* Skip one item before completing */
3761 *code++ = OP_CALLOUT;
3764 while (g_ascii_isdigit(*(++ptr)) != 0)
3765 n = n * 10 + *ptr - '0';
3768 *errorcodeptr = ERR39;
3773 *errorcodeptr = ERR38;
3777 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3778 PUT(code, LINK_SIZE, 0); /* Default length */
3779 code += 2 * LINK_SIZE;
3785 /* ------------------------------------------------------------ */
3786 case 'P': /* Python-style named subpattern handling */
3787 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3789 is_recurse = *ptr == '>';
3791 goto NAMED_REF_OR_RECURSE;
3793 else if (*ptr != '<') /* Test for Python-style definition */
3795 *errorcodeptr = ERR41;
3798 /* Fall through to handle (?P< as (?< is handled */
3801 /* ------------------------------------------------------------ */
3802 DEFINE_NAME: /* Come here from (?< handling */
3805 terminator = (*ptr == '<')? '>' : '\'';
3808 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3809 namelen = ptr - name;
3811 /* In the pre-compile phase, just do a syntax check. */
3813 if (lengthptr != NULL)
3815 if (*ptr != terminator)
3817 *errorcodeptr = ERR42;
3820 if (cd->names_found >= MAX_NAME_COUNT)
3822 *errorcodeptr = ERR49;
3825 if (namelen + 3 > cd->name_entry_size)
3827 cd->name_entry_size = namelen + 3;
3828 if (namelen > MAX_NAME_SIZE)
3830 *errorcodeptr = ERR48;
3836 /* In the real compile, create the entry in the table */
3840 slot = cd->name_table;
3841 for (i = 0; i < cd->names_found; i++)
3843 int crc = memcmp(name, slot+2, namelen);
3846 if (slot[2+namelen] == 0)
3848 if ((options & PCRE_DUPNAMES) == 0)
3850 *errorcodeptr = ERR43;
3854 else crc = -1; /* Current name is substring */
3858 memmove(slot + cd->name_entry_size, slot,
3859 (cd->names_found - i) * cd->name_entry_size);
3862 slot += cd->name_entry_size;
3865 PUT2(slot, 0, cd->bracount + 1);
3866 memcpy(slot + 2, name, namelen);
3867 slot[2+namelen] = 0;
3871 /* In both cases, count the number of names we've encountered. */
3873 ptr++; /* Move past > or ' */
3875 goto NUMBERED_GROUP;
3878 /* ------------------------------------------------------------ */
3879 case '&': /* Perl recursion/subroutine syntax */
3884 /* We come here from the Python syntax above that handles both
3885 references (?P=name) and recursion (?P>name), as well as falling
3886 through from the Perl recursion syntax (?&name). */
3888 NAMED_REF_OR_RECURSE:
3890 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3891 namelen = ptr - name;
3893 /* In the pre-compile phase, do a syntax check and set a dummy
3894 reference number. */
3896 if (lengthptr != NULL)
3898 if (*ptr != terminator)
3900 *errorcodeptr = ERR42;
3903 if (namelen > MAX_NAME_SIZE)
3905 *errorcodeptr = ERR48;
3911 /* In the real compile, seek the name in the table */
3915 slot = cd->name_table;
3916 for (i = 0; i < cd->names_found; i++)
3918 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3919 slot += cd->name_entry_size;
3922 if (i < cd->names_found) /* Back reference */
3924 recno = GET2(slot, 0);
3926 else if ((recno = /* Forward back reference */
3927 find_parens(ptr, cd->bracount, name, namelen,
3928 (options & PCRE_EXTENDED) != 0)) <= 0)
3930 *errorcodeptr = ERR15;
3935 /* In both phases, we can now go to the code that handles numerical
3936 recursion or backreferences. */
3938 if (is_recurse) goto HANDLE_RECURSION;
3939 else goto HANDLE_REFERENCE;
3942 /* ------------------------------------------------------------ */
3943 case 'R': /* Recursion */
3944 ptr++; /* Same as (?0) */
3948 /* ------------------------------------------------------------ */
3949 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
3950 case '5': case '6': case '7': case '8': case '9': /* subroutine */
3952 const uschar *called;
3954 while(g_ascii_isdigit(*ptr) != 0)
3955 recno = recno * 10 + *ptr++ - '0';
3958 *errorcodeptr = ERR29;
3962 /* Come here from code above that handles a named recursion */
3967 called = cd->start_code;
3969 /* When we are actually compiling, find the bracket that is being
3970 referenced. Temporarily end the regex in case it doesn't exist before
3971 this point. If we end up with a forward reference, first check that
3972 the bracket does occur later so we can give the error (and position)
3973 now. Then remember this forward reference in the workspace so it can
3974 be filled in at the end. */
3976 if (lengthptr == NULL)
3979 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
3981 /* Forward reference */
3985 if (find_parens(ptr, cd->bracount, NULL, recno,
3986 (options & PCRE_EXTENDED) != 0) < 0)
3988 *errorcodeptr = ERR15;
3991 called = cd->start_code + recno;
3992 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
3995 /* If not a forward reference, and the subpattern is still open,
3996 this is a recursive call. We check to see if this is a left
3997 recursion that could loop for ever, and diagnose that case. */
3999 else if (GET(called, 1) == 0 &&
4000 could_be_empty(called, code, bcptr, utf8))
4002 *errorcodeptr = ERR40;
4007 /* Insert the recursion/subroutine item, automatically wrapped inside
4008 "once" brackets. Set up a "previous group" length so that a
4009 subsequent quantifier will work. */
4012 PUT(code, 1, 2 + 2*LINK_SIZE);
4013 code += 1 + LINK_SIZE;
4016 PUT(code, 1, called - cd->start_code);
4017 code += 1 + LINK_SIZE;
4020 PUT(code, 1, 2 + 2*LINK_SIZE);
4021 code += 1 + LINK_SIZE;
4023 length_prevgroup = 3 + 3*LINK_SIZE;
4026 /* Can't determine a first byte now */
4028 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4032 /* ------------------------------------------------------------ */
4033 default: /* Other characters: check option setting */
4037 while (*ptr != ')' && *ptr != ':')
4041 case '-': optset = &unset; break;
4043 case 'J': /* Record that it changed in the external options */
4044 *optset |= PCRE_DUPNAMES;
4045 cd->external_options |= PCRE_JCHANGED;
4048 case 'i': *optset |= PCRE_CASELESS; break;
4049 case 'm': *optset |= PCRE_MULTILINE; break;
4050 case 's': *optset |= PCRE_DOTALL; break;
4051 case 'x': *optset |= PCRE_EXTENDED; break;
4052 case 'U': *optset |= PCRE_UNGREEDY; break;
4053 case 'X': *optset |= PCRE_EXTRA; break;
4055 default: *errorcodeptr = ERR12;
4056 ptr--; /* Correct the offset */
4061 /* Set up the changed option bits, but don't change anything yet. */
4063 newoptions = (options | set) & (~unset);
4065 /* If the options ended with ')' this is not the start of a nested
4066 group with option changes, so the options change at this level. If this
4067 item is right at the start of the pattern, the options can be
4068 abstracted and made external in the pre-compile phase, and ignored in
4069 the compile phase. This can be helpful when matching -- for instance in
4070 caseless checking of required bytes.
4072 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4073 definitely *not* at the start of the pattern because something has been
4074 compiled. In the pre-compile phase, however, the code pointer can have
4075 that value after the start, because it gets reset as code is discarded
4076 during the pre-compile. However, this can happen only at top level - if
4077 we are within parentheses, the starting BRA will still be present. At
4078 any parenthesis level, the length value can be used to test if anything
4079 has been compiled at that level. Thus, a test for both these conditions
4080 is necessary to ensure we correctly detect the start of the pattern in
4083 If we are not at the pattern start, compile code to change the ims
4084 options if this setting actually changes any of them. We also pass the
4085 new setting back so that it can be put at the start of any following
4086 branches, and when this group ends (if we are in a group), a resetting
4087 item can be compiled. */
4091 if (code == cd->start_code + 1 + LINK_SIZE &&
4092 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4094 cd->external_options = newoptions;
4095 options = newoptions;
4099 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4102 *code++ = newoptions & PCRE_IMS;
4105 /* Change options at this level, and pass them back for use
4106 in subsequent branches. Reset the greedy defaults and the case
4107 value for firstbyte and reqbyte. */
4109 *optionsptr = options = newoptions;
4110 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4111 greedy_non_default = greedy_default ^ 1;
4112 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4115 previous = NULL; /* This item can't be repeated */
4116 continue; /* It is complete */
4119 /* If the options ended with ':' we are heading into a nested group
4120 with possible change of options. Such groups are non-capturing and are
4121 not assertions of any kind. All we need to do is skip over the ':';
4122 the newoptions value is handled below. */
4126 } /* End of switch for character following (? */
4127 } /* End of (? handling */
4129 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4130 all unadorned brackets become non-capturing and behave like (?:...)
4133 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4138 /* Else we have a capturing group. */
4144 PUT2(code, 1+LINK_SIZE, cd->bracount);
4148 /* Process nested bracketed regex. Assertions may not be repeated, but
4149 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4150 non-register variable in order to be able to pass its address because some
4151 compilers complain otherwise. Pass in a new setting for the ims options if
4152 they have changed. */
4154 previous = (bravalue >= OP_ONCE)? code : NULL;
4157 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4158 length_prevgroup = 0; /* Initialize for pre-compile phase */
4161 newoptions, /* The complete new option state */
4162 options & PCRE_IMS, /* The previous ims option state */
4163 &tempcode, /* Where to put code (updated) */
4164 &ptr, /* Input pointer (updated) */
4165 errorcodeptr, /* Where to put an error message */
4166 (bravalue == OP_ASSERTBACK ||
4167 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4168 skipbytes, /* Skip over bracket number */
4169 &subfirstbyte, /* For possible first char */
4170 &subreqbyte, /* For possible last char */
4171 bcptr, /* Current branch chain */
4172 cd, /* Tables block */
4173 (lengthptr == NULL)? NULL : /* Actual compile phase */
4174 &length_prevgroup /* Pre-compile phase */
4178 /* At the end of compiling, code is still pointing to the start of the
4179 group, while tempcode has been updated to point past the end of the group
4180 and any option resetting that may follow it. The pattern pointer (ptr)
4181 is on the bracket. */
4183 /* If this is a conditional bracket, check that there are no more than
4184 two branches in the group, or just one if it's a DEFINE group. */
4186 if (bravalue == OP_COND)
4195 while (*tc != OP_KET);
4197 /* A DEFINE group is never obeyed inline (the "condition" is always
4198 false). It must have only one branch. */
4200 if (code[LINK_SIZE+1] == OP_DEF)
4204 *errorcodeptr = ERR54;
4207 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4210 /* A "normal" conditional group. If there is just one branch, we must not
4211 make use of its firstbyte or reqbyte, because this is equivalent to an
4212 empty second branch. */
4218 *errorcodeptr = ERR27;
4221 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4225 /* Error if hit end of pattern */
4229 *errorcodeptr = ERR14;
4233 /* In the pre-compile phase, update the length by the length of the nested
4234 group, less the brackets at either end. Then reduce the compiled code to
4235 just the brackets so that it doesn't use much memory if it is duplicated by
4238 if (lengthptr != NULL)
4240 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4242 PUTINC(code, 0, 1 + LINK_SIZE);
4244 PUTINC(code, 0, 1 + LINK_SIZE);
4247 /* Otherwise update the main code pointer to the end of the group. */
4249 else code = tempcode;
4251 /* For a DEFINE group, required and first character settings are not
4254 if (bravalue == OP_DEF) break;
4256 /* Handle updating of the required and first characters for other types of
4257 group. Update for normal brackets of all kinds, and conditions with two
4258 branches (see code above). If the bracket is followed by a quantifier with
4259 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4260 zerofirstbyte outside the main loop so that they can be accessed for the
4263 zeroreqbyte = reqbyte;
4264 zerofirstbyte = firstbyte;
4265 groupsetfirstbyte = FALSE;
4267 if (bravalue >= OP_ONCE)
4269 /* If we have not yet set a firstbyte in this branch, take it from the
4270 subpattern, remembering that it was set here so that a repeat of more
4271 than one can replicate it as reqbyte if necessary. If the subpattern has
4272 no firstbyte, set "none" for the whole branch. In both cases, a zero
4273 repeat forces firstbyte to "none". */
4275 if (firstbyte == REQ_UNSET)
4277 if (subfirstbyte >= 0)
4279 firstbyte = subfirstbyte;
4280 groupsetfirstbyte = TRUE;
4282 else firstbyte = REQ_NONE;
4283 zerofirstbyte = REQ_NONE;
4286 /* If firstbyte was previously set, convert the subpattern's firstbyte
4287 into reqbyte if there wasn't one, using the vary flag that was in
4288 existence beforehand. */
4290 else if (subfirstbyte >= 0 && subreqbyte < 0)
4291 subreqbyte = subfirstbyte | tempreqvary;
4293 /* If the subpattern set a required byte (or set a first byte that isn't
4294 really the first byte - see above), set it. */
4296 if (subreqbyte >= 0) reqbyte = subreqbyte;
4299 /* For a forward assertion, we take the reqbyte, if set. This can be
4300 helpful if the pattern that follows the assertion doesn't set a different
4301 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4302 for an assertion, however, because it leads to an incorrect effect for patterns
4303 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4304 of a firstbyte. This is overcome by a scan at the end if there's no
4305 firstbyte, looking for an asserted first char. */
4307 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4308 break; /* End of processing '(' */
4311 /* ===================================================================*/
4312 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4313 are arranged to be the negation of the corresponding OP_values. For the
4314 back references, the values are ESC_REF plus the reference number. Only
4315 back references and those types that consume a character may be repeated.
4316 We can test for values between ESC_b and ESC_Z for the latter; this may
4317 have to change if any new ones are ever created. */
4321 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4322 if (*errorcodeptr != 0) goto FAILED;
4326 if (-c == ESC_Q) /* Handle start of quoted string */
4328 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4333 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4335 /* For metasequences that actually match a character, we disable the
4336 setting of a first character if it hasn't already been set. */
4338 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4339 firstbyte = REQ_NONE;
4341 /* Set values to reset to if this is followed by a zero repeat. */
4343 zerofirstbyte = firstbyte;
4344 zeroreqbyte = reqbyte;
4346 /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4348 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4351 terminator = (*(++ptr) == '<')? '>' : '\'';
4352 goto NAMED_REF_OR_RECURSE;
4355 /* Back references are handled specially; must disable firstbyte if
4356 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4361 recno = -c - ESC_REF;
4363 HANDLE_REFERENCE: /* Come here from named backref handling */
4364 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4367 PUT2INC(code, 0, recno);
4368 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4369 if (recno > cd->top_backref) cd->top_backref = recno;
4372 /* So are Unicode property matches, if supported. */
4375 else if (-c == ESC_P || -c == ESC_p)
4379 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4380 if (ptype < 0) goto FAILED;
4382 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4388 /* If Unicode properties are not supported, \X, \P, and \p are not
4391 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4393 *errorcodeptr = ERR45;
4398 /* For the rest (including \X when Unicode properties are supported), we
4399 can obtain the OP value by negating the escape value. */
4403 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4409 /* We have a data character whose value is in c. In UTF-8 mode it may have
4410 a value > 127. We set its representation in the length/buffer, and then
4411 handle it as a data character. */
4414 if (utf8 && c > 127)
4415 mclength = _pcre_ord2utf8(c, mcbuffer);
4426 /* ===================================================================*/
4427 /* Handle a literal character. It is guaranteed not to be whitespace or #
4428 when the extended flag is set. If we are in UTF-8 mode, it may be a
4429 multi-byte literal character. */
4437 if (utf8 && c >= 0xc0)
4439 while ((ptr[1] & 0xc0) == 0x80)
4440 mcbuffer[mclength++] = *(++ptr);
4444 /* At this point we have the character's bytes in mcbuffer, and the length
4445 in mclength. When not in UTF-8 mode, the length is always 1. */
4449 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4450 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4452 /* Set the first and required bytes appropriately. If no previous first
4453 byte, set it from this character, but revert to none on a zero repeat.
4454 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4457 if (firstbyte == REQ_UNSET)
4459 zerofirstbyte = REQ_NONE;
4460 zeroreqbyte = reqbyte;
4462 /* If the character is more than one byte long, we can set firstbyte
4463 only if it is not to be matched caselessly. */
4465 if (mclength == 1 || req_caseopt == 0)
4467 firstbyte = mcbuffer[0] | req_caseopt;
4468 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4470 else firstbyte = reqbyte = REQ_NONE;
4473 /* firstbyte was previously set; we can set reqbyte only if the length is
4474 1 or the matching is caseful. */
4478 zerofirstbyte = firstbyte;
4479 zeroreqbyte = reqbyte;
4480 if (mclength == 1 || req_caseopt == 0)
4481 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4484 break; /* End of literal character handling */
4486 } /* end of big loop */
4489 /* Control never reaches here by falling through, only by a goto for all the
4490 error states. Pass back the position in the pattern so that it can be displayed
4491 to the user for diagnosing the error. */
4501 /*************************************************
4502 * Compile sequence of alternatives *
4503 *************************************************/
4505 /* On entry, ptr is pointing past the bracket character, but on return it
4506 points to the closing bracket, or vertical bar, or end of string. The code
4507 variable is pointing at the byte into which the BRA operator has been stored.
4508 If the ims options are changed at the start (for a (?ims: group) or during any
4509 branch, we need to insert an OP_OPT item at the start of every following branch
4510 to ensure they get set correctly at run time, and also pass the new options
4511 into every subsequent branch compile.
4513 This function is used during the pre-compile phase when we are trying to find
4514 out the amount of memory needed, as well as during the real compile phase. The
4515 value of lengthptr distinguishes the two phases.
4518 options option bits, including any changes for this subpattern
4519 oldims previous settings of ims option bits
4520 codeptr -> the address of the current code pointer
4521 ptrptr -> the address of the current pattern pointer
4522 errorcodeptr -> pointer to error code variable
4523 lookbehind TRUE if this is a lookbehind assertion
4524 skipbytes skip this many bytes at start (for brackets and OP_COND)
4525 firstbyteptr place to put the first required character, or a negative number
4526 reqbyteptr place to put the last required character, or a negative number
4527 bcptr pointer to the chain of currently open branches
4528 cd points to the data block with tables pointers etc.
4529 lengthptr NULL during the real compile phase
4530 points to length accumulator during pre-compile phase
4532 Returns: TRUE on success
4536 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4537 int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4538 int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4540 const uschar *ptr = *ptrptr;
4541 uschar *code = *codeptr;
/* last_branch tracks the start of the branch being compiled: the bracket
itself at first, then each alternation item as it is inserted at the bottom of
the loop. */
4542 uschar *last_branch = code;
/* start_bracket stays at the start of the group; the length stored in the ket
is computed from it. */
4543 uschar *start_bracket = code;
/* reverse_count is pointed at the byte after OP_REVERSE when that item is
emitted, so that the fixed length can be stored there once it is known. */
4544 uschar *reverse_count = NULL;
4545 int firstbyte, reqbyte;
4546 int branchfirstbyte, branchreqbyte;
/* Neither byte is known yet; the first branch compiled supplies the initial
values. */
4553 firstbyte = reqbyte = REQ_UNSET;
4555 /* Accumulate the length for use in the pre-compile phase. Start with the
4556 length of the BRA and KET and any extra bytes that are required at the
4557 beginning. We accumulate in a local variable to save frequent testing of
4558 lengthptr for NULL. We cannot do this by looking at the value of code at the
4559 start and end of each alternative, because compiled items are discarded during
4560 the pre-compile phase so that the work space is not exceeded. */
4562 length = 2 + 2*LINK_SIZE + skipbytes;
4564 /* WARNING: If the above line is changed for any reason, you must also change
4565 the code that abstracts option settings at the start of the pattern and makes
4566 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4567 pre-compile phase to find out whether anything has yet been compiled or not. */
4569 /* Offset is set zero to mark that this bracket is still open */
4572 code += 1 + LINK_SIZE + skipbytes;
4574 /* Loop for each alternative branch */
4578 /* Handle a change of ims options at the start of the branch */
4580 if ((options & PCRE_IMS) != oldims)
4583 *code++ = options & PCRE_IMS;
4587 /* Set up dummy OP_REVERSE if lookbehind assertion */
4591 *code++ = OP_REVERSE;
4592 reverse_count = code;
4594 length += 1 + LINK_SIZE;
4597 /* Now compile the branch; in the pre-compile phase its length gets added
4600 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4601 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4607 /* In the real compile phase, there is some post-processing to be done. */
4609 if (lengthptr == NULL)
4611 /* If this is the first branch, the firstbyte and reqbyte values for the
4612 branch become the values for the regex. */
4614 if (*last_branch != OP_ALT)
4616 firstbyte = branchfirstbyte;
4617 reqbyte = branchreqbyte;
4620 /* If this is not the first branch, the first char and reqbyte have to
4621 match the values from all the previous branches, except that if the
4622 previous value for reqbyte didn't have REQ_VARY set, it can still match,
4623 and we set REQ_VARY for the regex. */
4627 /* If we previously had a firstbyte, but it doesn't match the new branch,
4628 we have to abandon the firstbyte for the regex, but if there was
4629 previously no reqbyte, it takes on the value of the old firstbyte. */
4631 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4633 if (reqbyte < 0) reqbyte = firstbyte;
4634 firstbyte = REQ_NONE;
4637 /* If we (now or from before) have no firstbyte, a firstbyte from the
4638 branch becomes a reqbyte if there isn't a branch reqbyte. */
4640 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4641 branchreqbyte = branchfirstbyte;
4643 /* Now ensure that the reqbytes match */
4645 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4647 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4650 /* If lookbehind, check that this branch matches a fixed-length string, and
4651 put the length into the OP_REVERSE item. Temporarily mark the end of the
4652 branch with OP_END. */
4658 fixed_length = find_fixedlength(last_branch, options);
4659 DPRINTF(("fixed length = %d\n", fixed_length));
4660 if (fixed_length < 0)
4662 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4666 PUT(reverse_count, 0, fixed_length);
4670 /* Reached end of expression, either ')' or end of pattern. Go back through
4671 the alternative branches and reverse the chain of offsets, with the field in
4672 the BRA item now becoming an offset to the first alternative. If there are
4673 no alternatives, it points to the end of the group. The length in the
4674 terminating ket is always the length of the whole bracketed item. If any of
4675 the ims options were changed inside the group, compile a resetting op-code
4676 following, except at the very end of the pattern. Return leaving the pointer
4677 at the terminating char. */
4681 int branch_length = code - last_branch;
/* Each pass stores the forward offset in this branch's link field and then
steps back by the backward offset that was previously held there. */
4684 int prev_length = GET(last_branch, 1);
4685 PUT(last_branch, 1, branch_length);
4686 branch_length = prev_length;
4687 last_branch -= branch_length;
4689 while (branch_length > 0);
4691 /* Fill in the ket */
4694 PUT(code, 1, code - start_bracket);
4695 code += 1 + LINK_SIZE;
4697 /* Resetting option if needed */
4699 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4706 /* Set values to pass back */
4710 *firstbyteptr = firstbyte;
4711 *reqbyteptr = reqbyte;
/* Pre-compile phase only: add this group's accumulated length to the caller's
running total. */
4712 if (lengthptr != NULL) *lengthptr += length;
4716 /* Another branch follows; insert an "or" node. Its length field points back
4717 to the previous branch while the bracket remains open. At the end the chain
4718 is reversed. It's done like this so that the start of the bracket has a
4719 zero offset until it is closed, making it possible to detect recursion. */
4722 PUT(code, 1, code - last_branch);
4723 bc.current = last_branch = code;
4724 code += 1 + LINK_SIZE;
4726 length += 1 + LINK_SIZE;
4728 /* Control never reaches here */
4734 /*************************************************
4735 * Check for anchored expression *
4736 *************************************************/
4738 /* Try to find out if this is an anchored regular expression. Consider each
4739 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4740 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4741 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4742 counts, since OP_CIRC can match in the middle.
4744 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4745 This is the code for \G, which means "match at start of match position, taking
4746 into account the match offset".
4748 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4749 because that will try the rest of the pattern at all possible matching points,
4750 so there is no point trying again.... er ....
4752 .... except when the .* appears inside capturing parentheses, and there is a
4753 subsequent back reference to those parentheses. We haven't enough information
4754 to catch that case precisely.
4756 At first, the best we could do was to detect when .* was in capturing brackets
4757 and the highest back reference was greater than or equal to that level.
4758 However, by keeping a bitmap of the first 31 back references, we can catch some
4759 of the more common cases more precisely.
4762 code points to start of expression (the bracket)
4763 options points to the options setting
4764 bracket_map a bitmap of which brackets we are inside while testing; this
4765 handles up to substring 31; after that we just have to take
4766 the less precise approach
4767 backref_map the back reference bitmap
4769 Returns: TRUE or FALSE
4773 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4774 unsigned int backref_map)
/* Each alternative of the group at "code" is examined in turn (see the loop
condition at the end). The visible checks return FALSE as soon as a nested
group, or a leading dot-star item, fails to qualify. */
/* scode is the first opcode of this alternative that first_significant_code()
reports; presumably that helper skips items that cannot affect anchoring -
confirm against its definition. */
4777 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4778 options, PCRE_MULTILINE, FALSE);
4779 register int op = *scode;
4781 /* Non-capturing brackets */
4785 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4788 /* Capturing brackets */
4790 else if (op == OP_CBRA)
4792 int n = GET2(scode, 1+LINK_SIZE);
/* Record that we are now inside capturing bracket n. Bracket numbers of 32 or
more all share bit 0, which is the less precise approach mentioned for
bracket_map.
NOTE(review): when n is 31, (1 << n) shifts a signed int into its sign bit on
platforms where int is 32 bits wide; an unsigned constant may be intended -
confirm. */
4793 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4794 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4797 /* Other brackets */
4799 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4801 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4804 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4805 are or may be referenced. */
4807 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4808 op == OP_TYPEPOSSTAR) &&
4809 (*options & PCRE_DOTALL) != 0)
/* The repeated item must be OP_ANY, and none of the enclosing capturing
brackets may be the target of a back reference. */
4811 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4814 /* Check for explicit anchoring */
4816 else if (op != OP_SOD && op != OP_SOM &&
4817 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
/* Follow the link stored after the opcode to the next alternative, or to the
end of the group. */
4819 code += GET(code, 1);
4821 while (*code == OP_ALT); /* Loop for each alternative */
4827 /*************************************************
4828 * Check for starting with ^ or .* *
4829 *************************************************/
4831 /* This is called to find out if every branch starts with ^ or .* so that
4832 "first char" processing can be done to speed things up in multiline
4833 matching and for non-DOTALL patterns that start with .* (which must start at
4834 the beginning or after \n). As in the case of is_anchored() (see above), we
4835 have to take account of back references to capturing brackets that contain .*
4836 because in that case we can't make the assumption.
4839 code points to start of expression (the bracket)
4840 bracket_map a bitmap of which brackets we are inside while testing; this
4841 handles up to substring 31; after that we just have to take
4842 the less precise approach
4843 backref_map the back reference bitmap
4845 Returns: TRUE or FALSE
4849 is_startline(const uschar *code, unsigned int bracket_map,
4850 unsigned int backref_map)
/* Each alternative of the group at "code" is examined in turn (see the loop
condition at the end); the checks below return FALSE as soon as one alternative
fails to qualify. */
4853 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4855 register int op = *scode;
4857 /* Non-capturing brackets */
4861 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4864 /* Capturing brackets */
4866 else if (op == OP_CBRA)
4868 int n = GET2(scode, 1+LINK_SIZE);
/* Record that we are now inside capturing bracket n. Bracket numbers of 32 or
more all share bit 0, which is the less precise approach mentioned for
bracket_map.
NOTE(review): when n is 31, (1 << n) shifts a signed int into its sign bit on
platforms where int is 32 bits wide; an unsigned constant may be intended -
confirm. */
4869 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4870 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4873 /* Other brackets */
4875 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4876 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4878 /* .* means "start at start or after \n" if it isn't in brackets that
4879 may be referenced. */
4881 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
/* The repeated item must be OP_ANY, and none of the enclosing capturing
brackets may be the target of a back reference. */
4883 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4886 /* Check for explicit circumflex */
4888 else if (op != OP_CIRC) return FALSE;
4890 /* Move on to the next alternative */
4892 code += GET(code, 1);
4894 while (*code == OP_ALT); /* Loop for each alternative */
4900 /*************************************************
4901 * Check for asserted fixed first char *
4902 *************************************************/
4904 /* During compilation, the "first char" settings from forward assertions are
4905 discarded, because they can cause conflicts with actual literals that follow.
4906 However, if we end up without a first char setting for an unanchored pattern,
4907 it is worth scanning the regex to see if there is an initial asserted first
4908 char. If all branches start with the same asserted char, or with a bracket all
4909 of whose alternatives start with the same asserted char (recurse ad lib), then
4910 we return that char, otherwise -1.
4913 code points to start of expression (the bracket)
4914 options pointer to the options (used to check casing changes)
4915 inassert TRUE if in an assertion
4917 Returns: -1 or the fixed first char
4921 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
/* c holds the asserted first char agreed on by the alternatives seen so far;
a negative value means none has been found yet. */
4923 register int c = -1;
4926 const uschar *scode =
4927 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4928 register int op = *scode;
/* Recurse into a nested group, telling the callee it is inside an assertion
only when that group is OP_ASSERT. The value it returns must agree with any
char already found for an earlier alternative. */
4940 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4942 if (c < 0) c = d; else if (c != d) return -1;
4945 case OP_EXACT: /* Fall through */
/* Outside an assertion no first char is reported from here. */
4953 if (!inassert) return -1;
/* Flag caseless matching in the value that is being accumulated. */
4957 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
/* A later alternative whose byte at scode[1] differs defeats the search. */
4959 else if (c != scode[1]) return -1;
/* Follow the link stored after the opcode to the next alternative, or to the
end of the group. */
4963 code += GET(code, 1);
4965 while (*code == OP_ALT);
4971 /*************************************************
4972 * Compile a Regular Expression *
4973 *************************************************/
4975 /* This function takes a string and returns a pointer to a block of store
4976 holding a compiled version of the expression. The original API for this
4977 function had no error code return variable; it is retained for backwards
4978 compatibility. The new function is given a new name.
4981 pattern the regular expression
4982 options various option bits
4983 errorcodeptr pointer to error code variable (pcre_compile2() only)
4984 can be NULL if you don't want a code value
4985 errorptr pointer to pointer to error text
4986 erroroffset ptr offset in pattern where error was detected
4987 tables pointer to character tables or NULL
4989 Returns: pointer to compiled data block, or NULL on error,
4990 with errorptr and erroroffset set
4993 PCRE_DATA_SCOPE pcre *
4994 pcre_compile(const char *pattern, int options, const char **errorptr,
4995 int *erroroffset, const unsigned char *tables)
/* Original entry point, kept for backwards compatibility: it forwards every
argument to pcre_compile2() and passes NULL for errorcodeptr, so no numeric
error code is handed back to the caller. */
4997 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5001 PCRE_DATA_SCOPE pcre *
5002 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5003 const char **errorptr, int *erroroffset, const unsigned char *tables)
5006 int length = 1; /* For final END opcode */
5007 int firstbyte, reqbyte, newline;
5014 const uschar *codestart;
5016 compile_data compile_block;
5017 compile_data *cd = &compile_block;
5019 /* This space is used for "compiling" into during the first phase, when we are
5020 computing the amount of memory that is needed. Compiled items are thrown away
5021 as soon as possible, so that a fairly large buffer should be sufficient for
5022 this purpose. The same space is used in the second phase for remembering where
5023 to fill in forward references to subpatterns. */
5025 uschar cworkspace[COMPILE_WORK_SIZE];
5028 /* Set this early so that early errors get offset 0. */
5030 ptr = (const uschar *)pattern;
5032 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5033 can do is just return NULL, but we can set a code value if there is a code
5036 if (errorptr == NULL)
5038 if (errorcodeptr != NULL) *errorcodeptr = 99;
5043 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5045 /* However, we can give a message for this error */
5047 if (erroroffset == NULL)
5050 goto PCRE_EARLY_ERROR_RETURN;
5055 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5058 utf8 = (options & PCRE_UTF8) != 0;
5059 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5060 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5063 goto PCRE_UTF8_ERROR_RETURN;
5066 if ((options & PCRE_UTF8) != 0)
5069 goto PCRE_EARLY_ERROR_RETURN;
5073 if ((options & ~PUBLIC_OPTIONS) != 0)
5076 goto PCRE_EARLY_ERROR_RETURN;
5079 /* Set up pointers to the individual character tables */
5081 if (tables == NULL) tables = _pcre_default_tables;
5082 cd->lcc = tables + lcc_offset;
5083 cd->fcc = tables + fcc_offset;
5084 cd->cbits = tables + cbits_offset;
5085 cd->ctypes = tables + ctypes_offset;
5087 /* Handle different types of newline. The three bits give seven cases. The
5088 current code allows for fixed one- or two-byte sequences, plus "any". */
5090 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5092 case 0: newline = NEWLINE; break; /* Compile-time default */
5093 case PCRE_NEWLINE_CR: newline = '\r'; break;
5094 case PCRE_NEWLINE_LF: newline = '\n'; break;
5095 case PCRE_NEWLINE_CR+
5096 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5097 case PCRE_NEWLINE_ANY: newline = -1; break;
5098 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5103 cd->nltype = NLTYPE_ANY;
5107 cd->nltype = NLTYPE_FIXED;
5111 cd->nl[0] = (newline >> 8) & 255;
5112 cd->nl[1] = newline & 255;
5117 cd->nl[0] = newline;
5121 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5122 references to help in deciding whether (.*) can be treated as anchored or not.
5125 cd->top_backref = 0;
5126 cd->backref_map = 0;
5128 /* Reflect pattern for debugging output */
5130 DPRINTF(("------------------------------------------------------------------\n"));
5131 DPRINTF(("%s\n", pattern));
5133 /* Pretend to compile the pattern while actually just accumulating the length
5134 of memory required. This behaviour is triggered by passing a non-NULL final
5135 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5136 to compile parts of the pattern into; the compiled code is discarded when it is
5137 no longer needed, so hopefully this workspace will never overflow, though there
5138 is a test for its doing so. */
5141 cd->names_found = 0;
5142 cd->name_entry_size = 0;
5143 cd->name_table = NULL;
5144 cd->start_workspace = cworkspace;
5145 cd->start_code = cworkspace;
5146 cd->hwm = cworkspace;
5147 cd->start_pattern = (const uschar *)pattern;
5148 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5149 cd->req_varyopt = 0;
5150 cd->nopartial = FALSE;
5151 cd->external_options = options;
5153 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5154 don't need to look at the result of the function here. The initial options have
5155 been put into the cd block so that they can be changed if an option setting is
5156 found within the regex right at the beginning. Bringing initial option settings
5157 outside can help speed up starting point checks. */
5161 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5162 &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5163 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5165 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5166 cd->hwm - cworkspace));
5168 if (length > MAX_PATTERN_SIZE)
5171 goto PCRE_EARLY_ERROR_RETURN;
5174 /* Compute the size of data block needed and get it, either from malloc or
5175 externally provided function. Integer overflow should no longer be possible
5176 because nowadays we limit the maximum value of cd->names_found and
5177 cd->name_entry_size. */
5179 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5180 re = (real_pcre *)(pcre_malloc)(size);
5185 goto PCRE_EARLY_ERROR_RETURN;
5188 /* Put in the magic number, and save the sizes, initial options, and character
5189 table pointer. NULL is used for the default character tables. The nullpad field
5190 is at the end; it's there to help in the case when a regex compiled on a system
5191 with 4-byte pointers is run on another with 8-byte pointers. */
5193 re->magic_number = MAGIC_NUMBER;
5195 re->options = cd->external_options;
5199 re->name_table_offset = sizeof(real_pcre);
5200 re->name_entry_size = cd->name_entry_size;
5201 re->name_count = cd->names_found;
5203 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5206 /* The starting points of the name/number translation table and of the code are
5207 passed around in the compile data block. The start/end pattern and initial
5208 options are already set from the pre-compile phase, as is the name_entry_size
5209 field. Reset the bracket count and the names_found field. Also reset the hwm
5210 field; this time it's used for remembering forward references to subpatterns. */
5214 cd->names_found = 0;
5215 cd->name_table = (uschar *)re + re->name_table_offset;
5216 codestart = cd->name_table + re->name_entry_size * re->name_count;
5217 cd->start_code = codestart;
5218 cd->hwm = cworkspace;
5219 cd->req_varyopt = 0;
5220 cd->nopartial = FALSE;
5222 /* Set up a starting, non-extracting bracket, then compile the expression. On
5223 error, errorcode will be set non-zero, so we don't need to look at the result
5224 of the function here. */
5226 ptr = (const uschar *)pattern;
5227 code = (uschar *)codestart;
5229 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5230 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5231 re->top_bracket = cd->bracount;
5232 re->top_backref = cd->top_backref;
5234 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5236 /* If not reached end of pattern on success, there's an excess bracket. */
5238 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5240 /* Fill in the terminating state and check for disastrous overflow, but
5241 if debugging, leave the test till after things are printed out. */
5246 if (code - codestart > length) errorcode = ERR23;
5249 /* Fill in any forward references that are required. */
5251 while (errorcode == 0 && cd->hwm > cworkspace)
5254 const uschar *groupptr;
5255 cd->hwm -= LINK_SIZE;
5256 offset = GET(cd->hwm, 0);
5257 recno = GET(codestart, offset);
5258 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5259 if (groupptr == NULL) errorcode = ERR53;
5260 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5263 /* Give an error if there's a back reference to a non-existent capturing subpattern. */
5266 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5268 /* Failed to compile, or error while post-processing */
5273 PCRE_EARLY_ERROR_RETURN:
5274 *erroroffset = ptr - (const uschar *)pattern;
5276 PCRE_UTF8_ERROR_RETURN:
5278 *errorptr = error_texts + error_texts_offsets[errorcode];
5279 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5283 /* If the anchored option was not passed, set the flag if we can determine that
5284 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5285 as starting with .* when DOTALL is set).
5287 Otherwise, if we know what the first byte has to be, save it, because that
5288 speeds up unanchored matches no end. If not, see if we can set the
5289 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5290 start with ^, and also when all branches start with .* for non-DOTALL matches. */
5293 if ((re->options & PCRE_ANCHORED) == 0)
5295 int temp_options = re->options; /* May get changed during these scans */
5296 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5297 re->options |= PCRE_ANCHORED;
5301 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5302 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5304 int ch = firstbyte & 255;
5305 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5306 cd->fcc[ch] == ch)? ch : firstbyte;
5307 re->options |= PCRE_FIRSTSET;
5309 else if (is_startline(codestart, 0, cd->backref_map))
5310 re->options |= PCRE_STARTLINE;
5314 /* For an anchored pattern, we use the "required byte" only if it follows a
5315 variable length item in the regex. Remove the caseless flag for non-caseable bytes. */
5319 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5321 int ch = reqbyte & 255;
5322 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5323 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5324 re->options |= PCRE_REQCHSET;
5327 /* Print out the compiled data if debugging is enabled. This is never the
5328 case when building a production library. */
5332 printf("Length = %d top_bracket = %d top_backref = %d\n",
5333 length, re->top_bracket, re->top_backref);
5335 if (re->options != 0)
5337 printf("%s%s%s%s%s%s%s%s%s\n",
5338 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5339 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5340 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5341 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5342 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5343 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5344 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5345 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5346 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5349 if ((re->options & PCRE_FIRSTSET) != 0)
5351 int ch = re->first_byte & 255;
5352 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5354 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5355 else printf("First char = \\x%02x%s\n", ch, caseless);
5358 if ((re->options & PCRE_REQCHSET) != 0)
5360 int ch = re->req_byte & 255;
5361 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5363 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5364 else printf("Req char = \\x%02x%s\n", ch, caseless);
5367 pcre_printint(re, stdout);
5369 /* This check is done here in the debugging case so that the code that
5370 was compiled can be seen. */
5372 if (code - codestart > length)
5375 *errorptr = error_texts + error_texts_offsets[ERR23];
5376 *erroroffset = ptr - (uschar *)pattern;
5377 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5385 /* End of pcre_compile.c */