1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2006 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl- compatible, but it has advantages in certain
47 #define NLBLOCK md /* Block containing newline information */
48 #define PSSTART start_subject /* Field containing processed string start */
49 #define PSEND end_subject /* Field containing processed string end */
51 #include "pcre_internal.h"
54 /* For use to indent debugging output */
60 /*************************************************
61 * Code parameters and static tables *
62 *************************************************/
64 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
65 into others, under special conditions. A gap of 20 between the blocks should be
68 #define OP_PROP_EXTRA 100
69 #define OP_EXTUNI_EXTRA 120
70 #define OP_ANYNL_EXTRA 140
73 /* This table identifies those opcodes that are followed immediately by a
74 character that is to be tested in some way. This makes is possible to
75 centralize the loading of these characters. In the case of Type * etc, the
76 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
79 static uschar coptable[] = {
81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
82 0, 0, /* Any, Anybyte */
83 0, 0, 0, 0, /* NOTPROP, PROP, EXTUNI, ANYNL */
84 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
88 /* Positive single-char repeats */
89 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
90 3, 3, 3, /* upto, minupto, exact */
91 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
92 /* Negative single-char repeats - only for chars < 256 */
93 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
94 3, 3, 3, /* NOT upto, minupto, exact */
95 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
96 /* Positive type repeats */
97 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
98 3, 3, 3, /* Type upto, minupto, exact */
99 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
100 /* Character class & ref repeats */
101 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
102 0, 0, /* CRRANGE, CRMINRANGE */
105 0, /* XCLASS - variable length */
115 0, /* Assert behind */
116 0, /* Assert behind not */
118 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
119 0, 0, 0, /* SBRA, SCBRA, SCOND */
123 0, 0 /* BRAZERO, BRAMINZERO */
126 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
129 static uschar toptable1[] = {
131 ctype_digit, ctype_digit,
132 ctype_space, ctype_space,
133 ctype_word, ctype_word,
137 static uschar toptable2[] = {
146 /* Structure for holding data about a particular state, which is in effect the
147 current data for an active path through the match tree. It must consist
148 entirely of ints because the working vector we are passed, and which we put
149 these structures in, is a vector of ints. */
151 typedef struct stateblock {
152 int offset; /* Offset to opcode */
153 int count; /* Count for repeats */
154 int ims; /* ims flag bits */
155 int data; /* Some use extra data */
158 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
162 /*************************************************
163 * Print character string *
164 *************************************************/
166 /* Character string printing function for debugging.
170 length number of bytes
177 pchars(unsigned char *p, int length, FILE *f)
182 if (isprint(c = *(p++)))
185 fprintf(f, "\\x%02x", c);
192 /*************************************************
193 * Execute a Regular Expression - DFA engine *
194 *************************************************/
196 /* This internal function applies a compiled pattern to a subject string,
197 starting at a given point, using a DFA engine. This function is called from the
198 external one, possibly multiple times if the pattern is not anchored. The
199 function calls itself recursively for some kinds of subpattern.
202 md the match_data block with fixed information
203 this_start_code the opening bracket of this subexpression's code
204 current_subject where we currently are in the subject string
205 start_offset start offset in the subject string
206 offsets vector to contain the matching string offsets
207 offsetcount size of same
208 workspace vector of workspace
210 ims the current ims flags
211 rlevel function call recursion level
212 recursing regex recursive call level
216 -1 => failed to match
217 < -1 => some kind of unexpected problem
219 The following macros are used for adding states to the two state vectors (one
220 for the current character, one for the following character). */
222 #define ADD_ACTIVE(x,y) \
223 if (active_count++ < wscount) \
225 next_active_state->offset = (x); \
226 next_active_state->count = (y); \
227 next_active_state->ims = ims; \
228 next_active_state++; \
229 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
231 else return PCRE_ERROR_DFA_WSSIZE
233 #define ADD_ACTIVE_DATA(x,y,z) \
234 if (active_count++ < wscount) \
236 next_active_state->offset = (x); \
237 next_active_state->count = (y); \
238 next_active_state->ims = ims; \
239 next_active_state->data = (z); \
240 next_active_state++; \
241 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
243 else return PCRE_ERROR_DFA_WSSIZE
245 #define ADD_NEW(x,y) \
246 if (new_count++ < wscount) \
248 next_new_state->offset = (x); \
249 next_new_state->count = (y); \
250 next_new_state->ims = ims; \
252 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
254 else return PCRE_ERROR_DFA_WSSIZE
256 #define ADD_NEW_DATA(x,y,z) \
257 if (new_count++ < wscount) \
259 next_new_state->offset = (x); \
260 next_new_state->count = (y); \
261 next_new_state->ims = ims; \
262 next_new_state->data = (z); \
264 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
266 else return PCRE_ERROR_DFA_WSSIZE
268 /* And now, here is the code */
273 const uschar *this_start_code,
274 const uschar *current_subject,
284 stateblock *active_states, *new_states, *temp_states;
285 stateblock *next_active_state, *next_new_state;
287 const uschar *ctypes, *lcc, *fcc;
289 const uschar *end_code, *first_op;
291 int active_count, new_count, match_count;
293 /* Some fields in the md block are frequently referenced, so we load them into
294 independent variables in the hope that this will perform better. */
296 const uschar *start_subject = md->start_subject;
297 const uschar *end_subject = md->end_subject;
298 const uschar *start_code = md->start_code;
301 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
310 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
311 (2 * INTS_PER_STATEBLOCK);
313 DPRINTF(("\n%.*s---------------------\n"
314 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
315 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
317 ctypes = md->tables + ctypes_offset;
318 lcc = md->tables + lcc_offset;
319 fcc = md->tables + fcc_offset;
321 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
323 active_states = (stateblock *)(workspace + 2);
324 next_new_state = new_states = active_states + wscount;
327 first_op = this_start_code + 1 + LINK_SIZE +
328 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
330 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
331 the alternative states onto the list, and find out where the end is. This
332 makes is possible to use this function recursively, when we want to stop at a
333 matching internal ket rather than at the end.
335 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
336 a backward assertion. In that case, we have to find out the maximum amount to
337 move back, and set up each alternative appropriately. */
339 if (*first_op == OP_REVERSE)
344 end_code = this_start_code;
347 int back = GET(end_code, 2+LINK_SIZE);
348 if (back > max_back) max_back = back;
349 end_code += GET(end_code, 1);
351 while (*end_code == OP_ALT);
353 /* If we can't go back the amount required for the longest lookbehind
354 pattern, go back as far as we can; some alternatives may still be viable. */
357 /* In character mode we have to step back character by character */
361 for (gone_back = 0; gone_back < max_back; gone_back++)
363 if (current_subject <= start_subject) break;
365 while (current_subject > start_subject &&
366 (*current_subject & 0xc0) == 0x80)
373 /* In byte-mode we can do this quickly. */
376 gone_back = (current_subject - max_back < start_subject)?
377 current_subject - start_subject : max_back;
378 current_subject -= gone_back;
381 /* Now we can process the individual branches. */
383 end_code = this_start_code;
386 int back = GET(end_code, 2+LINK_SIZE);
387 if (back <= gone_back)
389 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
390 ADD_NEW_DATA(-bstate, 0, gone_back - back);
392 end_code += GET(end_code, 1);
394 while (*end_code == OP_ALT);
397 /* This is the code for a "normal" subpattern (not a backward assertion). The
398 start of a whole pattern is always one of these. If we are at the top level,
399 we may be asked to restart matching from the same point that we reached for a
400 previous partial match. We still have to scan through the top-level branches to
401 find the end state. */
405 end_code = this_start_code;
409 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
411 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
412 new_count = workspace[1];
414 memcpy(new_states, active_states, new_count * sizeof(stateblock));
421 int length = 1 + LINK_SIZE +
422 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
425 ADD_NEW(end_code - start_code + length, 0);
426 end_code += GET(end_code, 1);
427 length = 1 + LINK_SIZE;
429 while (*end_code == OP_ALT);
433 workspace[0] = 0; /* Bit indicating which vector is current */
435 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
437 /* Loop for scanning the subject */
439 ptr = current_subject;
446 /* Make the new state list into the active state list and empty the
449 temp_states = active_states;
450 active_states = new_states;
451 new_states = temp_states;
452 active_count = new_count;
455 workspace[0] ^= 1; /* Remember for the restarting feature */
456 workspace[1] = active_count;
459 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
460 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
463 printf("%.*sActive states: ", rlevel*2-2, SP);
464 for (i = 0; i < active_count; i++)
465 printf("%d/%d ", active_states[i].offset, active_states[i].count);
469 /* Set the pointers for adding new states */
471 next_active_state = active_states + active_count;
472 next_new_state = new_states;
474 /* Load the current character from the subject outside the loop, as many
475 different states may want to look at it, and we assume that at least one
478 if (ptr < end_subject)
480 clen = 1; /* Number of bytes in the character */
482 if (utf8) { GETCHARLEN(c, ptr, clen); } else
483 #endif /* SUPPORT_UTF8 */
488 clen = 0; /* This indicates the end of the subject */
489 c = NOTACHAR; /* This value should never actually be used */
492 /* Scan up the active states and act on each one. The result of an action
493 may be to add more states to the currently active list (e.g. on hitting a
494 parenthesis) or it may be to put states on the new list, for considering
495 when we move the character pointer on. */
497 for (i = 0; i < active_count; i++)
499 stateblock *current_state = active_states + i;
501 int state_offset = current_state->offset;
502 int count, codevalue;
503 int chartype, script;
506 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
507 if (clen == 0) printf("EOL\n");
508 else if (c > 32 && c < 127) printf("'%c'\n", c);
509 else printf("0x%02x\n", c);
512 /* This variable is referred to implicity in the ADD_xxx macros. */
514 ims = current_state->ims;
516 /* A negative offset is a special case meaning "hold off going to this
517 (negated) state until the number of characters in the data field have
520 if (state_offset < 0)
522 if (current_state->data > 0)
524 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
525 ADD_NEW_DATA(state_offset, current_state->count,
526 current_state->data - 1);
531 current_state->offset = state_offset = -state_offset;
535 /* Check for a duplicate state with the same count, and skip if found. */
537 for (j = 0; j < i; j++)
539 if (active_states[j].offset == state_offset &&
540 active_states[j].count == current_state->count)
542 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
543 goto NEXT_ACTIVE_STATE;
547 /* The state offset is the offset to the opcode */
549 code = start_code + state_offset;
552 /* If this opcode is followed by an inline character, load it. It is
553 tempting to test for the presence of a subject character here, but that
554 is wrong, because sometimes zero repetitions of the subject are
557 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
558 argument that is not a data character - but is always one byte long.
559 Unfortunately, we have to take special action to deal with \P, \p, and
560 \X in this case. To keep the other cases fast, convert these ones to new
563 if (coptable[codevalue] > 0)
567 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
568 #endif /* SUPPORT_UTF8 */
569 d = code[coptable[codevalue]];
570 if (codevalue >= OP_TYPESTAR)
574 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
576 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
577 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
578 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
585 dlen = 0; /* Not strictly necessary, but compilers moan */
586 d = NOTACHAR; /* if these variables are not set. */
590 /* Now process the individual opcodes */
595 /* ========================================================================== */
596 /* Reached a closing bracket. If not at the end of the pattern, carry
597 on with the next opcode. Otherwise, unless we have an empty string and
598 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
599 matches so we always have the longest first. */
604 if (code != end_code)
606 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
607 if (codevalue != OP_KET)
609 ADD_ACTIVE(state_offset - GET(code, 1), 0);
612 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
614 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
615 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
617 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
618 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
619 if (offsetcount >= 2)
621 offsets[0] = current_subject - start_subject;
622 offsets[1] = ptr - start_subject;
623 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
624 offsets[1] - offsets[0], current_subject));
626 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
628 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
629 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
630 match_count, rlevel*2-2, SP));
636 /* ========================================================================== */
637 /* These opcodes add to the current list of states without looking
638 at the current character. */
640 /*-----------------------------------------------------------------*/
642 do { code += GET(code, 1); } while (*code == OP_ALT);
643 ADD_ACTIVE(code - start_code, 0);
646 /*-----------------------------------------------------------------*/
651 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
652 code += GET(code, 1);
654 while (*code == OP_ALT);
657 /*-----------------------------------------------------------------*/
660 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
661 code += GET(code, 1);
662 while (*code == OP_ALT)
664 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
665 code += GET(code, 1);
669 /*-----------------------------------------------------------------*/
672 ADD_ACTIVE(state_offset + 1, 0);
673 code += 1 + GET(code, 2);
674 while (*code == OP_ALT) code += GET(code, 1);
675 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
678 /*-----------------------------------------------------------------*/
680 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
681 ((ims & PCRE_MULTILINE) != 0 &&
682 ptr != end_subject &&
684 { ADD_ACTIVE(state_offset + 1, 0); }
687 /*-----------------------------------------------------------------*/
689 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
692 /*-----------------------------------------------------------------*/
695 ADD_ACTIVE(state_offset + 2, 0);
698 /*-----------------------------------------------------------------*/
700 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
703 /*-----------------------------------------------------------------*/
705 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
709 /* ========================================================================== */
710 /* These opcodes inspect the next subject character, and sometimes
711 the previous one as well, but do not have an argument. The variable
712 clen contains the length of the current character and is zero if we are
713 at the end of the subject. */
715 /*-----------------------------------------------------------------*/
717 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
718 { ADD_NEW(state_offset + 1, 0); }
721 /*-----------------------------------------------------------------*/
723 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
724 { ADD_ACTIVE(state_offset + 1, 0); }
727 /*-----------------------------------------------------------------*/
729 if ((md->moptions & PCRE_NOTEOL) == 0)
733 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
735 { ADD_ACTIVE(state_offset + 1, 0); }
737 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
738 { ADD_ACTIVE(state_offset + 1, 0); }
741 /*-----------------------------------------------------------------*/
746 if (clen > 0 && c < 256 &&
747 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
748 { ADD_NEW(state_offset + 1, 0); }
751 /*-----------------------------------------------------------------*/
753 case OP_NOT_WHITESPACE:
754 case OP_NOT_WORDCHAR:
755 if (clen > 0 && (c >= 256 ||
756 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
757 { ADD_NEW(state_offset + 1, 0); }
760 /*-----------------------------------------------------------------*/
761 case OP_WORD_BOUNDARY:
762 case OP_NOT_WORD_BOUNDARY:
764 int left_word, right_word;
766 if (ptr > start_subject)
768 const uschar *temp = ptr - 1;
770 if (utf8) BACKCHAR(temp);
772 GETCHARTEST(d, temp);
773 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
777 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
780 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
781 { ADD_ACTIVE(state_offset + 1, 0); }
788 /*-----------------------------------------------------------------*/
789 /* Check the next character by Unicode property. We will get here only
790 if the support is in the binary; otherwise a compile-time error occurs.
798 int category = _pcre_ucp_findprop(c, &chartype, &script);
806 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
810 OK = category == code[2];
814 OK = chartype == code[2];
818 OK = script == code[2];
821 /* Should never occur, but keep compilers from grumbling. */
824 OK = codevalue != OP_PROP;
828 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
835 /* ========================================================================== */
836 /* These opcodes likewise inspect the subject character, but have an
837 argument that is not a data character. It is one of these opcodes:
838 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
839 OP_NOT_WORDCHAR. The value is loaded into d. */
844 count = current_state->count; /* Already matched */
845 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
848 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
851 (ims & PCRE_DOTALL) != 0 ||
854 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
856 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
858 active_count--; /* Remove non-match possibility */
862 ADD_NEW(state_offset, count);
867 /*-----------------------------------------------------------------*/
869 case OP_TYPEMINQUERY:
870 case OP_TYPEPOSQUERY:
871 ADD_ACTIVE(state_offset + 2, 0);
874 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
877 (ims & PCRE_DOTALL) != 0 ||
880 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
882 if (codevalue == OP_TYPEPOSQUERY)
884 active_count--; /* Remove non-match possibility */
887 ADD_NEW(state_offset + 2, 0);
892 /*-----------------------------------------------------------------*/
896 ADD_ACTIVE(state_offset + 2, 0);
899 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
902 (ims & PCRE_DOTALL) != 0 ||
905 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
907 if (codevalue == OP_TYPEPOSSTAR)
909 active_count--; /* Remove non-match possibility */
912 ADD_NEW(state_offset, 0);
917 /*-----------------------------------------------------------------*/
919 count = current_state->count; /* Number already matched */
922 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
925 (ims & PCRE_DOTALL) != 0 ||
928 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
930 if (++count >= GET2(code, 1))
931 { ADD_NEW(state_offset + 4, 0); }
933 { ADD_NEW(state_offset, count); }
938 /*-----------------------------------------------------------------*/
942 ADD_ACTIVE(state_offset + 4, 0);
943 count = current_state->count; /* Number already matched */
946 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
949 (ims & PCRE_DOTALL) != 0 ||
952 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
954 if (codevalue == OP_TYPEPOSUPTO)
956 active_count--; /* Remove non-match possibility */
959 if (++count >= GET2(code, 1))
960 { ADD_NEW(state_offset + 4, 0); }
962 { ADD_NEW(state_offset, count); }
967 /* ========================================================================== */
968 /* These are virtual opcodes that are used when something like
969 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
970 argument. It keeps the code above fast for the other cases. The argument
971 is in the d variable. */
973 case OP_PROP_EXTRA + OP_TYPEPLUS:
974 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
975 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
976 count = current_state->count; /* Already matched */
977 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
981 int category = _pcre_ucp_findprop(c, &chartype, &script);
989 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
993 OK = category == code[3];
997 OK = chartype == code[3];
1001 OK = script == code[3];
1004 /* Should never occur, but keep compilers from grumbling. */
1007 OK = codevalue != OP_PROP;
1011 if (OK == (d == OP_PROP))
1013 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1015 active_count--; /* Remove non-match possibility */
1016 next_active_state--;
1019 ADD_NEW(state_offset, count);
1024 /*-----------------------------------------------------------------*/
1025 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1026 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1027 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1028 count = current_state->count; /* Already matched */
1029 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1030 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1032 const uschar *nptr = ptr + clen;
1034 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1036 active_count--; /* Remove non-match possibility */
1037 next_active_state--;
1039 while (nptr < end_subject)
1043 GETCHARLEN(nd, nptr, ndlen);
1044 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1049 ADD_NEW_DATA(-state_offset, count, ncount);
1053 /*-----------------------------------------------------------------*/
1054 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1055 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1056 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1057 count = current_state->count; /* Already matched */
1058 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1065 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1073 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1075 active_count--; /* Remove non-match possibility */
1076 next_active_state--;
1079 ADD_NEW_DATA(-state_offset, count, ncount);
1087 /*-----------------------------------------------------------------*/
1088 case OP_PROP_EXTRA + OP_TYPEQUERY:
1089 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1090 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1094 case OP_PROP_EXTRA + OP_TYPESTAR:
1095 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1096 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1101 ADD_ACTIVE(state_offset + 4, 0);
1105 int category = _pcre_ucp_findprop(c, &chartype, &script);
1113 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1117 OK = category == code[3];
1121 OK = chartype == code[3];
1125 OK = script == code[3];
1128 /* Should never occur, but keep compilers from grumbling. */
1131 OK = codevalue != OP_PROP;
1135 if (OK == (d == OP_PROP))
1137 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1138 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1140 active_count--; /* Remove non-match possibility */
1141 next_active_state--;
1143 ADD_NEW(state_offset + count, 0);
1148 /*-----------------------------------------------------------------*/
1149 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1150 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1151 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1155 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1156 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1157 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1162 ADD_ACTIVE(state_offset + 2, 0);
1163 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1165 const uschar *nptr = ptr + clen;
1167 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1168 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1170 active_count--; /* Remove non-match possibility */
1171 next_active_state--;
1173 while (nptr < end_subject)
1177 GETCHARLEN(nd, nptr, ndlen);
1178 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1182 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1186 /*-----------------------------------------------------------------*/
1187 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1188 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1189 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1193 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1194 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1195 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1199 ADD_ACTIVE(state_offset + 2, 0);
1206 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1214 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1215 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1217 active_count--; /* Remove non-match possibility */
1218 next_active_state--;
1220 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1228 /*-----------------------------------------------------------------*/
1229 case OP_PROP_EXTRA + OP_TYPEEXACT:
1230 case OP_PROP_EXTRA + OP_TYPEUPTO:
1231 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1232 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1233 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1234 { ADD_ACTIVE(state_offset + 6, 0); }
1235 count = current_state->count; /* Number already matched */
1239 int category = _pcre_ucp_findprop(c, &chartype, &script);
1247 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1251 OK = category == code[5];
1255 OK = chartype == code[5];
1259 OK = script == code[5];
1262 /* Should never occur, but keep compilers from grumbling. */
1265 OK = codevalue != OP_PROP;
1269 if (OK == (d == OP_PROP))
1271 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1273 active_count--; /* Remove non-match possibility */
1274 next_active_state--;
1276 if (++count >= GET2(code, 1))
1277 { ADD_NEW(state_offset + 6, 0); }
1279 { ADD_NEW(state_offset, count); }
1284 /*-----------------------------------------------------------------*/
1285 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1286 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1287 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1288 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1289 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1290 { ADD_ACTIVE(state_offset + 4, 0); }
1291 count = current_state->count; /* Number already matched */
1292 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1294 const uschar *nptr = ptr + clen;
1296 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1298 active_count--; /* Remove non-match possibility */
1299 next_active_state--;
1301 while (nptr < end_subject)
1305 GETCHARLEN(nd, nptr, ndlen);
1306 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1310 if (++count >= GET2(code, 1))
1311 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1313 { ADD_NEW_DATA(-state_offset, count, ncount); }
1317 /*-----------------------------------------------------------------*/
1318 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1319 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1320 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1321 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1322 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1323 { ADD_ACTIVE(state_offset + 4, 0); }
1324 count = current_state->count; /* Number already matched */
1331 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1339 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1341 active_count--; /* Remove non-match possibility */
1342 next_active_state--;
1344 if (++count >= GET2(code, 1))
1345 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1347 { ADD_NEW_DATA(-state_offset, count, ncount); }
1355 /* ========================================================================== */
1356 /* These opcodes are followed by a character that is usually compared
1357 to the current subject character; it is loaded into d. We still get
1358 here even if there is no subject character, because in some cases zero
1359 repetitions are permitted. */
1361 /*-----------------------------------------------------------------*/
1363 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1366 /*-----------------------------------------------------------------*/
1368 if (clen == 0) break;
1373 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1375 unsigned int othercase;
1376 if (c < 128) othercase = fcc[c]; else
1378 /* If we have Unicode property support, we can use it to test the
1379 other case of the character. */
1382 othercase = _pcre_ucp_othercase(c);
1384 othercase = NOTACHAR;
1387 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1391 #endif /* SUPPORT_UTF8 */
1393 /* Non-UTF-8 mode */
1395 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1401 /*-----------------------------------------------------------------*/
1402 /* This is a tricky one because it can match more than one character.
1403 Find out how many characters to skip, and then set up a negative state
1404 to wait for them to pass before continuing. */
1407 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1409 const uschar *nptr = ptr + clen;
1411 while (nptr < end_subject)
1414 GETCHARLEN(c, nptr, nclen);
1415 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1419 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1424 /*-----------------------------------------------------------------*/
1425 /* This is a tricky like EXTUNI because it too can match more than one
1426 character (when CR is followed by LF). In this case, set up a negative
1427 state to wait for one character to pass before continuing. */
1430 if (clen > 0) switch(c)
1438 ADD_NEW(state_offset + 1, 0);
1441 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1443 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1447 ADD_NEW(state_offset + 1, 0);
1453 /*-----------------------------------------------------------------*/
1454 /* Match a negated single character. This is only used for one-byte
1455 characters, that is, we know that d < 256. The character we are
1456 checking (c) can be multibyte. */
1461 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1462 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1466 /*-----------------------------------------------------------------*/
1473 count = current_state->count; /* Already matched */
1474 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1477 unsigned int otherd = NOTACHAR;
1478 if ((ims & PCRE_CASELESS) != 0)
1481 if (utf8 && d >= 128)
1484 otherd = _pcre_ucp_othercase(d);
1485 #endif /* SUPPORT_UCP */
1488 #endif /* SUPPORT_UTF8 */
1491 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1494 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1496 active_count--; /* Remove non-match possibility */
1497 next_active_state--;
1500 ADD_NEW(state_offset, count);
1505 /*-----------------------------------------------------------------*/
1510 case OP_NOTMINQUERY:
1511 case OP_NOTPOSQUERY:
1512 ADD_ACTIVE(state_offset + dlen + 1, 0);
1515 unsigned int otherd = NOTACHAR;
1516 if ((ims & PCRE_CASELESS) != 0)
1519 if (utf8 && d >= 128)
1522 otherd = _pcre_ucp_othercase(d);
1523 #endif /* SUPPORT_UCP */
1526 #endif /* SUPPORT_UTF8 */
1529 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1531 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1533 active_count--; /* Remove non-match possibility */
1534 next_active_state--;
1536 ADD_NEW(state_offset + dlen + 1, 0);
1541 /*-----------------------------------------------------------------*/
1548 ADD_ACTIVE(state_offset + dlen + 1, 0);
1551 unsigned int otherd = NOTACHAR;
1552 if ((ims & PCRE_CASELESS) != 0)
1555 if (utf8 && d >= 128)
1558 otherd = _pcre_ucp_othercase(d);
1559 #endif /* SUPPORT_UCP */
1562 #endif /* SUPPORT_UTF8 */
1565 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1567 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1569 active_count--; /* Remove non-match possibility */
1570 next_active_state--;
1572 ADD_NEW(state_offset, 0);
1577 /*-----------------------------------------------------------------*/
1580 count = current_state->count; /* Number already matched */
1583 unsigned int otherd = NOTACHAR;
1584 if ((ims & PCRE_CASELESS) != 0)
1587 if (utf8 && d >= 128)
1590 otherd = _pcre_ucp_othercase(d);
1591 #endif /* SUPPORT_UCP */
1594 #endif /* SUPPORT_UTF8 */
1597 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1599 if (++count >= GET2(code, 1))
1600 { ADD_NEW(state_offset + dlen + 3, 0); }
1602 { ADD_NEW(state_offset, count); }
1607 /*-----------------------------------------------------------------*/
1614 ADD_ACTIVE(state_offset + dlen + 3, 0);
1615 count = current_state->count; /* Number already matched */
1618 unsigned int otherd = NOTACHAR;
1619 if ((ims & PCRE_CASELESS) != 0)
1622 if (utf8 && d >= 128)
1625 otherd = _pcre_ucp_othercase(d);
1626 #endif /* SUPPORT_UCP */
1629 #endif /* SUPPORT_UTF8 */
1632 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1634 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
1636 active_count--; /* Remove non-match possibility */
1637 next_active_state--;
1639 if (++count >= GET2(code, 1))
1640 { ADD_NEW(state_offset + dlen + 3, 0); }
1642 { ADD_NEW(state_offset, count); }
1648 /* ========================================================================== */
1649 /* These are the class-handling opcodes */
1655 BOOL isinclass = FALSE;
1656 int next_state_offset;
1657 const uschar *ecode;
1659 /* For a simple class, there is always just a 32-byte table, and we
1660 can set isinclass from it. */
1662 if (codevalue != OP_XCLASS)
1667 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1668 ((code[1 + c/8] & (1 << (c&7))) != 0);
1672 /* An extended class may have a table or a list of single characters,
1673 ranges, or both, and it may be positive or negative. There's a
1674 function that sorts all this out. */
1678 ecode = code + GET(code, 1);
1679 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1682 /* At this point, isinclass is set for all kinds of class, and ecode
1683 points to the byte after the end of the class. If there is a
1684 quantifier, this is where it will be. */
1686 next_state_offset = ecode - start_code;
1692 ADD_ACTIVE(next_state_offset + 1, 0);
1693 if (isinclass) { ADD_NEW(state_offset, 0); }
1698 count = current_state->count; /* Already matched */
1699 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1700 if (isinclass) { count++; ADD_NEW(state_offset, count); }
1705 ADD_ACTIVE(next_state_offset + 1, 0);
1706 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1711 count = current_state->count; /* Already matched */
1712 if (count >= GET2(ecode, 1))
1713 { ADD_ACTIVE(next_state_offset + 5, 0); }
1716 int max = GET2(ecode, 3);
1717 if (++count >= max && max != 0) /* Max 0 => no limit */
1718 { ADD_NEW(next_state_offset + 5, 0); }
1720 { ADD_NEW(state_offset, count); }
1725 if (isinclass) { ADD_NEW(next_state_offset, 0); }
1731 /* ========================================================================== */
1732 /* These are the opcodes for fancy brackets of various kinds. We have
1733 to use recursion in order to handle them. */
1738 case OP_ASSERTBACK_NOT:
1741 int local_offsets[2];
1742 int local_workspace[1000];
1743 const uschar *endasscode = code + GET(code, 1);
1745 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1747 rc = internal_dfa_exec(
1748 md, /* static match data */
1749 code, /* this subexpression's code */
1750 ptr, /* where we currently are */
1751 ptr - start_subject, /* start offset */
1752 local_offsets, /* offset vector */
1753 sizeof(local_offsets)/sizeof(int), /* size of same */
1754 local_workspace, /* workspace vector */
1755 sizeof(local_workspace)/sizeof(int), /* size of same */
1756 ims, /* the current ims flags */
1757 rlevel, /* function recursion level */
1758 recursing); /* pass on regex recursion */
1760 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1761 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1765 /*-----------------------------------------------------------------*/
1769 int local_offsets[1000];
1770 int local_workspace[1000];
1771 int condcode = code[LINK_SIZE+1];
1773 /* Back reference conditions are not supported */
1775 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
1777 /* The DEFINE condition is always false */
1779 if (condcode == OP_DEF)
1781 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
1784 /* The only supported version of OP_RREF is for the value RREF_ANY,
1785 which means "test if in any recursion". We can't test for specifically
1788 else if (condcode == OP_RREF)
1790 int value = GET2(code, LINK_SIZE+2);
1791 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
1792 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1793 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1796 /* Otherwise, the condition is an assertion */
1801 const uschar *asscode = code + LINK_SIZE + 1;
1802 const uschar *endasscode = asscode + GET(asscode, 1);
1804 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1806 rc = internal_dfa_exec(
1807 md, /* fixed match data */
1808 asscode, /* this subexpression's code */
1809 ptr, /* where we currently are */
1810 ptr - start_subject, /* start offset */
1811 local_offsets, /* offset vector */
1812 sizeof(local_offsets)/sizeof(int), /* size of same */
1813 local_workspace, /* workspace vector */
1814 sizeof(local_workspace)/sizeof(int), /* size of same */
1815 ims, /* the current ims flags */
1816 rlevel, /* function recursion level */
1817 recursing); /* pass on regex recursion */
1820 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1821 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1823 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1828 /*-----------------------------------------------------------------*/
1831 int local_offsets[1000];
1832 int local_workspace[1000];
1835 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1838 rc = internal_dfa_exec(
1839 md, /* fixed match data */
1840 start_code + GET(code, 1), /* this subexpression's code */
1841 ptr, /* where we currently are */
1842 ptr - start_subject, /* start offset */
1843 local_offsets, /* offset vector */
1844 sizeof(local_offsets)/sizeof(int), /* size of same */
1845 local_workspace, /* workspace vector */
1846 sizeof(local_workspace)/sizeof(int), /* size of same */
1847 ims, /* the current ims flags */
1848 rlevel, /* function recursion level */
1849 recursing + 1); /* regex recurse level */
1851 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1852 recursing + 1, rc));
1854 /* Ran out of internal offsets */
1856 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1858 /* For each successful matched substring, set up the next state with a
1859 count of characters to skip before trying it. Note that the count is in
1860 characters, not bytes. */
1864 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1866 const uschar *p = start_subject + local_offsets[rc];
1867 const uschar *pp = start_subject + local_offsets[rc+1];
1868 int charcount = local_offsets[rc+1] - local_offsets[rc];
1869 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1872 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1876 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1880 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1884 /*-----------------------------------------------------------------*/
1887 int local_offsets[2];
1888 int local_workspace[1000];
1890 int rc = internal_dfa_exec(
1891 md, /* fixed match data */
1892 code, /* this subexpression's code */
1893 ptr, /* where we currently are */
1894 ptr - start_subject, /* start offset */
1895 local_offsets, /* offset vector */
1896 sizeof(local_offsets)/sizeof(int), /* size of same */
1897 local_workspace, /* workspace vector */
1898 sizeof(local_workspace)/sizeof(int), /* size of same */
1899 ims, /* the current ims flags */
1900 rlevel, /* function recursion level */
1901 recursing); /* pass on regex recursion */
1905 const uschar *end_subpattern = code;
1906 int charcount = local_offsets[1] - local_offsets[0];
1907 int next_state_offset, repeat_state_offset;
1909 do { end_subpattern += GET(end_subpattern, 1); }
1910 while (*end_subpattern == OP_ALT);
1911 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1913 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1914 arrange for the repeat state also to be added to the relevant list.
1915 Calculate the offset, or set -1 for no repeat. */
1917 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1918 *end_subpattern == OP_KETRMIN)?
1919 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1921 /* If we have matched an empty string, add the next state at the
1922 current character pointer. This is important so that the duplicate
1923 checking kicks in, which is what breaks infinite loops that match an
1928 ADD_ACTIVE(next_state_offset, 0);
1931 /* Optimization: if there are no more active states, and there
1932 are no new states yet set up, then skip over the subject string
1933 right here, to save looping. Otherwise, set up the new state to swing
1934 into action when the end of the substring is reached. */
1936 else if (i + 1 >= active_count && new_count == 0)
1940 ADD_NEW(next_state_offset, 0);
1942 /* If we are adding a repeat state at the new character position,
1943 we must fudge things so that it is the only current state.
1944 Otherwise, it might be a duplicate of one we processed before, and
1945 that would cause it to be skipped. */
1947 if (repeat_state_offset >= 0)
1949 next_active_state = active_states;
1952 ADD_ACTIVE(repeat_state_offset, 0);
1957 const uschar *p = start_subject + local_offsets[0];
1958 const uschar *pp = start_subject + local_offsets[1];
1959 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1960 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1961 if (repeat_state_offset >= 0)
1962 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1966 else if (rc != PCRE_ERROR_NOMATCH) return rc;
1971 /* ========================================================================== */
1972 /* Handle callouts */
1975 if (pcre_callout != NULL)
1978 pcre_callout_block cb;
1979 cb.version = 1; /* Version 1 of the callout block */
1980 cb.callout_number = code[1];
1981 cb.offset_vector = offsets;
1982 cb.subject = (PCRE_SPTR)start_subject;
1983 cb.subject_length = end_subject - start_subject;
1984 cb.start_match = current_subject - start_subject;
1985 cb.current_position = ptr - start_subject;
1986 cb.pattern_position = GET(code, 2);
1987 cb.next_item_length = GET(code, 2 + LINK_SIZE);
1989 cb.capture_last = -1;
1990 cb.callout_data = md->callout_data;
1991 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1992 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
1997 /* ========================================================================== */
1998 default: /* Unsupported opcode */
1999 return PCRE_ERROR_DFA_UITEM;
2002 NEXT_ACTIVE_STATE: continue;
2004 } /* End of loop scanning active states */
2006 /* We have finished the processing at the current subject character. If no
2007 new states have been set for the next character, we have found all the
2008 matches that we are going to find. If we are at the top level and partial
2009 matching has been requested, check for appropriate conditions. */
2013 if (match_count < 0 && /* No matches found */
2014 rlevel == 1 && /* Top level match function */
2015 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2016 ptr >= end_subject && /* Reached end of subject */
2017 ptr > current_subject) /* Matched non-empty string */
2019 if (offsetcount >= 2)
2021 offsets[0] = current_subject - start_subject;
2022 offsets[1] = end_subject - start_subject;
2024 match_count = PCRE_ERROR_PARTIAL;
2027 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2028 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2030 break; /* In effect, "return", but see the comment below */
2033 /* One or more states are active for the next character. */
2035 ptr += clen; /* Advance to next subject character */
2036 } /* Loop to move along the subject string */
2038 /* Control gets here from "break" a few lines above. We do it this way because
2039 if we use "return" above, we have compiler trouble. Some compilers warn if
2040 there's nothing here because they think the function doesn't return a value. On
2041 the other hand, if we put a dummy statement here, some more clever compilers
2042 complain that it can't be reached. Sigh. */
2050 /*************************************************
2051 * Execute a Regular Expression - DFA engine *
2052 *************************************************/
2054 /* This external function applies a compiled re to a subject string using a DFA
2055 engine. This function calls the internal function multiple times if the pattern
2059 argument_re points to the compiled expression
2060 extra_data points to extra data or is NULL (not currently used)
2061 subject points to the subject string
2062 length length of subject string (may contain binary zeros)
2063 start_offset where to start in the subject string
2065 offsets vector of match offsets
2066 offsetcount size of same
2067 workspace workspace vector
2068 wscount size of same
2070 Returns: > 0 => number of match offset pairs placed in offsets
2071 = 0 => offsets overflowed; longest matches are present
2072 -1 => failed to match
2073 < -1 => some kind of unexpected problem
2077 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2078 const char *subject, int length, int start_offset, int options, int *offsets,
2079 int offsetcount, int *workspace, int wscount)
2081 real_pcre *re = (real_pcre *)argument_re;
2082 dfa_match_data match_block;
2083 dfa_match_data *md = &match_block;
2084 BOOL utf8, anchored, startline, firstline;
2085 const uschar *current_subject, *end_subject, *lcc;
2087 pcre_study_data internal_study;
2088 const pcre_study_data *study = NULL;
2089 real_pcre internal_re;
2091 const uschar *req_byte_ptr;
2092 const uschar *start_bits = NULL;
2093 BOOL first_byte_caseless = FALSE;
2094 BOOL req_byte_caseless = FALSE;
2095 int first_byte = -1;
2100 /* Plausibility checks */
2102 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2103 if (re == NULL || subject == NULL || workspace == NULL ||
2104 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2105 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2106 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2108 /* We need to find the pointer to any study data before we test for byte
2109 flipping, so we scan the extra_data block first. This may set two fields in the
2110 match block, so we must initialize them beforehand. However, the other fields
2111 in the match block must not be set until after the byte flipping. */
2113 md->tables = re->tables;
2114 md->callout_data = NULL;
2116 if (extra_data != NULL)
2118 unsigned int flags = extra_data->flags;
2119 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2120 study = (const pcre_study_data *)extra_data->study_data;
2121 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2122 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2123 return PCRE_ERROR_DFA_UMLIMIT;
2124 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2125 md->callout_data = extra_data->callout_data;
2126 if ((flags & PCRE_EXTRA_TABLES) != 0)
2127 md->tables = extra_data->tables;
2130 /* Check that the first field in the block is the magic number. If it is not,
2131 test for a regex that was compiled on a host of opposite endianness. If this is
2132 the case, flipped values are put in internal_re and internal_study if there was
2135 if (re->magic_number != MAGIC_NUMBER)
2137 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2138 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2139 if (study != NULL) study = &internal_study;
2142 /* Set some local values */
2144 current_subject = (const unsigned char *)subject + start_offset;
2145 end_subject = (const unsigned char *)subject + length;
2146 req_byte_ptr = current_subject - 1;
2149 utf8 = (re->options & PCRE_UTF8) != 0;
2154 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2155 (re->options & PCRE_ANCHORED) != 0;
2157 /* The remaining fixed data for passing around. */
2159 md->start_code = (const uschar *)argument_re +
2160 re->name_table_offset + re->name_count * re->name_entry_size;
2161 md->start_subject = (const unsigned char *)subject;
2162 md->end_subject = end_subject;
2163 md->moptions = options;
2164 md->poptions = re->options;
2166 /* Handle different types of newline. The two bits give four cases. If nothing
2167 is set at run time, whatever was used at compile time applies. */
2169 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
2172 case 0: newline = NEWLINE; break; /* Compile-time default */
2173 case PCRE_NEWLINE_CR: newline = '\r'; break;
2174 case PCRE_NEWLINE_LF: newline = '\n'; break;
2175 case PCRE_NEWLINE_CR+
2176 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2177 case PCRE_NEWLINE_ANY: newline = -1; break;
2178 default: return PCRE_ERROR_BADNEWLINE;
2183 md->nltype = NLTYPE_ANY;
2187 md->nltype = NLTYPE_FIXED;
2191 md->nl[0] = (newline >> 8) & 255;
2192 md->nl[1] = newline & 255;
2197 md->nl[0] = newline;
2201 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2202 back the character offset. */
2205 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2207 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2208 return PCRE_ERROR_BADUTF8;
2209 if (start_offset > 0 && start_offset < length)
2211 int tb = ((uschar *)subject)[start_offset];
2215 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2221 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2222 is a feature that makes it possible to save compiled regex and re-use them
2223 in other programs later. */
2225 if (md->tables == NULL) md->tables = _pcre_default_tables;
2227 /* The lower casing table and the "must be at the start of a line" flag are
2228 used in a loop when finding where to start. */
2230 lcc = md->tables + lcc_offset;
2231 startline = (re->options & PCRE_STARTLINE) != 0;
2232 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2234 /* Set up the first character to match, if available. The first_byte value is
2235 never set for an anchored regular expression, but the anchoring may be forced
2236 at run time, so we have to test for anchoring. The first char may be unset for
2237 an unanchored pattern, of course. If there's no first char and the pattern was
2238 studied, there may be a bitmap of possible first characters. */
2242 if ((re->options & PCRE_FIRSTSET) != 0)
2244 first_byte = re->first_byte & 255;
2245 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2246 first_byte = lcc[first_byte];
2250 if (startline && study != NULL &&
2251 (study->options & PCRE_STUDY_MAPPED) != 0)
2252 start_bits = study->start_bits;
2256 /* For anchored or unanchored matches, there may be a "last known required
2259 if ((re->options & PCRE_REQCHSET) != 0)
2261 req_byte = re->req_byte & 255;
2262 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2263 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2266 /* Call the main matching function, looping for a non-anchored regex after a
2267 failed match. Unless restarting, optimize by moving to the first match
2268 character if possible, when not anchored. Then unless wanting a partial match,
2269 check for a required later character. */
2275 if ((options & PCRE_DFA_RESTART) == 0)
2277 const uschar *save_end_subject = end_subject;
2279 /* Advance to a unique first char if possible. If firstline is TRUE, the
2280 start of the match is constrained to the first line of a multiline string.
2281 Implement this by temporarily adjusting end_subject so that we stop
2282 scanning at a newline. If the match fails at the newline, later code breaks
2287 const uschar *t = current_subject;
2288 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2292 if (first_byte >= 0)
2294 if (first_byte_caseless)
2295 while (current_subject < end_subject &&
2296 lcc[*current_subject] != first_byte)
2299 while (current_subject < end_subject && *current_subject != first_byte)
2303 /* Or to just after a linebreak for a multiline match if possible */
2307 if (current_subject > md->start_subject + start_offset)
2309 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2314 /* Or to a non-unique first char after study */
2316 else if (start_bits != NULL)
2318 while (current_subject < end_subject)
2320 register unsigned int c = *current_subject;
2321 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2326 /* Restore fudged end_subject */
2328 end_subject = save_end_subject;
2331 /* If req_byte is set, we know that that character must appear in the subject
2332 for the match to succeed. If the first character is set, req_byte must be
2333 later in the subject; otherwise the test starts at the match point. This
2334 optimization can save a huge amount of work in patterns with nested unlimited
2335 repeats that aren't going to match. Writing separate code for cased/caseless
2336 versions makes it go faster, as does using an autoincrement and backing off
2339 HOWEVER: when the subject string is very, very long, searching to its end can
2340 take a long time, and give bad performance on quite ordinary patterns. This
2341 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2342 don't do this when the string is sufficiently long.
2344 ALSO: this processing is disabled when partial matching is requested.
2347 if (req_byte >= 0 &&
2348 end_subject - current_subject < REQ_BYTE_MAX &&
2349 (options & PCRE_PARTIAL) == 0)
2351 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2353 /* We don't need to repeat the search if we haven't yet reached the
2354 place we found it at last time. */
2356 if (p > req_byte_ptr)
2358 if (req_byte_caseless)
2360 while (p < end_subject)
2362 register int pp = *p++;
2363 if (pp == req_byte || pp == req_byte2) { p--; break; }
2368 while (p < end_subject)
2370 if (*p++ == req_byte) { p--; break; }
2374 /* If we can't find the required character, break the matching loop,
2375 which will cause a return or PCRE_ERROR_NOMATCH. */
2377 if (p >= end_subject) break;
2379 /* If we have found the required character, save the point where we
2380 found it, so that we don't search again next time round the loop if
2381 the start hasn't passed this character yet. */
2387 /* OK, now we can do the business */
2389 rc = internal_dfa_exec(
2390 md, /* fixed match data */
2391 md->start_code, /* this subexpression's code */
2392 current_subject, /* where we currently are */
2393 start_offset, /* start offset in subject */
2394 offsets, /* offset vector */
2395 offsetcount, /* size of same */
2396 workspace, /* workspace vector */
2397 wscount, /* size of same */
2398 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2399 0, /* function recurse level */
2400 0); /* regex recurse level */
2402 /* Anything other than "no match" means we are done, always; otherwise, carry
2403 on only if not anchored. */
2405 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2407 /* Advance to the next subject character unless we are at the end of a line
2408 and firstline is set. */
2410 if (firstline && IS_NEWLINE(current_subject)) break;
2414 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2417 if (current_subject > end_subject) break;
2419 /* If we have just passed a CR and the newline option is CRLF or ANY, and we
2420 are now at a LF, advance the match position by one more character. */
2422 if (current_subject[-1] == '\r' &&
2423 (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
2424 current_subject < end_subject &&
2425 *current_subject == '\n')
2428 } /* "Bumpalong" loop */
2430 return PCRE_ERROR_NOMATCH;
2433 /* End of pcre_dfa_exec.c */