1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
9 Written by Philip Hazel
10 Copyright (c) 1997-2010 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
80 #define NLBLOCK md /* Block containing newline information */
81 #define PSSTART start_subject /* Field containing processed string start */
82 #define PSEND end_subject /* Field containing processed string end */
84 #include "pcre_internal.h"
87 /* For use to indent debugging output */
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
101 #define OP_PROP_EXTRA 300 /* Added to codevalue when the repeat's argument opcode is OP_PROP */
102 #define OP_EXTUNI_EXTRA 320 /* Added to codevalue when the repeat's argument opcode is OP_EXTUNI */
103 #define OP_ANYNL_EXTRA 340 /* Added to codevalue when the repeat's argument opcode is OP_ANYNL */
104 #define OP_HSPACE_EXTRA 360 /* Added to codevalue when the repeat's argument opcode is OP_HSPACE */
105 #define OP_VSPACE_EXTRA 380 /* Added to codevalue when the repeat's argument opcode is OP_VSPACE */
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
116 static const uschar coptable[] = { /* Indexed by opcode; its size is checked against OP_TABLE_LENGTH in the main switch */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
124 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
128 /* Positive single-char repeats */
129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 3, 3, 3, /* upto, minupto, exact */
131 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
132 /* Negative single-char repeats - only for chars < 256 */
133 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
134 3, 3, 3, /* NOT upto, minupto, exact */
135 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
136 /* Positive type repeats */
137 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
138 3, 3, 3, /* Type upto, minupto, exact */
139 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
140 /* Character class & ref repeats */
141 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
142 0, 0, /* CRRANGE, CRMINRANGE */
145 0, /* XCLASS - variable length */
155 0, /* Assert behind */
156 0, /* Assert behind not */
158 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
159 0, 0, 0, /* SBRA, SCBRA, SCOND */
160 0, 0, /* CREF, NCREF */
161 0, 0, /* RREF, NRREF */
163 0, 0, /* BRAZERO, BRAMINZERO */
164 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
165 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
168 /* This table identifies those opcodes that inspect a character. It is used to
169 remember the fact that a character could have been inspected when the end of
170 the subject is reached. ***NOTE*** If the start of this table is modified, the
171 two tables that follow must also be modified. */
173 static const uschar poptable[] = { /* Indexed by opcode; a non-zero entry sets could_continue when clen == 0 */
175 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
176 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
177 1, 1, 1, /* Any, AllAny, Anybyte */
179 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
181 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
185 /* Positive single-char repeats */
186 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
187 1, 1, 1, /* upto, minupto, exact */
188 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
189 /* Negative single-char repeats - only for chars < 256 */
190 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
191 1, 1, 1, /* NOT upto, minupto, exact */
192 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
193 /* Positive type repeats */
194 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
195 1, 1, 1, /* Type upto, minupto, exact */
196 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
197 /* Character class & ref repeats */
198 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
199 1, 1, /* CRRANGE, CRMINRANGE */
202 1, /* XCLASS - variable length */
212 0, /* Assert behind */
213 0, /* Assert behind not */
215 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
216 0, 0, 0, /* SBRA, SCBRA, SCOND */
217 0, 0, /* CREF, NCREF */
218 0, 0, /* RREF, NRREF */
220 0, 0, /* BRAZERO, BRAMINZERO */
221 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
222 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
/* toptable1 supplies the ctypes[] bit that is ANDed with ctypes[c] for the
opcode in question; the result is then XORed with the matching toptable2 entry
and compared with zero. */
225 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
228 static const uschar toptable1[] = {
230 ctype_digit, ctype_digit,
231 ctype_space, ctype_space,
232 ctype_word, ctype_word,
233 0, 0 /* OP_ANY, OP_ALLANY */
/* toptable2 supplies the value that is XORed with (ctypes[c] & toptable1[op])
before the result is compared with zero. */
236 static const uschar toptable2[] = {
241 1, 1 /* OP_ANY, OP_ALLANY */
245 /* Structure for holding data about a particular state, which is in effect the
246 current data for an active path through the match tree. It must consist
247 entirely of ints because the working vector we are passed, and which we put
248 these structures in, is a vector of ints. */
250 typedef struct stateblock {
251 int offset; /* Offset to opcode from start_code; negative while the state is being held off */
252 int count; /* Count for repeats */
253 int ims; /* ims flag bits */
254 int data; /* Extra data used by some states, e.g. characters still to skip while offset is negative */
257 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) /* ints per stateblock; used to turn the workspace size in ints into a number of states */
261 /*************************************************
262 * Print character string *
263 *************************************************/
/* pchars() reads bytes through p, classifies each one with isprint(), and can
write a byte to f as a two-digit hexadecimal escape. NOTE(review): the escape
is presumably the non-printable branch - confirm against the full body. */
265 /* Character string printing function for debugging.
269 length number of bytes
276 pchars(unsigned char *p, int length, FILE *f)
281 if (isprint(c = *(p++)))
284 fprintf(f, "\\x%02x", c);
291 /*************************************************
292 * Execute a Regular Expression - DFA engine *
293 *************************************************/
295 /* This internal function applies a compiled pattern to a subject string,
296 starting at a given point, using a DFA engine. This function is called from the
297 external one, possibly multiple times if the pattern is not anchored. The
298 function calls itself recursively for some kinds of subpattern.
301 md the match_data block with fixed information
302 this_start_code the opening bracket of this subexpression's code
303 current_subject where we currently are in the subject string
304 start_offset start offset in the subject string
305 offsets vector to contain the matching string offsets
306 offsetcount size of same
307 workspace vector of workspace
309 ims the current ims flags
310 rlevel function call recursion level
311 recursing regex recursive call level
313 Returns: > 0 => number of match offset pairs placed in offsets
314 = 0 => offsets overflowed; longest matches are present
315 -1 => failed to match
316 < -1 => some kind of unexpected problem
318 The following macros are used for adding states to the two state vectors (one
319 for the current character, one for the following character). */
/* ADD_ACTIVE(x,y): tests active_count (post-incrementing it) against wscount.
On success it stores offset x, count y and the in-scope ims value in
*next_active_state and advances that pointer; otherwise it returns
PCRE_ERROR_DFA_WSSIZE from the enclosing function. The final "else return" has
no semicolon of its own, so the one written at the point of use completes it. */
321 #define ADD_ACTIVE(x,y) \
322 if (active_count++ < wscount) \
324 next_active_state->offset = (x); \
325 next_active_state->count = (y); \
326 next_active_state->ims = ims; \
327 next_active_state++; \
328 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
330 else return PCRE_ERROR_DFA_WSSIZE
/* ADD_ACTIVE_DATA(x,y,z): tests active_count (post-incrementing it) against
wscount. On success it stores offset x, count y, the in-scope ims value and
data z in *next_active_state and advances that pointer; otherwise it returns
PCRE_ERROR_DFA_WSSIZE from the enclosing function. The final "else return" has
no semicolon of its own, so the one written at the point of use completes it. */
332 #define ADD_ACTIVE_DATA(x,y,z) \
333 if (active_count++ < wscount) \
335 next_active_state->offset = (x); \
336 next_active_state->count = (y); \
337 next_active_state->ims = ims; \
338 next_active_state->data = (z); \
339 next_active_state++; \
340 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
342 else return PCRE_ERROR_DFA_WSSIZE
/* ADD_NEW(x,y): tests new_count (post-incrementing it) against wscount. On
success it stores offset x, count y and the in-scope ims value in
*next_new_state; otherwise it returns PCRE_ERROR_DFA_WSSIZE from the enclosing
function. The final "else return" has no semicolon of its own, so the one
written at the point of use completes it. */
344 #define ADD_NEW(x,y) \
345 if (new_count++ < wscount) \
347 next_new_state->offset = (x); \
348 next_new_state->count = (y); \
349 next_new_state->ims = ims; \
351 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
353 else return PCRE_ERROR_DFA_WSSIZE
/* ADD_NEW_DATA(x,y,z): tests new_count (post-incrementing it) against wscount.
On success it stores offset x, count y, the in-scope ims value and data z in
*next_new_state; otherwise it returns PCRE_ERROR_DFA_WSSIZE from the enclosing
function. The final "else return" has no semicolon of its own, so the one
written at the point of use completes it. */
355 #define ADD_NEW_DATA(x,y,z) \
356 if (new_count++ < wscount) \
358 next_new_state->offset = (x); \
359 next_new_state->count = (y); \
360 next_new_state->ims = ims; \
361 next_new_state->data = (z); \
363 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
365 else return PCRE_ERROR_DFA_WSSIZE
367 /* And now, here is the code */
372 const uschar *this_start_code,
373 const uschar *current_subject,
383 stateblock *active_states, *new_states, *temp_states;
384 stateblock *next_active_state, *next_new_state;
386 const uschar *ctypes, *lcc, *fcc;
388 const uschar *end_code, *first_op;
390 int active_count, new_count, match_count;
392 /* Some fields in the md block are frequently referenced, so we load them into
393 independent variables in the hope that this will perform better. */
395 const uschar *start_subject = md->start_subject;
396 const uschar *end_subject = md->end_subject;
397 const uschar *start_code = md->start_code;
400 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
409 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
410 (2 * INTS_PER_STATEBLOCK);
412 DPRINTF(("\n%.*s---------------------\n"
413 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
414 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
416 ctypes = md->tables + ctypes_offset;
417 lcc = md->tables + lcc_offset;
418 fcc = md->tables + fcc_offset;
420 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
422 active_states = (stateblock *)(workspace + 2);
423 next_new_state = new_states = active_states + wscount;
426 first_op = this_start_code + 1 + LINK_SIZE +
427 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
429 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
430 the alternative states onto the list, and find out where the end is. This
431 makes it possible to use this function recursively, when we want to stop at a
432 matching internal ket rather than at the end.
434 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
435 a backward assertion. In that case, we have to find out the maximum amount to
436 move back, and set up each alternative appropriately. */
438 if (*first_op == OP_REVERSE)
443 end_code = this_start_code;
446 int back = GET(end_code, 2+LINK_SIZE);
447 if (back > max_back) max_back = back;
448 end_code += GET(end_code, 1);
450 while (*end_code == OP_ALT);
452 /* If we can't go back the amount required for the longest lookbehind
453 pattern, go back as far as we can; some alternatives may still be viable. */
456 /* In character mode we have to step back character by character */
460 for (gone_back = 0; gone_back < max_back; gone_back++)
462 if (current_subject <= start_subject) break;
464 while (current_subject > start_subject &&
465 (*current_subject & 0xc0) == 0x80)
472 /* In byte-mode we can do this quickly. */
475 gone_back = (current_subject - max_back < start_subject)?
476 current_subject - start_subject : max_back;
477 current_subject -= gone_back;
480 /* Save the earliest consulted character */
482 if (current_subject < md->start_used_ptr)
483 md->start_used_ptr = current_subject;
485 /* Now we can process the individual branches. */
487 end_code = this_start_code;
490 int back = GET(end_code, 2+LINK_SIZE);
491 if (back <= gone_back)
493 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
494 ADD_NEW_DATA(-bstate, 0, gone_back - back);
496 end_code += GET(end_code, 1);
498 while (*end_code == OP_ALT);
501 /* This is the code for a "normal" subpattern (not a backward assertion). The
502 start of a whole pattern is always one of these. If we are at the top level,
503 we may be asked to restart matching from the same point that we reached for a
504 previous partial match. We still have to scan through the top-level branches to
505 find the end state. */
509 end_code = this_start_code;
513 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
515 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
516 new_count = workspace[1];
518 memcpy(new_states, active_states, new_count * sizeof(stateblock));
525 int length = 1 + LINK_SIZE +
526 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
529 ADD_NEW(end_code - start_code + length, 0);
530 end_code += GET(end_code, 1);
531 length = 1 + LINK_SIZE;
533 while (*end_code == OP_ALT);
537 workspace[0] = 0; /* Bit indicating which vector is current */
539 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
541 /* Loop for scanning the subject */
543 ptr = current_subject;
550 BOOL could_continue = FALSE;
552 /* Make the new state list into the active state list and empty the
555 temp_states = active_states;
556 active_states = new_states;
557 new_states = temp_states;
558 active_count = new_count;
561 workspace[0] ^= 1; /* Remember for the restarting feature */
562 workspace[1] = active_count;
565 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
566 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
569 printf("%.*sActive states: ", rlevel*2-2, SP);
570 for (i = 0; i < active_count; i++)
571 printf("%d/%d ", active_states[i].offset, active_states[i].count);
575 /* Set the pointers for adding new states */
577 next_active_state = active_states + active_count;
578 next_new_state = new_states;
580 /* Load the current character from the subject outside the loop, as many
581 different states may want to look at it, and we assume that at least one
584 if (ptr < end_subject)
586 clen = 1; /* Number of bytes in the character */
588 if (utf8) { GETCHARLEN(c, ptr, clen); } else
589 #endif /* SUPPORT_UTF8 */
594 clen = 0; /* This indicates the end of the subject */
595 c = NOTACHAR; /* This value should never actually be used */
598 /* Scan up the active states and act on each one. The result of an action
599 may be to add more states to the currently active list (e.g. on hitting a
600 parenthesis) or it may be to put states on the new list, for considering
601 when we move the character pointer on. */
603 for (i = 0; i < active_count; i++)
605 stateblock *current_state = active_states + i;
607 int state_offset = current_state->offset;
608 int count, codevalue, rrc;
611 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
612 if (clen == 0) printf("EOL\n");
613 else if (c > 32 && c < 127) printf("'%c'\n", c);
614 else printf("0x%02x\n", c);
617 /* This variable is referred to implicitly in the ADD_xxx macros. */
619 ims = current_state->ims;
621 /* A negative offset is a special case meaning "hold off going to this
622 (negated) state until the number of characters in the data field have
625 if (state_offset < 0)
627 if (current_state->data > 0)
629 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
630 ADD_NEW_DATA(state_offset, current_state->count,
631 current_state->data - 1);
636 current_state->offset = state_offset = -state_offset;
640 /* Check for a duplicate state with the same count, and skip if found.
641 See the note at the head of this module about the possibility of improving
644 for (j = 0; j < i; j++)
646 if (active_states[j].offset == state_offset &&
647 active_states[j].count == current_state->count)
649 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
650 goto NEXT_ACTIVE_STATE;
654 /* The state offset is the offset to the opcode */
656 code = start_code + state_offset;
659 /* If this opcode inspects a character, but we are at the end of the
660 subject, remember the fact for use when testing for a partial match. */
662 if (clen == 0 && poptable[codevalue] != 0)
663 could_continue = TRUE;
665 /* If this opcode is followed by an inline character, load it. It is
666 tempting to test for the presence of a subject character here, but that
667 is wrong, because sometimes zero repetitions of the subject are
670 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
671 argument that is not a data character - but is always one byte long. We
672 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
673 this case. To keep the other cases fast, convert these ones to new opcodes.
676 if (coptable[codevalue] > 0)
680 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
681 #endif /* SUPPORT_UTF8 */
682 d = code[coptable[codevalue]];
683 if (codevalue >= OP_TYPESTAR)
687 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
689 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
690 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
691 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
693 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
695 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
702 dlen = 0; /* Not strictly necessary, but compilers moan */
703 d = NOTACHAR; /* if these variables are not set. */
707 /* Now process the individual opcodes */
711 /* ========================================================================== */
712 /* These cases are never obeyed. This is a fudge that causes a compile-
713 time error if the vectors coptable or poptable, which are indexed by
714 opcode, are not the correct length. It seems to be the only way to do
715 such a check at compile time, as the sizeof() operator does not work
716 in the C preprocessor. */
718 case OP_TABLE_LENGTH:
719 case OP_TABLE_LENGTH +
720 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
721 (sizeof(poptable) == OP_TABLE_LENGTH)):
724 /* ========================================================================== */
725 /* Reached a closing bracket. If not at the end of the pattern, carry
726 on with the next opcode. Otherwise, unless we have an empty string and
727 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
728 start of the subject, save the match data, shifting up all previous
729 matches so we always have the longest first. */
734 if (code != end_code)
736 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
737 if (codevalue != OP_KET)
739 ADD_ACTIVE(state_offset - GET(code, 1), 0);
744 if (ptr > current_subject ||
745 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
746 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
747 current_subject > start_subject + md->start_offset)))
749 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
750 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
752 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
753 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
754 if (offsetcount >= 2)
756 offsets[0] = current_subject - start_subject;
757 offsets[1] = ptr - start_subject;
758 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
759 offsets[1] - offsets[0], current_subject));
761 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
763 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
764 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
765 match_count, rlevel*2-2, SP));
772 /* ========================================================================== */
773 /* These opcodes add to the current list of states without looking
774 at the current character. */
776 /*-----------------------------------------------------------------*/
778 do { code += GET(code, 1); } while (*code == OP_ALT);
779 ADD_ACTIVE(code - start_code, 0);
782 /*-----------------------------------------------------------------*/
787 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
788 code += GET(code, 1);
790 while (*code == OP_ALT);
793 /*-----------------------------------------------------------------*/
796 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
797 code += GET(code, 1);
798 while (*code == OP_ALT)
800 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
801 code += GET(code, 1);
805 /*-----------------------------------------------------------------*/
808 ADD_ACTIVE(state_offset + 1, 0);
809 code += 1 + GET(code, 2);
810 while (*code == OP_ALT) code += GET(code, 1);
811 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
814 /*-----------------------------------------------------------------*/
816 code += 1 + GET(code, 2);
817 while (*code == OP_ALT) code += GET(code, 1);
818 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
821 /*-----------------------------------------------------------------*/
823 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
824 ((ims & PCRE_MULTILINE) != 0 &&
825 ptr != end_subject &&
827 { ADD_ACTIVE(state_offset + 1, 0); }
830 /*-----------------------------------------------------------------*/
832 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
835 /*-----------------------------------------------------------------*/
838 ADD_ACTIVE(state_offset + 2, 0);
841 /*-----------------------------------------------------------------*/
843 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
846 /*-----------------------------------------------------------------*/
848 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
852 /* ========================================================================== */
853 /* These opcodes inspect the next subject character, and sometimes
854 the previous one as well, but do not have an argument. The variable
855 clen contains the length of the current character and is zero if we are
856 at the end of the subject. */
858 /*-----------------------------------------------------------------*/
860 if (clen > 0 && !IS_NEWLINE(ptr))
861 { ADD_NEW(state_offset + 1, 0); }
864 /*-----------------------------------------------------------------*/
867 { ADD_NEW(state_offset + 1, 0); }
870 /*-----------------------------------------------------------------*/
872 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
873 { ADD_ACTIVE(state_offset + 1, 0); }
876 /*-----------------------------------------------------------------*/
878 if ((md->moptions & PCRE_NOTEOL) == 0)
881 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
882 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
884 { ADD_ACTIVE(state_offset + 1, 0); }
886 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
887 { ADD_ACTIVE(state_offset + 1, 0); }
890 /*-----------------------------------------------------------------*/
895 if (clen > 0 && c < 256 &&
896 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
897 { ADD_NEW(state_offset + 1, 0); }
900 /*-----------------------------------------------------------------*/
902 case OP_NOT_WHITESPACE:
903 case OP_NOT_WORDCHAR:
904 if (clen > 0 && (c >= 256 ||
905 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
906 { ADD_NEW(state_offset + 1, 0); }
909 /*-----------------------------------------------------------------*/
910 case OP_WORD_BOUNDARY:
911 case OP_NOT_WORD_BOUNDARY:
913 int left_word, right_word;
915 if (ptr > start_subject)
917 const uschar *temp = ptr - 1;
918 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
920 if (utf8) BACKCHAR(temp);
922 GETCHARTEST(d, temp);
923 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
928 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
931 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
932 { ADD_ACTIVE(state_offset + 1, 0); }
937 /*-----------------------------------------------------------------*/
938 /* Check the next character by Unicode property. We will get here only
939 if the support is in the binary; otherwise a compile-time error occurs.
948 int chartype = UCD_CHARTYPE(c);
956 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
960 OK = _pcre_ucp_gentype[chartype] == code[2];
964 OK = chartype == code[2];
968 OK = UCD_SCRIPT(c) == code[2];
971 /* Should never occur, but keep compilers from grumbling. */
974 OK = codevalue != OP_PROP;
978 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
985 /* ========================================================================== */
986 /* These opcodes likewise inspect the subject character, but have an
987 argument that is not a data character. It is one of these opcodes:
988 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
989 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
994 count = current_state->count; /* Already matched */
995 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
998 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1000 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1001 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1003 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1005 active_count--; /* Remove non-match possibility */
1006 next_active_state--;
1009 ADD_NEW(state_offset, count);
1014 /*-----------------------------------------------------------------*/
1016 case OP_TYPEMINQUERY:
1017 case OP_TYPEPOSQUERY:
1018 ADD_ACTIVE(state_offset + 2, 0);
1021 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1023 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1024 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1026 if (codevalue == OP_TYPEPOSQUERY)
1028 active_count--; /* Remove non-match possibility */
1029 next_active_state--;
1031 ADD_NEW(state_offset + 2, 0);
1036 /*-----------------------------------------------------------------*/
1038 case OP_TYPEMINSTAR:
1039 case OP_TYPEPOSSTAR:
1040 ADD_ACTIVE(state_offset + 2, 0);
1043 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1045 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1046 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1048 if (codevalue == OP_TYPEPOSSTAR)
1050 active_count--; /* Remove non-match possibility */
1051 next_active_state--;
1053 ADD_NEW(state_offset, 0);
1058 /*-----------------------------------------------------------------*/
1060 count = current_state->count; /* Number already matched */
1063 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1065 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1066 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1068 if (++count >= GET2(code, 1))
1069 { ADD_NEW(state_offset + 4, 0); }
1071 { ADD_NEW(state_offset, count); }
1076 /*-----------------------------------------------------------------*/
1078 case OP_TYPEMINUPTO:
1079 case OP_TYPEPOSUPTO:
1080 ADD_ACTIVE(state_offset + 4, 0);
1081 count = current_state->count; /* Number already matched */
1084 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1086 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1087 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1089 if (codevalue == OP_TYPEPOSUPTO)
1091 active_count--; /* Remove non-match possibility */
1092 next_active_state--;
1094 if (++count >= GET2(code, 1))
1095 { ADD_NEW(state_offset + 4, 0); }
1097 { ADD_NEW(state_offset, count); }
1102 /* ========================================================================== */
1103 /* These are virtual opcodes that are used when something like
1104 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1105 argument. It keeps the code above fast for the other cases. The argument
1106 is in the d variable. */
1109 case OP_PROP_EXTRA + OP_TYPEPLUS:
1110 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1111 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1112 count = current_state->count; /* Already matched */
1113 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1117 int chartype = UCD_CHARTYPE(c);
1125 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1129 OK = _pcre_ucp_gentype[chartype] == code[3];
1133 OK = chartype == code[3];
1137 OK = UCD_SCRIPT(c) == code[3];
1140 /* Should never occur, but keep compilers from grumbling. */
1143 OK = codevalue != OP_PROP;
1147 if (OK == (d == OP_PROP))
1149 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1151 active_count--; /* Remove non-match possibility */
1152 next_active_state--;
1155 ADD_NEW(state_offset, count);
1160 /*-----------------------------------------------------------------*/
1161 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1162 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1163 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1164 count = current_state->count; /* Already matched */
1165 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1166 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1168 const uschar *nptr = ptr + clen;
1170 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1172 active_count--; /* Remove non-match possibility */
1173 next_active_state--;
1175 while (nptr < end_subject)
1179 GETCHARLEN(nd, nptr, ndlen);
1180 if (UCD_CATEGORY(nd) != ucp_M) break;
1185 ADD_NEW_DATA(-state_offset, count, ncount);
1190 /*-----------------------------------------------------------------*/
1191 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1192 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1193 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1194 count = current_state->count; /* Already matched */
1195 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1206 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1210 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1215 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1217 active_count--; /* Remove non-match possibility */
1218 next_active_state--;
1221 ADD_NEW_DATA(-state_offset, count, ncount);
1230 /*-----------------------------------------------------------------*/
1231 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1232 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1233 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1234 count = current_state->count; /* Already matched */
1235 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1256 if (OK == (d == OP_VSPACE))
1258 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1260 active_count--; /* Remove non-match possibility */
1261 next_active_state--;
1264 ADD_NEW_DATA(-state_offset, count, 0);
1269 /*-----------------------------------------------------------------*/
1270 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1271 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1272 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1273 count = current_state->count; /* Already matched */
1274 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1281 case 0x20: /* SPACE */
1282 case 0xa0: /* NBSP */
1283 case 0x1680: /* OGHAM SPACE MARK */
1284 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1285 case 0x2000: /* EN QUAD */
1286 case 0x2001: /* EM QUAD */
1287 case 0x2002: /* EN SPACE */
1288 case 0x2003: /* EM SPACE */
1289 case 0x2004: /* THREE-PER-EM SPACE */
1290 case 0x2005: /* FOUR-PER-EM SPACE */
1291 case 0x2006: /* SIX-PER-EM SPACE */
1292 case 0x2007: /* FIGURE SPACE */
1293 case 0x2008: /* PUNCTUATION SPACE */
1294 case 0x2009: /* THIN SPACE */
1295 case 0x200A: /* HAIR SPACE */
1296 case 0x202f: /* NARROW NO-BREAK SPACE */
1297 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1298 case 0x3000: /* IDEOGRAPHIC SPACE */
1307 if (OK == (d == OP_HSPACE))
1309 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1311 active_count--; /* Remove non-match possibility */
1312 next_active_state--;
1315 ADD_NEW_DATA(-state_offset, count, 0);
1320 /*-----------------------------------------------------------------*/
1322 case OP_PROP_EXTRA + OP_TYPEQUERY:
1323 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1324 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1328 case OP_PROP_EXTRA + OP_TYPESTAR:
1329 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1330 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1335 ADD_ACTIVE(state_offset + 4, 0);
1339 int chartype = UCD_CHARTYPE(c);
1347 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1351 OK = _pcre_ucp_gentype[chartype] == code[3];
1355 OK = chartype == code[3];
1359 OK = UCD_SCRIPT(c) == code[3];
1362 /* Should never occur, but keep compilers from grumbling. */
1365 OK = codevalue != OP_PROP;
1369 if (OK == (d == OP_PROP))
1371 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1372 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1374 active_count--; /* Remove non-match possibility */
1375 next_active_state--;
1377 ADD_NEW(state_offset + count, 0);
1382 /*-----------------------------------------------------------------*/
1383 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1384 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1385 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1389 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1390 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1391 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1396 ADD_ACTIVE(state_offset + 2, 0);
1397 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1399 const uschar *nptr = ptr + clen;
1401 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1402 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1404 active_count--; /* Remove non-match possibility */
1405 next_active_state--;
1407 while (nptr < end_subject)
1411 GETCHARLEN(nd, nptr, ndlen);
1412 if (UCD_CATEGORY(nd) != ucp_M) break;
1416 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1421 /*-----------------------------------------------------------------*/
1422 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1423 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1424 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1428 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1429 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1430 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1434 ADD_ACTIVE(state_offset + 2, 0);
1445 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1449 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1454 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1455 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1457 active_count--; /* Remove non-match possibility */
1458 next_active_state--;
1460 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1469 /*-----------------------------------------------------------------*/
1470 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1471 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1472 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1476 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1477 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1478 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1482 ADD_ACTIVE(state_offset + 2, 0);
1502 if (OK == (d == OP_VSPACE))
1504 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1505 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1507 active_count--; /* Remove non-match possibility */
1508 next_active_state--;
1510 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1515 /*-----------------------------------------------------------------*/
1516 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1517 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1518 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1522 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1523 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1524 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1528 ADD_ACTIVE(state_offset + 2, 0);
1535 case 0x20: /* SPACE */
1536 case 0xa0: /* NBSP */
1537 case 0x1680: /* OGHAM SPACE MARK */
1538 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1539 case 0x2000: /* EN QUAD */
1540 case 0x2001: /* EM QUAD */
1541 case 0x2002: /* EN SPACE */
1542 case 0x2003: /* EM SPACE */
1543 case 0x2004: /* THREE-PER-EM SPACE */
1544 case 0x2005: /* FOUR-PER-EM SPACE */
1545 case 0x2006: /* SIX-PER-EM SPACE */
1546 case 0x2007: /* FIGURE SPACE */
1547 case 0x2008: /* PUNCTUATION SPACE */
1548 case 0x2009: /* THIN SPACE */
1549 case 0x200A: /* HAIR SPACE */
1550 case 0x202f: /* NARROW NO-BREAK SPACE */
1551 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1552 case 0x3000: /* IDEOGRAPHIC SPACE */
1561 if (OK == (d == OP_HSPACE))
1563 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1564 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1566 active_count--; /* Remove non-match possibility */
1567 next_active_state--;
1569 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1574 /*-----------------------------------------------------------------*/
1576 case OP_PROP_EXTRA + OP_TYPEEXACT:
1577 case OP_PROP_EXTRA + OP_TYPEUPTO:
1578 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1579 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1580 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1581 { ADD_ACTIVE(state_offset + 6, 0); }
1582 count = current_state->count; /* Number already matched */
1586 int chartype = UCD_CHARTYPE(c);
1594 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1598 OK = _pcre_ucp_gentype[chartype] == code[5];
1602 OK = chartype == code[5];
1606 OK = UCD_SCRIPT(c) == code[5];
1609 /* Should never occur, but keep compilers from grumbling. */
1612 OK = codevalue != OP_PROP;
1616 if (OK == (d == OP_PROP))
1618 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1620 active_count--; /* Remove non-match possibility */
1621 next_active_state--;
1623 if (++count >= GET2(code, 1))
1624 { ADD_NEW(state_offset + 6, 0); }
1626 { ADD_NEW(state_offset, count); }
1631 /*-----------------------------------------------------------------*/
1632 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1633 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1634 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1635 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1636 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1637 { ADD_ACTIVE(state_offset + 4, 0); }
1638 count = current_state->count; /* Number already matched */
1639 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1641 const uschar *nptr = ptr + clen;
1643 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1645 active_count--; /* Remove non-match possibility */
1646 next_active_state--;
1648 while (nptr < end_subject)
1652 GETCHARLEN(nd, nptr, ndlen);
1653 if (UCD_CATEGORY(nd) != ucp_M) break;
1657 if (++count >= GET2(code, 1))
1658 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1660 { ADD_NEW_DATA(-state_offset, count, ncount); }
1665 /*-----------------------------------------------------------------*/
1666 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1667 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1668 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1669 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1670 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1671 { ADD_ACTIVE(state_offset + 4, 0); }
1672 count = current_state->count; /* Number already matched */
1683 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1687 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1692 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1694 active_count--; /* Remove non-match possibility */
1695 next_active_state--;
1697 if (++count >= GET2(code, 1))
1698 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1700 { ADD_NEW_DATA(-state_offset, count, ncount); }
1709 /*-----------------------------------------------------------------*/
1710 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1711 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1712 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1713 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1714 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1715 { ADD_ACTIVE(state_offset + 4, 0); }
1716 count = current_state->count; /* Number already matched */
1736 if (OK == (d == OP_VSPACE))
1738 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1740 active_count--; /* Remove non-match possibility */
1741 next_active_state--;
1743 if (++count >= GET2(code, 1))
1744 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1746 { ADD_NEW_DATA(-state_offset, count, 0); }
1751 /*-----------------------------------------------------------------*/
1752 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1753 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1754 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1755 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1756 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1757 { ADD_ACTIVE(state_offset + 4, 0); }
1758 count = current_state->count; /* Number already matched */
1765 case 0x20: /* SPACE */
1766 case 0xa0: /* NBSP */
1767 case 0x1680: /* OGHAM SPACE MARK */
1768 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1769 case 0x2000: /* EN QUAD */
1770 case 0x2001: /* EM QUAD */
1771 case 0x2002: /* EN SPACE */
1772 case 0x2003: /* EM SPACE */
1773 case 0x2004: /* THREE-PER-EM SPACE */
1774 case 0x2005: /* FOUR-PER-EM SPACE */
1775 case 0x2006: /* SIX-PER-EM SPACE */
1776 case 0x2007: /* FIGURE SPACE */
1777 case 0x2008: /* PUNCTUATION SPACE */
1778 case 0x2009: /* THIN SPACE */
1779 case 0x200A: /* HAIR SPACE */
1780 case 0x202f: /* NARROW NO-BREAK SPACE */
1781 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1782 case 0x3000: /* IDEOGRAPHIC SPACE */
1791 if (OK == (d == OP_HSPACE))
1793 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1795 active_count--; /* Remove non-match possibility */
1796 next_active_state--;
1798 if (++count >= GET2(code, 1))
1799 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1801 { ADD_NEW_DATA(-state_offset, count, 0); }
1806 /* ========================================================================== */
1807 /* These opcodes are followed by a character that is usually compared
1808 to the current subject character; it is loaded into d. We still get
1809 here even if there is no subject character, because in some cases zero
1810 repetitions are permitted. */
1812 /*-----------------------------------------------------------------*/
1814 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1817 /*-----------------------------------------------------------------*/
1819 if (clen == 0) break;
1824 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1826 unsigned int othercase;
1827 if (c < 128) othercase = fcc[c]; else
1829 /* If we have Unicode property support, we can use it to test the
1830 other case of the character. */
1833 othercase = UCD_OTHERCASE(c);
1835 othercase = NOTACHAR;
1838 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1842 #endif /* SUPPORT_UTF8 */
1844 /* Non-UTF-8 mode */
1846 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1852 /*-----------------------------------------------------------------*/
1853 /* This is a tricky one because it can match more than one character.
1854 Find out how many characters to skip, and then set up a negative state
1855 to wait for them to pass before continuing. */
1858 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1860 const uschar *nptr = ptr + clen;
1862 while (nptr < end_subject)
1865 GETCHARLEN(c, nptr, nclen);
1866 if (UCD_CATEGORY(c) != ucp_M) break;
1870 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1875 /*-----------------------------------------------------------------*/
1876 /* This is tricky like EXTUNI because it too can match more than one
1877 character (when CR is followed by LF). In this case, set up a negative
1878 state to wait for one character to pass before continuing. */
1881 if (clen > 0) switch(c)
1888 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1891 ADD_NEW(state_offset + 1, 0);
1895 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1897 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1901 ADD_NEW(state_offset + 1, 0);
1907 /*-----------------------------------------------------------------*/
1909 if (clen > 0) switch(c)
1921 ADD_NEW(state_offset + 1, 0);
1926 /*-----------------------------------------------------------------*/
1928 if (clen > 0) switch(c)
1937 ADD_NEW(state_offset + 1, 0);
1944 /*-----------------------------------------------------------------*/
1946 if (clen > 0) switch(c)
1949 case 0x20: /* SPACE */
1950 case 0xa0: /* NBSP */
1951 case 0x1680: /* OGHAM SPACE MARK */
1952 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1953 case 0x2000: /* EN QUAD */
1954 case 0x2001: /* EM QUAD */
1955 case 0x2002: /* EN SPACE */
1956 case 0x2003: /* EM SPACE */
1957 case 0x2004: /* THREE-PER-EM SPACE */
1958 case 0x2005: /* FOUR-PER-EM SPACE */
1959 case 0x2006: /* SIX-PER-EM SPACE */
1960 case 0x2007: /* FIGURE SPACE */
1961 case 0x2008: /* PUNCTUATION SPACE */
1962 case 0x2009: /* THIN SPACE */
1963 case 0x200A: /* HAIR SPACE */
1964 case 0x202f: /* NARROW NO-BREAK SPACE */
1965 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1966 case 0x3000: /* IDEOGRAPHIC SPACE */
1970 ADD_NEW(state_offset + 1, 0);
1975 /*-----------------------------------------------------------------*/
1977 if (clen > 0) switch(c)
1980 case 0x20: /* SPACE */
1981 case 0xa0: /* NBSP */
1982 case 0x1680: /* OGHAM SPACE MARK */
1983 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1984 case 0x2000: /* EN QUAD */
1985 case 0x2001: /* EM QUAD */
1986 case 0x2002: /* EN SPACE */
1987 case 0x2003: /* EM SPACE */
1988 case 0x2004: /* THREE-PER-EM SPACE */
1989 case 0x2005: /* FOUR-PER-EM SPACE */
1990 case 0x2006: /* SIX-PER-EM SPACE */
1991 case 0x2007: /* FIGURE SPACE */
1992 case 0x2008: /* PUNCTUATION SPACE */
1993 case 0x2009: /* THIN SPACE */
1994 case 0x200A: /* HAIR SPACE */
1995 case 0x202f: /* NARROW NO-BREAK SPACE */
1996 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1997 case 0x3000: /* IDEOGRAPHIC SPACE */
1998 ADD_NEW(state_offset + 1, 0);
2003 /*-----------------------------------------------------------------*/
2004 /* Match a negated single character. This is only used for one-byte
2005 characters, that is, we know that d < 256. The character we are
2006 checking (c) can be multibyte. */
2011 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2012 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2016 /*-----------------------------------------------------------------*/
2023 count = current_state->count; /* Already matched */
2024 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2027 unsigned int otherd = NOTACHAR;
2028 if ((ims & PCRE_CASELESS) != 0)
2031 if (utf8 && d >= 128)
2034 otherd = UCD_OTHERCASE(d);
2035 #endif /* SUPPORT_UCP */
2038 #endif /* SUPPORT_UTF8 */
2041 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2044 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2046 active_count--; /* Remove non-match possibility */
2047 next_active_state--;
2050 ADD_NEW(state_offset, count);
2055 /*-----------------------------------------------------------------*/
2060 case OP_NOTMINQUERY:
2061 case OP_NOTPOSQUERY:
2062 ADD_ACTIVE(state_offset + dlen + 1, 0);
2065 unsigned int otherd = NOTACHAR;
2066 if ((ims & PCRE_CASELESS) != 0)
2069 if (utf8 && d >= 128)
2072 otherd = UCD_OTHERCASE(d);
2073 #endif /* SUPPORT_UCP */
2076 #endif /* SUPPORT_UTF8 */
2079 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2081 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2083 active_count--; /* Remove non-match possibility */
2084 next_active_state--;
2086 ADD_NEW(state_offset + dlen + 1, 0);
2091 /*-----------------------------------------------------------------*/
2098 ADD_ACTIVE(state_offset + dlen + 1, 0);
2101 unsigned int otherd = NOTACHAR;
2102 if ((ims & PCRE_CASELESS) != 0)
2105 if (utf8 && d >= 128)
2108 otherd = UCD_OTHERCASE(d);
2109 #endif /* SUPPORT_UCP */
2112 #endif /* SUPPORT_UTF8 */
2115 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2117 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2119 active_count--; /* Remove non-match possibility */
2120 next_active_state--;
2122 ADD_NEW(state_offset, 0);
2127 /*-----------------------------------------------------------------*/
2130 count = current_state->count; /* Number already matched */
2133 unsigned int otherd = NOTACHAR;
2134 if ((ims & PCRE_CASELESS) != 0)
2137 if (utf8 && d >= 128)
2140 otherd = UCD_OTHERCASE(d);
2141 #endif /* SUPPORT_UCP */
2144 #endif /* SUPPORT_UTF8 */
2147 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2149 if (++count >= GET2(code, 1))
2150 { ADD_NEW(state_offset + dlen + 3, 0); }
2152 { ADD_NEW(state_offset, count); }
2157 /*-----------------------------------------------------------------*/
2164 ADD_ACTIVE(state_offset + dlen + 3, 0);
2165 count = current_state->count; /* Number already matched */
2168 unsigned int otherd = NOTACHAR;
2169 if ((ims & PCRE_CASELESS) != 0)
2172 if (utf8 && d >= 128)
2175 otherd = UCD_OTHERCASE(d);
2176 #endif /* SUPPORT_UCP */
2179 #endif /* SUPPORT_UTF8 */
2182 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2184 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2186 active_count--; /* Remove non-match possibility */
2187 next_active_state--;
2189 if (++count >= GET2(code, 1))
2190 { ADD_NEW(state_offset + dlen + 3, 0); }
2192 { ADD_NEW(state_offset, count); }
2198 /* ========================================================================== */
2199 /* These are the class-handling opcodes */
2205 BOOL isinclass = FALSE;
2206 int next_state_offset;
2207 const uschar *ecode;
2209 /* For a simple class, there is always just a 32-byte table, and we
2210 can set isinclass from it. */
2212 if (codevalue != OP_XCLASS)
2217 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2218 ((code[1 + c/8] & (1 << (c&7))) != 0);
2222 /* An extended class may have a table or a list of single characters,
2223 ranges, or both, and it may be positive or negative. There's a
2224 function that sorts all this out. */
2228 ecode = code + GET(code, 1);
2229 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2232 /* At this point, isinclass is set for all kinds of class, and ecode
2233 points to the byte after the end of the class. If there is a
2234 quantifier, this is where it will be. */
2236 next_state_offset = ecode - start_code;
2242 ADD_ACTIVE(next_state_offset + 1, 0);
2243 if (isinclass) { ADD_NEW(state_offset, 0); }
2248 count = current_state->count; /* Already matched */
2249 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2250 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2255 ADD_ACTIVE(next_state_offset + 1, 0);
2256 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2261 count = current_state->count; /* Already matched */
2262 if (count >= GET2(ecode, 1))
2263 { ADD_ACTIVE(next_state_offset + 5, 0); }
2266 int max = GET2(ecode, 3);
2267 if (++count >= max && max != 0) /* Max 0 => no limit */
2268 { ADD_NEW(next_state_offset + 5, 0); }
2270 { ADD_NEW(state_offset, count); }
2275 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2281 /* ========================================================================== */
2282 /* These are the opcodes for fancy brackets of various kinds. We have
2283 to use recursion in order to handle them. The "always failing" assertion
2284 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2285 though the other "backtracking verbs" are not supported. */
2288 forced_fail++; /* Count FAILs for multiple states */
2294 case OP_ASSERTBACK_NOT:
2297 int local_offsets[2];
2298 int local_workspace[1000];
2299 const uschar *endasscode = code + GET(code, 1);
2301 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2303 rc = internal_dfa_exec(
2304 md, /* static match data */
2305 code, /* this subexpression's code */
2306 ptr, /* where we currently are */
2307 ptr - start_subject, /* start offset */
2308 local_offsets, /* offset vector */
2309 sizeof(local_offsets)/sizeof(int), /* size of same */
2310 local_workspace, /* workspace vector */
2311 sizeof(local_workspace)/sizeof(int), /* size of same */
2312 ims, /* the current ims flags */
2313 rlevel, /* function recursion level */
2314 recursing); /* pass on regex recursion */
2316 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2317 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2318 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2322 /*-----------------------------------------------------------------*/
2326 int local_offsets[1000];
2327 int local_workspace[1000];
2328 int codelink = GET(code, 1);
2331 /* Because of the way auto-callout works during compile, a callout item
2332 is inserted between OP_COND and an assertion condition. This does not
2333 happen for the other conditions. */
2335 if (code[LINK_SIZE+1] == OP_CALLOUT)
2338 if (pcre_callout != NULL)
2340 pcre_callout_block cb;
2341 cb.version = 1; /* Version 1 of the callout block */
2342 cb.callout_number = code[LINK_SIZE+2];
2343 cb.offset_vector = offsets;
2344 cb.subject = (PCRE_SPTR)start_subject;
2345 cb.subject_length = end_subject - start_subject;
2346 cb.start_match = current_subject - start_subject;
2347 cb.current_position = ptr - start_subject;
2348 cb.pattern_position = GET(code, LINK_SIZE + 3);
2349 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2351 cb.capture_last = -1;
2352 cb.callout_data = md->callout_data;
2353 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2355 if (rrc > 0) break; /* Fail this thread */
2356 code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2359 condcode = code[LINK_SIZE+1];
2361 /* Back reference conditions are not supported */
2363 if (condcode == OP_CREF || condcode == OP_NCREF)
2364 return PCRE_ERROR_DFA_UCOND;
2366 /* The DEFINE condition is always false */
2368 if (condcode == OP_DEF)
2369 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2371 /* The only supported version of OP_RREF is for the value RREF_ANY,
2372 which means "test if in any recursion". We can't test for specifically
2375 else if (condcode == OP_RREF || condcode == OP_NRREF)
2377 int value = GET2(code, LINK_SIZE+2);
2378 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2380 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2381 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2384 /* Otherwise, the condition is an assertion */
2389 const uschar *asscode = code + LINK_SIZE + 1;
2390 const uschar *endasscode = asscode + GET(asscode, 1);
2392 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2394 rc = internal_dfa_exec(
2395 md, /* fixed match data */
2396 asscode, /* this subexpression's code */
2397 ptr, /* where we currently are */
2398 ptr - start_subject, /* start offset */
2399 local_offsets, /* offset vector */
2400 sizeof(local_offsets)/sizeof(int), /* size of same */
2401 local_workspace, /* workspace vector */
2402 sizeof(local_workspace)/sizeof(int), /* size of same */
2403 ims, /* the current ims flags */
2404 rlevel, /* function recursion level */
2405 recursing); /* pass on regex recursion */
2407 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2409 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2410 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2412 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2417 /*-----------------------------------------------------------------*/
2420 int local_offsets[1000];
2421 int local_workspace[1000];
2424 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2427 rc = internal_dfa_exec(
2428 md, /* fixed match data */
2429 start_code + GET(code, 1), /* this subexpression's code */
2430 ptr, /* where we currently are */
2431 ptr - start_subject, /* start offset */
2432 local_offsets, /* offset vector */
2433 sizeof(local_offsets)/sizeof(int), /* size of same */
2434 local_workspace, /* workspace vector */
2435 sizeof(local_workspace)/sizeof(int), /* size of same */
2436 ims, /* the current ims flags */
2437 rlevel, /* function recursion level */
2438 recursing + 1); /* regex recurse level */
2440 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2441 recursing + 1, rc));
2443 /* Ran out of internal offsets */
2445 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2447 /* For each successful matched substring, set up the next state with a
2448 count of characters to skip before trying it. Note that the count is in
2449 characters, not bytes. */
2453 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2455 const uschar *p = start_subject + local_offsets[rc];
2456 const uschar *pp = start_subject + local_offsets[rc+1];
2457 int charcount = local_offsets[rc+1] - local_offsets[rc];
2458 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2461 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2465 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2469 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2473 /*-----------------------------------------------------------------*/
2476 int local_offsets[2];
2477 int local_workspace[1000];
2479 int rc = internal_dfa_exec(
2480 md, /* fixed match data */
2481 code, /* this subexpression's code */
2482 ptr, /* where we currently are */
2483 ptr - start_subject, /* start offset */
2484 local_offsets, /* offset vector */
2485 sizeof(local_offsets)/sizeof(int), /* size of same */
2486 local_workspace, /* workspace vector */
2487 sizeof(local_workspace)/sizeof(int), /* size of same */
2488 ims, /* the current ims flags */
2489 rlevel, /* function recursion level */
2490 recursing); /* pass on regex recursion */
2494 const uschar *end_subpattern = code;
2495 int charcount = local_offsets[1] - local_offsets[0];
2496 int next_state_offset, repeat_state_offset;
2498 do { end_subpattern += GET(end_subpattern, 1); }
2499 while (*end_subpattern == OP_ALT);
2500 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2502 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2503 arrange for the repeat state also to be added to the relevant list.
2504 Calculate the offset, or set -1 for no repeat. */
2506 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2507 *end_subpattern == OP_KETRMIN)?
2508 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2510 /* If we have matched an empty string, add the next state at the
2511 current character pointer. This is important so that the duplicate
2512 checking kicks in, which is what breaks infinite loops that match an
2517 ADD_ACTIVE(next_state_offset, 0);
2520 /* Optimization: if there are no more active states, and there
2521 are no new states yet set up, then skip over the subject string
2522 right here, to save looping. Otherwise, set up the new state to swing
2523 into action when the end of the substring is reached. */
2525 else if (i + 1 >= active_count && new_count == 0)
2529 ADD_NEW(next_state_offset, 0);
2531 /* If we are adding a repeat state at the new character position,
2532 we must fudge things so that it is the only current state.
2533 Otherwise, it might be a duplicate of one we processed before, and
2534 that would cause it to be skipped. */
2536 if (repeat_state_offset >= 0)
2538 next_active_state = active_states;
2541 ADD_ACTIVE(repeat_state_offset, 0);
2546 const uschar *p = start_subject + local_offsets[0];
2547 const uschar *pp = start_subject + local_offsets[1];
2548 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2549 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2550 if (repeat_state_offset >= 0)
2551 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2555 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2560 /* ========================================================================== */
2561 /* Handle callouts */
2565 if (pcre_callout != NULL)
2567 pcre_callout_block cb;
2568 cb.version = 1; /* Version 1 of the callout block */
2569 cb.callout_number = code[1];
2570 cb.offset_vector = offsets;
2571 cb.subject = (PCRE_SPTR)start_subject;
2572 cb.subject_length = end_subject - start_subject;
2573 cb.start_match = current_subject - start_subject;
2574 cb.current_position = ptr - start_subject;
2575 cb.pattern_position = GET(code, 2);
2576 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2578 cb.capture_last = -1;
2579 cb.callout_data = md->callout_data;
2580 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2583 { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2587 /* ========================================================================== */
2588 default: /* Unsupported opcode */
2589 return PCRE_ERROR_DFA_UITEM;
2592 NEXT_ACTIVE_STATE: continue;
2594 } /* End of loop scanning active states */
2596 /* We have finished the processing at the current subject character. If no
2597 new states have been set for the next character, we have found all the
2598 matches that we are going to find. If we are at the top level and partial
2599 matching has been requested, check for appropriate conditions.
2601 The "forced_fail" variable counts the number of (*F) encountered for the
2602 character. If it is equal to the original active_count (saved in
2603 workspace[1]) it means that (*F) was found on every active state. In this
2604 case we don't want to give a partial match.
2606 The "could_continue" variable is true if a state could have continued but
2607 for the fact that the end of the subject was reached. */
2611 if (rlevel == 1 && /* Top level, and */
2612 could_continue && /* Some could go on */
2613 forced_fail != workspace[1] && /* Not all forced fail & */
2615 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2617 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2618 match_count < 0) /* no matches */
2620 ptr >= end_subject && /* Reached end of subject */
2621 ptr > current_subject) /* Matched non-empty string */
2623 if (offsetcount >= 2)
2625 offsets[0] = md->start_used_ptr - start_subject;
2626 offsets[1] = end_subject - start_subject;
2628 match_count = PCRE_ERROR_PARTIAL;
2631 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2632 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2634 break; /* In effect, "return", but see the comment below */
2637 /* One or more states are active for the next character. */
2639 ptr += clen; /* Advance to next subject character */
2640 } /* Loop to move along the subject string */
2642 /* Control gets here from "break" a few lines above. We do it this way because
2643 if we use "return" above, we have compiler trouble. Some compilers warn if
2644 there's nothing here because they think the function doesn't return a value. On
2645 the other hand, if we put a dummy statement here, some more clever compilers
2646 complain that it can't be reached. Sigh. */
2654 /*************************************************
2655 * Execute a Regular Expression - DFA engine *
2656 *************************************************/
/* Summary of pcre_dfa_exec() as visible in this listing. NOTE(review): the
embedded line numbers have gaps, so some source lines are not shown here; this
summary and the comments added inside the function describe only what the
visible lines do.

The function validates its arguments, reads the optional extra_data block,
sets up the match data (newline convention, character tables, first-byte and
required-byte information) and then calls internal_dfa_exec(). Unless the
match is anchored, a PCRE_ERROR_NOMATCH result causes the start position to be
advanced and internal_dfa_exec() to be called again; any other result is
returned unchanged.

The options argument is a set of option bits; a bit outside
PUBLIC_DFA_EXEC_OPTIONS gives PCRE_ERROR_BADOPTION. Other error returns
visible here are PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT (offsetcount < 0),
PCRE_ERROR_DFA_WSSIZE (wscount < 20), PCRE_ERROR_DFA_UMLIMIT (a match limit
flag is set in extra_data), PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADNEWLINE,
PCRE_ERROR_BADUTF8 and PCRE_ERROR_BADUTF8_OFFSET. */
2658 /* This external function applies a compiled re to a subject string using a DFA
2659 engine. This function calls the internal function multiple times if the pattern
2663 argument_re points to the compiled expression
2664 extra_data points to extra data or is NULL
2665 subject points to the subject string
2666 length length of subject string (may contain binary zeros)
2667 start_offset where to start in the subject string
2669 offsets vector of match offsets
2670 offsetcount size of same
2671 workspace workspace vector
2672 wscount size of same
2674 Returns: > 0 => number of match offset pairs placed in offsets
2675 = 0 => offsets overflowed; longest matches are present
2676 -1 => failed to match
2677 < -1 => some kind of unexpected problem
2680 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2681 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2682 const char *subject, int length, int start_offset, int options, int *offsets,
2683 int offsetcount, int *workspace, int wscount)
2685 real_pcre *re = (real_pcre *)argument_re;
2686 dfa_match_data match_block;
2687 dfa_match_data *md = &match_block;
2688 BOOL utf8, anchored, startline, firstline;
2689 const uschar *current_subject, *end_subject, *lcc;
2691 pcre_study_data internal_study;
2692 const pcre_study_data *study = NULL;
2693 real_pcre internal_re;
2695 const uschar *req_byte_ptr;
2696 const uschar *start_bits = NULL;
2697 BOOL first_byte_caseless = FALSE;
2698 BOOL req_byte_caseless = FALSE;
2699 int first_byte = -1;
2704 /* Plausibility checks */
2706 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2707 if (re == NULL || subject == NULL || workspace == NULL ||
2708 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2709 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2710 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2712 /* We need to find the pointer to any study data before we test for byte
2713 flipping, so we scan the extra_data block first. This may set two fields in the
2714 match block, so we must initialize them beforehand. However, the other fields
2715 in the match block must not be set until after the byte flipping. */
2717 md->tables = re->tables;
2718 md->callout_data = NULL;
2720 if (extra_data != NULL)
2722 unsigned int flags = extra_data->flags;
2723 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2724 study = (const pcre_study_data *)extra_data->study_data;
/* Match limits are not supported here: either limit flag in extra_data makes
the call fail with PCRE_ERROR_DFA_UMLIMIT. */
2725 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2726 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2727 return PCRE_ERROR_DFA_UMLIMIT;
2728 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2729 md->callout_data = extra_data->callout_data;
2730 if ((flags & PCRE_EXTRA_TABLES) != 0)
2731 md->tables = extra_data->tables;
2734 /* Check that the first field in the block is the magic number. If it is not,
2735 test for a regex that was compiled on a host of opposite endianness. If this is
2736 the case, flipped values are put in internal_re and internal_study if there was
2739 if (re->magic_number != MAGIC_NUMBER)
2741 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2742 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2743 if (study != NULL) study = &internal_study;
2746 /* Set some local values */
2748 current_subject = (const unsigned char *)subject + start_offset;
2749 end_subject = (const unsigned char *)subject + length;
/* req_byte_ptr starts one byte before the first start position so that the
p > req_byte_ptr test in the required-byte search further down succeeds the
first time it is reached. */
2750 req_byte_ptr = current_subject - 1;
2753 utf8 = (re->options & PCRE_UTF8) != 0;
/* Anchoring may be requested at run time (PCRE_ANCHORED or PCRE_DFA_RESTART
in options) or may come from the options of the compiled pattern. */
2758 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2759 (re->options & PCRE_ANCHORED) != 0;
2761 /* The remaining fixed data for passing around. */
/* start_code is found by skipping the name table: name_count entries of
name_entry_size bytes each, starting name_table_offset bytes into the block
that the caller passed as argument_re. */
2763 md->start_code = (const uschar *)argument_re +
2764 re->name_table_offset + re->name_count * re->name_entry_size;
2765 md->start_subject = (const unsigned char *)subject;
2766 md->end_subject = end_subject;
2767 md->start_offset = start_offset;
2768 md->moptions = options;
2769 md->poptions = re->options;
2771 /* If the BSR option is not set at match time, copy what was set
2774 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2776 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2777 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2779 else md->moptions |= PCRE_BSR_ANYCRLF;
2783 /* Handle different types of newline. The three bits give eight cases. If
2784 nothing is set at run time, whatever was used at compile time applies. */
2786 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2789 case 0: newline = NEWLINE; break; /* Compile-time default */
2790 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2791 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2792 case PCRE_NEWLINE_CR+
2793 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
/* The two any-newline modes are given negative codes; these are turned into
NLTYPE_ANYCRLF and NLTYPE_ANY just after the switch. */
2794 case PCRE_NEWLINE_ANY: newline = -1; break;
2795 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2796 default: return PCRE_ERROR_BADNEWLINE;
2801 md->nltype = NLTYPE_ANYCRLF;
2803 else if (newline < 0)
2805 md->nltype = NLTYPE_ANY;
2809 md->nltype = NLTYPE_FIXED;
/* A two-byte newline was packed into one int by the switch above (first byte
in bits 8-15, second byte in bits 0-7); split it into md->nl[]. */
2813 md->nl[0] = (newline >> 8) & 255;
2814 md->nl[1] = newline & 255;
2819 md->nl[0] = newline;
2823 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2824 back the character offset. */
2827 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
/* A result >= 0 from _pcre_valid_utf8() is treated as meaning that the subject
is not valid UTF-8. */
2829 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2830 return PCRE_ERROR_BADUTF8;
2831 if (start_offset > 0 && start_offset < length)
2833 int tb = ((uschar *)subject)[start_offset];
/* NOTE(review): source lines 2834-2836 are absent from this listing. The test
on tb below compares against 0 and 0xc0 only, which suggests tb is reduced to
its top two bits in the missing lines - confirm against the full source. */
2837 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2843 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2844 is a feature that makes it possible to save compiled regex and re-use them
2845 in other programs later. */
2847 if (md->tables == NULL) md->tables = _pcre_default_tables;
2849 /* The lower casing table and the "must be at the start of a line" flag are
2850 used in a loop when finding where to start. */
2852 lcc = md->tables + lcc_offset;
2853 startline = (re->flags & PCRE_STARTLINE) != 0;
2854 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2856 /* Set up the first character to match, if available. The first_byte value is
2857 never set for an anchored regular expression, but the anchoring may be forced
2858 at run time, so we have to test for anchoring. The first char may be unset for
2859 an unanchored pattern, of course. If there's no first char and the pattern was
2860 studied, there may be a bitmap of possible first characters. */
2864 if ((re->flags & PCRE_FIRSTSET) != 0)
2866 first_byte = re->first_byte & 255;
/* For a caseless first byte, first_byte is kept in its lcc[] (lower-cased)
form, which is what the caseless first-byte scan below compares against. */
2867 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2868 first_byte = lcc[first_byte];
2872 if (!startline && study != NULL &&
2873 (study->flags & PCRE_STUDY_MAPPED) != 0)
2874 start_bits = study->start_bits;
2878 /* For anchored or unanchored matches, there may be a "last known required
2881 if ((re->flags & PCRE_REQCHSET) != 0)
2883 req_byte = re->req_byte & 255;
2884 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2885 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2888 /* Call the main matching function, looping for a non-anchored regex after a
2889 failed match. If not restarting, perform certain optimizations at the start of
2896 if ((options & PCRE_DFA_RESTART) == 0)
2898 const uschar *save_end_subject = end_subject;
2900 /* If firstline is TRUE, the start of the match is constrained to the first
2901 line of a multiline string. Implement this by temporarily adjusting
2902 end_subject so that we stop scanning at a newline. If the match fails at
2903 the newline, later code breaks this loop. */
2907 USPTR t = current_subject;
2911 while (t < md->end_subject && !IS_NEWLINE(t))
2914 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2919 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2923 /* There are some optimizations that avoid running the match if a known
2924 starting point is not found. However, there is an option that disables
2925 these, for testing and for ensuring that all callouts do actually occur. */
2927 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2929 /* Advance to a known first byte. */
2931 if (first_byte >= 0)
2933 if (first_byte_caseless)
2934 while (current_subject < end_subject &&
2935 lcc[*current_subject] != first_byte)
2938 while (current_subject < end_subject &&
2939 *current_subject != first_byte)
2943 /* Or to just after a linebreak for a multiline match if possible */
2947 if (current_subject > md->start_subject + start_offset)
2952 while (current_subject < end_subject &&
2953 !WAS_NEWLINE(current_subject))
2956 while(current_subject < end_subject &&
2957 (*current_subject & 0xc0) == 0x80)
2963 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2966 /* If we have just passed a CR and the newline option is ANY or
2967 ANYCRLF, and we are now at a LF, advance the match position by one
2970 if (current_subject[-1] == CHAR_CR &&
2971 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2972 current_subject < end_subject &&
2973 *current_subject == CHAR_NL)
2978 /* Or to a non-unique first char after study */
2980 else if (start_bits != NULL)
2982 while (current_subject < end_subject)
/* start_bits is used as a bitmap indexed by byte value: byte c/8, bit c&7. A
byte whose bit is zero is skipped as a possible starting point. */
2984 register unsigned int c = *current_subject;
2985 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2991 /* Restore fudged end_subject */
2993 end_subject = save_end_subject;
2995 /* The following two optimizations are disabled for partial matching or if
2996 disabling is explicitly requested (and of course, by the test above, this
2997 code is not obeyed when restarting after a partial match). */
2999 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3000 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3002 /* If the pattern was studied, a minimum subject length may be set. This
3003 is a lower bound; no actual string of that length may actually match the
3004 pattern. Although the value is, strictly, in characters, we treat it as
3005 bytes to avoid spending too much time in this optimization. */
3007 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3008 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3009 return PCRE_ERROR_NOMATCH;
3011 /* If req_byte is set, we know that that character must appear in the
3012 subject for the match to succeed. If the first character is set, req_byte
3013 must be later in the subject; otherwise the test starts at the match
3014 point. This optimization can save a huge amount of work in patterns with
3015 nested unlimited repeats that aren't going to match. Writing separate
3016 code for cased/caseless versions makes it go faster, as does using an
3017 autoincrement and backing off on a match.
3019 HOWEVER: when the subject string is very, very long, searching to its end
3020 can take a long time, and give bad performance on quite ordinary
3021 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3022 string... so we don't do this when the string is sufficiently long. */
3024 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3026 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3028 /* We don't need to repeat the search if we haven't yet reached the
3029 place we found it at last time. */
3031 if (p > req_byte_ptr)
3033 if (req_byte_caseless)
3035 while (p < end_subject)
3037 register int pp = *p++;
3038 if (pp == req_byte || pp == req_byte2) { p--; break; }
3043 while (p < end_subject)
3045 if (*p++ == req_byte) { p--; break; }
3049 /* If we can't find the required character, break the matching loop,
3050 which will cause a return of PCRE_ERROR_NOMATCH. */
3052 if (p >= end_subject) break;
3054 /* If we have found the required character, save the point where we
3055 found it, so that we don't search again next time round the loop if
3056 the start hasn't passed this character yet. */
3062 } /* End of optimizations that are done when not restarting */
3064 /* OK, now we can do the business */
/* Remember where this attempt starts; in the partial-match code of
internal_dfa_exec() start_used_ptr supplies offsets[0]. */
3066 md->start_used_ptr = current_subject;
3068 rc = internal_dfa_exec(
3069 md, /* fixed match data */
3070 md->start_code, /* this subexpression's code */
3071 current_subject, /* where we currently are */
3072 start_offset, /* start offset in subject */
3073 offsets, /* offset vector */
3074 offsetcount, /* size of same */
3075 workspace, /* workspace vector */
3076 wscount, /* size of same */
3077 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3078 0, /* function recurse level */
3079 0); /* regex recurse level */
3081 /* Anything other than "no match" means we are done, always; otherwise, carry
3082 on only if not anchored. */
3084 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3086 /* Advance to the next subject character unless we are at the end of a line
3087 and firstline is set. */
3089 if (firstline && IS_NEWLINE(current_subject)) break;
3093 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
/* The test is > rather than >=, so a start position exactly at end_subject
does not leave the loop here. */
3096 if (current_subject > end_subject) break;
3098 /* If we have just passed a CR and we are now at a LF, and the pattern does
3099 not contain any explicit matches for \r or \n, and the newline option is CRLF
3100 or ANY or ANYCRLF, advance the match position by one more character. */
3102 if (current_subject[-1] == CHAR_CR &&
3103 current_subject < end_subject &&
3104 *current_subject == CHAR_NL &&
3105 (re->flags & PCRE_HASCRORLF) == 0 &&
3106 (md->nltype == NLTYPE_ANY ||
3107 md->nltype == NLTYPE_ANYCRLF ||
3111 } /* "Bumpalong" loop */
3113 return PCRE_ERROR_NOMATCH;
3116 /* End of pcre_dfa_exec.c */